# Prospecção de Dados (Data Mining) DI/FCUL - Project

## Project (MC/DI/FCUL - 2024)

### GROUP:`09`

* Afonso Gama, 55857 - x Hours
* Eduardo Carneiro, 62515 - 4 Hours
* Guilherme Rosario, 62543 - x Hours
* Marco Viana, 62550 - x Hours

# TODO list:
- Compare which molecules are similar based on the fingerprints
    - Understand how the similar ones can be used to aid the prediction


- Test more models
    - Current models have bad validation scores


In [1]:
import pickle

# Load the fingerprints dataset into `data`.
# NOTE: unpickling can execute arbitrary code, so only load a trusted file here.
with open('../data/mol_bits.pkl', 'rb') as f:
    data = pickle.load(f)

In [None]:
# Inspect what was unpickled: first its Python type, then the whole object
print(type(data))
print(data)

---

In [2]:
from scipy.sparse import dok_matrix
import numpy as np

# Get sparse matrix size from the dataset:
#   height -> number of molecules (one row per key of `data`)
#   width  -> largest bit id found in any molecule
# A single pass over the values replaces the former `list(data.values())[i]`
# lookup, which rebuilt the full list of values once per molecule.
height = len(data)
width = max(max(struct_ids) for struct_ids in data.values())

print(f"Height: {height}, Width: {width}")

# Lookup dictionaries for easy and fast access (molecule id <-> row index)
mol_name_lookup = {mol_id: idx for idx, mol_id in enumerate(data)}
mol_ids_lookup = dict(enumerate(data))

# Create the sparse matrix
mat = dok_matrix((height, width), dtype=np.int32)

# Fill the matrix: row = position of the molecule in `data`, column = bit id - 1
for uniprot_idx, struct_ids_list in enumerate(data.values()):
    for struct_id in struct_ids_list:

        # Indexes start from 1 in the dataset, so shift them to 0-based columns
        mat[uniprot_idx, struct_id - 1] = 1

# Dense 0/1 array with one row per molecule
one_hot_arr = mat.toarray()

Height: 73865, Width: 2047


In [3]:
import matplotlib.pyplot as plt

# Code from TP05
def DrawSimPlot(B, R):
    """Plot the banding S-curve for ``B`` bands of ``R`` rows.

    For similarities 0.00, 0.01, ..., 0.99 the curve shows
    ``1 - (1 - s**R)**B``. A dashed red vertical line is drawn at
    ``(1/B)**(1/R)``. The figure is displayed with ``plt.show()``;
    nothing is returned.
    """
    similarity = np.arange(0, 1.0, .01)
    cutoff = (1 / B) ** (1 / R)
    candidate_prob = 1 - (1 - similarity ** R) ** B

    plt.figure(figsize=(7, 5))
    plt.plot(similarity, candidate_prob)
    plt.axvline(x=cutoff, linestyle="--", color='r')
    plt.title("Candidate pairs probability for B=%d and R=%d" % (B,R))
    plt.xlabel("Document Similarity")
    plt.ylabel("Probability of being a candidate pair")

    plt.grid()
    plt.show()

# Code from TP05
# THIS IS MODIFIED TO WORK WITH OUR DATA STRUCTURE
def MakeBucketsT(TDocs, perms, N, M, B, R, NB):
    """Build the LSH buckets from a transposed (element -> documents) data set.

    Parameters
    ----------
    TDocs : indexable of sets
        ``TDocs[e]`` is the set of document indices (taken from ``range(N)``)
        associated with element ``e``.
    perms : sequence of index sequences
        ``perms[b*R + r]`` is the element ordering used for row ``r`` of band ``b``.
    N, M : int
        Number of documents and number of elements.
    B, R : int
        Number of bands and number of rows per band.
    NB : int
        Number of hash buckets per band (the band signature hash is taken modulo ``NB``).

    Returns
    -------
    dict
        Maps ``(band, bucket)`` to the set of document indices hashed there.
    """
    buckets = {}
    every_doc = set(range(N))

    for band in range(B):
        # One signature row per document, one column per row of this band
        signatures = np.zeros((N, R), dtype="int32")

        for row in range(R):
            order = perms[band * R + row]
            pending = set(every_doc)      # documents still without a value for this row
            pos = 0

            while pending:
                # Documents hit by the element at this position that are still pending
                hits = TDocs[order[pos]] & pending
                if hits:
                    # Their signature is the position of the first element that hit them
                    signatures[list(hits), row] = pos
                    pending = pending - hits
                pos += 1
                if pos == M:
                    # Ran out of elements: whatever is left never matched and gets M
                    signatures[list(pending), row] = pos
                    pending = set()

        # Signature of the band is complete: hash each document into its bucket
        for doc in range(N):
            key = (band, hash(tuple(signatures[doc])) % NB)
            buckets.setdefault(key, set()).add(doc)

    return buckets

# Code from TP05
# THIS IS MODIFIED TO WORK WITH OUR DATA STRUCTURE
def LSHT(Data, B, R, N, M, NB=28934501):
    """Run the banded LSH over ``Data`` and return the buckets.

    ``Data`` is a mapping; the values of its first ``M`` entries are turned
    into sets and passed to ``MakeBucketsT`` as the transposed data set,
    together with ``B*R`` random permutations of ``range(M)``.

    The global NumPy random generator is re-seeded with 3 on every call, so
    the permutations are the same from one call to the next.

    NOTE(review): ``MakeBucketsT`` indexes the transposed data by permutation
    element (``range(M)``) and intersects each entry with ``set(range(N))``.
    Here entry ``i`` holds the values stored for the i-th key of ``Data``,
    so confirm that ``N``/``M`` and the orientation of ``Data`` match what
    the callers intend.

    Returns the ``{(band, bucket): set_of_indices}`` dict from ``MakeBucketsT``.
    """
    # "Transposed" data set: one set per entry of Data, for the first M entries
    stored_values = list(Data.values())
    DataT = []
    for position in range(M):
        DataT.append(set(stored_values[position]))

    n_perms = B * R
    np.random.seed(3)
    perms = [np.random.permutation(M) for _ in range(n_perms)]

    return MakeBucketsT(DataT, perms, N, M, B, R, NB)

def setify_similarity_results(band, current_dict, list_similar_names):
    """Record, per name and per band, every other name found in the same bucket.

    ``current_dict`` is updated in place to the shape
    ``{name: {band: {other_name, ...}}}``. A name is never listed as similar
    to itself, and nothing is created for a name that has no distinct partner.
    """
    for name in list_similar_names:
        for other in list_similar_names:
            if other == name:
                continue
            # Create the per-name and per-band containers on first use
            current_dict.setdefault(name, {}).setdefault(band, set()).add(other)

def JaccardSim(d1, d2):
    """Return the Jaccard similarity of two 0/1 vectors.

    The intersection size is the inner product of the vectors; the union
    size is the total number of set bits in both minus that intersection.
    """
    shared = np.inner(d1, d2)
    union = np.sum(d1 + d2) - shared
    return shared / union

In [4]:
# Jaccard similarity a candidate pair must exceed to be kept (see `if J > s` below)
s=0.7

# Try 500, 1000, 1500 and 2000 bands; range(2, 3) yields only 2 rows per band
for band_num in range(500, 2001, 500):
    for row_num in range(2, 3):

        # S-curve value 1-(1-sim^R)^B evaluated at similarity 0.8
        # NOTE(review): 0.8 is hard-coded while the threshold `s` is 0.7 - confirm which is intended
        p = 1-(1-0.8**row_num)**band_num

        # NOTE(review): LSHT receives N=width and M=height, but the bucket members are used
        # below as row indices of one_hot_arr - confirm the N/M orientation
        bucks = LSHT(data, band_num, row_num , M = height, N = width)

        # This dict will have a lot of redundancy, but it's easier to search for similar documents of a given document
        # (it is filled here but not read again in this cell)
        results_dict = {}

        # `bucks` is keyed by (band, bucket); only buckets with more than one member suggest pairs
        for b, buck in bucks:
            if len(bucks[(b,buck)])>1:

                # get_names_from_ids = list(map(lambda x: protein_ids_lookup[x], bucks[(b,buck)]))

                # print("Band", b, "suggests these similar docs:", bucks[(b,buck)])

                setify_similarity_results(b, results_dict, bucks[(b,buck)])

        # ---

        # Check with Jaccard Sim
        sim_above_threshold = []

        for b, buck in bucks:

            if len(bucks[(b,buck)])>1:
                doc_ids=np.array(list(bucks[(b,buck)]))

                # Index pairs of the strict upper triangle: every unordered pair inside the bucket
                idx = np.stack(np.triu_indices(len(doc_ids), k=1), axis=-1)
                sim_pairs=doc_ids[idx]

                for d1, d2 in sim_pairs:
                    J=JaccardSim(one_hot_arr[d1], one_hot_arr[d2])


                    # print ("Jaccard Similarity between docs %d (%s) and %d (%s) is: %7.4f" %(d1, protein_ids_lookup[d1] , d2, protein_ids_lookup[d2] ,J), end="")
                    
                    if J > s: 
                        # print("  <-- Similar")
                        sim_above_threshold.append((d1, d2, J))

        # set() removes pairs that were collected from more than one bucket; sort by similarity, highest first
        sim_above_threshold = sorted( set(sim_above_threshold), key=lambda x: x[2], reverse=True)

        # Print Jacc Sim above threshold
        print(f"Band: {band_num}, R: {row_num}, P {p} | Mean Jaccard Similarity: {np.mean([x[2] for x in sim_above_threshold])} | Number of similar pairs: {len(sim_above_threshold)}")
        # for d1, d2, sim in sim_above_threshold:
        #     print(mol_ids_lookup[d1], mol_ids_lookup[d2], sim)
            # print(d1,d2, sim)

Band: 500, R: 2 | Mean Jaccard Similarity: 0.7649499589860695 | Number of similar pairs: 366
Band: 1000, R: 2 | Mean Jaccard Similarity: 0.7657525836915396 | Number of similar pairs: 587
Band: 1500, R: 2 | Mean Jaccard Similarity: 0.7661223750677751 | Number of similar pairs: 792
Band: 2000, R: 2 | Mean Jaccard Similarity: 0.7656900322731491 | Number of similar pairs: 948


In [None]:
# REMOVE THIS CODE?

def JaccardSim(d1, d2):
    """Return the Jaccard similarity of two 0/1 vectors.

    Intersection size is the inner product; union size is the total number
    of set bits in both vectors minus the intersection.
    """
    common = np.inner(d1, d2)
    either = np.sum(d1 + d2) - common
    return common / either

import time
# For every molecule, the list of other molecules considered similar to it
mol_similar = {}

# Build each fingerprint set once, so the pairwise step below is a plain set
# intersection instead of a sort-based np.intersect1d call per pair
bit_sets = {mol_key: set(mol_bits) for mol_key, mol_bits in data.items()}

# This would be cool but takes too long to run (all-pairs comparison), we need to try another approach
for idx, (mol, ids) in enumerate(data.items()):

    # if idx % 1500 == 0:
    print(f"Processed {idx} molecules")

    mol_struct_size = len(ids)

    # A molecule with no bits cannot be similar to anything, and would divide by zero below
    if mol_struct_size == 0:
        continue

    mol_bits = bit_sets[mol]

    # Check which molecules are similar to the current molecule
    for mol2, ids2 in data.items():

        if mol == mol2:
            continue

        intersection = len(mol_bits & bit_sets[mol2])

        # Share of the current molecule's bits also present in the other molecule
        sim_ratio = intersection / mol_struct_size
        sim_jac = intersection / (len(ids) + len(ids2) - intersection)

        # if idx % 1500 == 0:
        #     print(f"Similarity ratio: {sim_ratio}, Jaccard similarity: {sim_jac}")

        if sim_ratio >= 0.75 or sim_jac >= 0.75:
            mol_similar.setdefault(mol, []).append(mol2)


---

In [2]:
# Load the data from the csvs
import pandas as pd
import numpy as np
import random

# Fixed seed so the train/validation split is the same on every run
random.seed(42)

# Both CSV files are read without a header row
df_train_original = pd.read_csv("../data/activity_train.csv", header=None)
df_test = pd.read_csv("../data/activity_test_blanked.csv", header=None)

# Split train into train (90% of the rows, sampled at random) and validation (the rest)
total_rows = len(df_train_original)
train_size = int(total_rows * 0.9)
train_idxs = random.sample(range(total_rows), train_size)

# Sampled rows are selected by position for train and dropped by label for validation
df_train = df_train_original.iloc[train_idxs]
df_val = df_train_original.drop(train_idxs)


In [3]:
# Pre-Processing
from sklearn.preprocessing import LabelEncoder

# Get the raw label arrays (first two columns) from each split
# Train
X_train_labels_x1 = df_train.iloc[:,0].to_numpy()
X_train_labels_x2 = df_train.iloc[:,1].to_numpy()

# Validation
X_val_labels_x1 = df_val.iloc[:,0].to_numpy()
X_val_labels_x2 = df_val.iloc[:,1].to_numpy()

# Test
X_test_labels_x1 = df_test.iloc[:,0].to_numpy()
X_test_labels_x2 = df_test.iloc[:,1].to_numpy()

# ---

# x1: get unique values in the train split and convert them into integers
label_enc_x1 = LabelEncoder()
label_enc_x1.fit(X_train_labels_x1)

# x2: ONE encoder fitted on the labels of all three splits.
# Fitting a separate encoder per split (as was done before) gives the same
# label a different integer in train, validation and test, so a model trained
# on the train codes sees unrelated values at validation/test time.
# Fitting on the union also covers labels that only appear in validation/test.
label_enc_x2 = LabelEncoder()
label_enc_x2.fit(np.concatenate((X_train_labels_x2, X_val_labels_x2, X_test_labels_x2)))

# Old per-split encoder names are kept as aliases of the shared encoder
label_enc_x2_val = label_enc_x2
label_enc_x2_test = label_enc_x2

# ---

# Convert values into integers
# Train
X_train_transformed_x1 = label_enc_x1.transform(X_train_labels_x1)
X_train_transformed_x2 = label_enc_x2.transform(X_train_labels_x2)

# Validation
X_val_transformed_x1 = label_enc_x1.transform(X_val_labels_x1)
X_val_transformed_x2 = label_enc_x2.transform(X_val_labels_x2)

# Test
X_test_transformed_x1 = label_enc_x1.transform(X_test_labels_x1)
X_test_transformed_x2 = label_enc_x2.transform(X_test_labels_x2)

In [4]:
# Get final inputs for the model
# The third column (index 2) of the train/validation frames is used as Y
Y_train = df_train.iloc[:, 2].to_numpy()
Y_val = df_val.iloc[:, 2].to_numpy()

# Two-column inputs: encoded x1 in column 0, encoded x2 in column 1
X_train = np.stack([X_train_transformed_x1, X_train_transformed_x2], axis=1)
X_val = np.stack([X_val_transformed_x1, X_val_transformed_x2], axis=1)
X_test = np.stack([X_test_transformed_x1, X_test_transformed_x2], axis=1)

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt

# Select the Matplotlib style sheet used for the plots drawn after this point
plt.style.use('seaborn-v0_8-whitegrid')

# Other models that could still be tried:
# SVM
# NN ??
# print(X)

# --- Random Forest Regressor ---
# num_estimators = range(10, 110, 10)
# val_scores = []
# best_model_score = -np.inf
# best_model = None

# # Grid search for the best number of estimators
# for i in num_estimators:

#     # Train the model
#     model = RandomForestRegressor(n_estimators=i, random_state=42)
#     model.fit(X_train, Y_train)

#     # Calculate model error on validation set
#     val_scores.append(model.score(X_val, Y_val))
#     print(f"RandomForestRegressor with {i} estimators has a score of {val_scores[-1]}")

#     # Save the best model
#     if val_scores[-1] > best_model_score:
#         best_model_score = val_scores[-1]
#         best_model = model

# # Plot the validation scores for each number of estimators
# plt.plot(num_estimators, val_scores)
# plt.xlabel("Number of estimators")
# plt.ylabel("Validation score")
# plt.show()

# Get the predictions from the best model kept by the grid search above
# rfr_predictions = best_model.predict(X_test)

# --- Linear Regression ---
# lr = LinearRegression(n_jobs=-1)
# lr.fit(X_train, Y_train)

# # Test on the val set
# print(lr.score(X_val, Y_val))

# # Get the predictions
# lr_predictions = lr.predict(X_test)





0.009336002075703176
