In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors

In [None]:
# Read the two molecule tables in one step; the vocabulary file carries no
# header row, so pandas is told not to treat its first line as column names.
train, test = pd.read_csv("data/train.csv"), pd.read_csv("data/test.csv")
vocab = pd.read_csv("data/vocabulary.txt", header=None)

In [None]:
# Fingerprint lookup table; the following cells join it onto the train/test
# frames by SMILES and read its `fingerprint` column.
fingerprints = pd.read_csv(filepath_or_buffer="pubchem_fingerprints.csv")

In [None]:
# Reduce the lookup table to one row per SMILES before joining. A left merge
# against repeated keys would emit extra rows, and the merged frames' fresh
# 0..n-1 index would then stop matching the row labels of `train` / `test`,
# which the prediction step uses to read sentences and write predictions.
unique_fingerprints = fingerprints.drop_duplicates(subset="SMILES")

# Left joins keep every molecule, with a null `fingerprint` where no match exists.
train_df = train.merge(unique_fingerprints, on="SMILES", how="left")
test_df = test.merge(unique_fingerprints, on="SMILES", how="left")

# Report how many molecules found no fingerprint in the lookup table.
print(train_df.fingerprint.isnull().sum(), "train molecules have no associated fingerprint")
print(test_df.fingerprint.isnull().sum(), "test molecules have no associated fingerprint")

In [None]:
def to_bits(x: str) -> np.ndarray:
    """Decode a hex-encoded fingerprint string into a flat array of bits.

    Parameters
    ----------
    x : str
        Hexadecimal text, two hex digits per byte (e.g. ``"ff00"``).

    Returns
    -------
    numpy.ndarray
        1-D ``uint8`` array of 0/1 values, eight per input byte, with the
        most significant bit of each byte first.

    Raises
    ------
    ValueError
        If ``x`` cannot be decoded as hexadecimal (wrong type, odd number of
        digits, or non-hex characters). The message includes the offending
        value and the underlying error.
    """
    try:
        raw_bytes = bytes.fromhex(x)
    except (TypeError, ValueError) as e:
        # The error used to be printed and then execution fell through to
        # `return unpacked`, which failed with an unrelated UnboundLocalError.
        # Fail here instead, naming the value that could not be decoded.
        raise ValueError(f"cannot decode fingerprint {x!r} as hex: {e}") from e

    return np.unpackbits(np.frombuffer(raw_bytes, dtype=np.uint8))

# Keep only the molecules that actually have a fingerprint, decode each hex
# string into a row of bits, and stack the rows into one 2-D array per split.
train_df = train_df.loc[train_df["fingerprint"].notna()]
train_bit_rows = train_df["fingerprint"].apply(to_bits)
train_fingerprints = np.stack(train_bit_rows.to_numpy())

test_df = test_df.loc[test_df["fingerprint"].notna()]
test_bit_rows = test_df["fingerprint"].apply(to_bits)
test_fingerprints = np.stack(test_bit_rows.to_numpy())

In [None]:
# Index the training fingerprints in a ball tree, then query the five closest
# training rows for every test fingerprint. `neighbour_indices` holds row
# positions into `train_fingerprints`, one row of results per test fingerprint.
nbrs = NearestNeighbors(algorithm='ball_tree', n_neighbors=5)
nbrs.fit(train_fingerprints)
distances, neighbour_indices = nbrs.kneighbors(test_fingerprints)

In [None]:
# `neighbour_indices` holds positions into the filtered training frame, so each
# position is first translated to that frame's row label before the sentence is
# read from `train`. The joined sentences are written to the matching `test` row.
train_row_labels = train_df.index
for test_label, neighbour_positions in zip(test_df.index, neighbour_indices):
    nearest_sentences = [
        train.loc[train_row_labels[position], "SENTENCE"]
        for position in neighbour_positions
    ]
    test.loc[test_label, "PREDICTIONS"] = ";".join(nearest_sentences)

In [None]:
# Show the five most frequent training sentences and their counts.
train["SENTENCE"].value_counts().iloc[:5]

In [None]:
# Fallback prediction: the five most frequent training sentences, joined with
# the same ";" separator used for the neighbour-based predictions.
default_prediction = ";".join(train["SENTENCE"].value_counts().index[:5])

In [None]:
# Rows that received no neighbour-based prediction get the fallback value.
lacks_prediction = test["PREDICTIONS"].isna()
test.loc[lacks_prediction, "PREDICTIONS"] = default_prediction