# <center> Problem Set 6 </center>

<center> 7.C01/7.C51, 20.C01/20.C51 </center>

In [None]:
# import packages
from tqdm import tqdm
import numpy as np
import sklearn
import pandas as pd
from sklearn.model_selection import train_test_split

# metrics 
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt

# plotting style, you can choose your own parameters
import matplotlib

# Global plotting defaults (pick your own values if you prefer); every
# setting is applied through a single rcParams.update() call.
matplotlib.rcParams.update({
    'font.size': 15,
    # default line appearance
    'lines.linewidth': 3,
    'lines.color': 'g',
    # heavier axes frame and tick marks
    'axes.linewidth': 2.0,
    'xtick.major.size': 6,
    'ytick.major.size': 6,
    'ytick.major.width': 2,
    'xtick.major.width': 2,
    # do not route text rendering through LaTeX
    'text.usetex': False,
})

In [None]:
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install rdkit

We have provided the code to load the dataset. Take a moment to understand what each line is doing. Briefly explain what each line of the code is doing by providing short comments below.

In [None]:
# -nc (no-clobber) skips files that are already present, so re-running this cell
# does not pile up prism_train.csv.1, prism_train.csv.2, ... next to the originals.
!wget -nc https://raw.githubusercontent.com/coleygroup/ML4MolEng/main/psets/ps6-drug-screen/data/prism_train.csv
!wget -nc https://raw.githubusercontent.com/coleygroup/ML4MolEng/main/psets/ps6-drug-screen/data/prism_test.csv
!wget -nc https://raw.githubusercontent.com/coleygroup/ML4MolEng/main/psets/ps6-drug-screen/data/prism_cell_line_metadata.csv
!wget -nc https://raw.githubusercontent.com/coleygroup/ML4MolEng/main/psets/ps6-drug-screen/data/prism_perturbation_metadata.csv

In [None]:
# Every table is read the same way: the header row is inferred from the file
# and the first column becomes the row index.
# Order of the unpacked names: training set, test set, then the two auxiliary
# metadata tables (cell lines, perturbations).
prism, prism_test, cell_line_info, treatment_info = (
    pd.read_csv(csv_path, header="infer", index_col=0)
    for csv_path in (
        "prism_train.csv",
        "prism_test.csv",
        "prism_cell_line_metadata.csv",
        "prism_perturbation_metadata.csv",
    )
)

## Part 1: Preliminary Data Cleaning & Visualization

In [None]:
# Inputs are the row labels of the training table (presumably SMILES strings);
# targets are the table's values as a NumPy array.
X = prism.index.tolist()
y = prism.to_numpy()
# Preview the first ten inputs next to the target array.
X[:10], y

### Part 1.1 (5 points) Compute Morgan Fingerprints For Each Molecule

In [None]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs

#########your implementation here#########

# Goal: replace the list X with a NumPy array holding one fingerprint per entry
# (the print below reads X.shape, which a plain Python list does not have).
#
# Use these methods:
#    Chem.MolFromSmiles()                  - build a molecule from each entry of X
#    AllChem.GetHashedMorganFingerprint()  - with radius = 2 & nBits=1024
#    DataStructs.ConvertToNumpyArray()     - copy the fingerprint into a NumPy array

#########your implementation here#########

print(X.shape)  # presumably (number of molecules, 1024) given nBits=1024 - confirm
X

### Part 1.2 (2 points) Separate Dataset into Train / Val Splits (0.8 / 0.2)

### Part 1.3: (3 points) Perform Missing Data Imputation for the Training Set

In [None]:
from sklearn.impute import KNNImputer

y_train_raw = y_train.copy() # Untouched copy (NaNs included) kept for the later visualizations
print("We must impute this many missing cell values: ", np.isnan(y_train).sum())

#########your implementation here#########

# Fill the missing entries (e.g. with the KNNImputer imported above) and store
# the result back in y_train: the plotting cell that follows treats y_train as
# the imputed data and y_train_raw as the original.

#########your implementation here#########

In [None]:
import matplotlib as mpl

# Overlay the distribution of the observed (non-missing) raw values with the
# distribution after imputation.
# The mask must be computed from y_train_raw itself: once y_train has been
# imputed it contains no NaNs, so a mask derived from it keeps every entry and
# the raw NaNs would be handed to plt.hist.
plt.hist(
    y_train_raw[~np.isnan(y_train_raw)],
    bins=30,
    alpha=0.5,
    color="blue",
    label="raw y_train",
)
plt.hist(
    y_train.flatten(),
    bins=30,
    alpha=0.5,
    color="green",
    label="imputed y_train",
)
plt.legend()
plt.ylabel("Frequency")
plt.xlabel("Viability Fold Change")
plt.show()

# Heatmap of the imputed matrix. It is transposed so that, per the axis labels,
# cell lines run along the y-axis and perturbations along the x-axis; the colour
# scale is limited to [-2, 2].
fig = plt.figure(figsize=(15, 3))

plt.title('Viability Fold Change')
plt.ylabel("Cell Lines")
plt.xlabel("Chemical Perturbation")
img = plt.imshow(y_train.T, vmin=-2, vmax=2)
fig.colorbar(img)
plt.show()

### Write a few sentences detailing the task, insights on the data, and what models would be a good fit

## Part 2: Baseline Prediction of Unseen Chemical Perturbations

In [None]:
def benchmark(y_true, y_pred):
    """Score predictions with RMSE and draw a predicted-vs-true scatter plot.

    Entries where ``y_true`` is NaN (unmeasured values) are excluded from both
    the score and the plot.

    Parameters
    ----------
    y_true : array-like
        Ground-truth values; may contain NaN for missing measurements.
    y_pred : array-like
        Predicted values, same shape as ``y_true``.

    Returns
    -------
    float
        Root-mean-squared error over the non-missing entries, so that callers
        can compare models numerically instead of reading it off the legend.
    """
    # Accept lists and DataFrames as well as ndarrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    mask = ~np.isnan(y_true)
    observed = y_true[mask]
    predicted = y_pred[mask]
    rmse = float(np.sqrt(np.mean((observed - predicted) ** 2)))

    plt.figure(figsize=(6, 6))
    # Plot exactly the points that entered the score.
    plt.scatter(predicted, observed, label='Eval RMSE: {:.4f}'.format(rmse))
    plt.ylabel("True Value")
    plt.xlabel("Predicted Value")
    plt.legend()
    plt.show()

    return rmse

### Part 2.1 (10 points) Baseline with KNN

### Part 2.2 (10 points) Baseline with Neural Network Regressor

## Part 3: (70 points) Machine Learning Competition and Report

In [None]:
def save_submission(model, X_test, csv_name, n_cell_lines=30):
    """Predict on the test set, write a submission CSV and return all predictions.

    Reads the module-level ``prism_test`` (for its ``SMILES`` column, used as
    the row index) and ``prism`` (for its column labels).

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict(X_test)``.
    X_test : array-like
        Features passed straight to ``model.predict``.
    csv_name : str or file-like
        Destination handed to ``DataFrame.to_csv``.
    n_cell_lines : int, default 30
        Number of leading columns written to the CSV.

    Returns
    -------
    pandas.DataFrame
        The full prediction table (all columns), with float dtype.

    Raises
    ------
    ValueError
        If the prediction array's shape does not match the number of test rows
        and ``prism`` columns (raised by the DataFrame constructor).
    """
    # Build the frame directly from the predictions so the columns are numeric;
    # filling an empty frame in place leaves them as object dtype.
    predictions = np.asarray(model.predict(X_test), dtype=float)
    pred_df = pd.DataFrame(
        predictions, index=prism_test["SMILES"], columns=prism.columns
    )
    # Annoyingly Kaggle can't seem to handle large matrix datasets
    # & so we need to cut down our submission to just the first few cell lines
    pred_df.iloc[:, :n_cell_lines].to_csv(csv_name)
    return pred_df

In [None]:
# Turn every test-set SMILES string into a hashed Morgan fingerprint
# (radius 2, 1024 bits) and stack the results into one array.
X_test = prism_test['SMILES'].to_list()
new_X_test = []
for smiles in tqdm(X_test):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # Fail with the offending string rather than an opaque error from the
        # fingerprint call.
        raise ValueError(f"Could not parse test-set SMILES: {smiles!r}")
    morgan_fingerprints = AllChem.GetHashedMorganFingerprint(mol, 2, nBits=1024)
    # ConvertToNumpyArray fills the array it is given, so start from an empty one.
    arr = np.zeros((0,), dtype=np.int8)
    DataStructs.ConvertToNumpyArray(morgan_fingerprints, arr)
    # arr is freshly allocated each iteration, so no extra copy is needed.
    new_X_test.append(arr)
X_test = np.array(new_X_test)