# Install RDKit and import other dependencies

In [None]:
# Install rdkit in colab if necessary

!pip install rdkit

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
import requests
import numpy as np
import random
import zipfile
from io import BytesIO
import os

# Load some data

In [None]:
# Get data for training and testing (DON'T MODIFY THIS CODE!)

# Pick one of the available datasets at random. The chosen id is also written
# to the results file at the end of the notebook.
data_id = random.choice(
    [10030, 10503, 11161, 11705, 12106, 12869, 13239, 14720, 15380, 15629, 15760, 19911, 23859, 27649, 28818, 5143, 5616, 7827]
)

url = f"https://raw.githubusercontent.com/durrantlab/colab-support-files/main/qsar_datasets/{data_id}.zip"

# Bound the wait so that a stalled connection cannot hang the notebook.
response = requests.get(url, timeout=60)

# Stop here with a clear HTTP error if the download failed, instead of
# continuing with an empty dictionary and hitting a confusing KeyError below.
response.raise_for_status()

# Maps the basename of each .smi file in the archive (no directory, no
# extension) to the decoded text of that file.
smi_files_dict = {}

# BytesIO lets the downloaded bytes be opened as a zip archive without first
# saving them to disk.
with zipfile.ZipFile(BytesIO(response.content)) as z:
    # Only the '.smi' members of the archive are of interest.
    smi_files = [f for f in z.namelist() if f.endswith('.smi')]

    for smi_file in smi_files:
        basename = os.path.splitext(os.path.basename(smi_file))[0]
        smi_files_dict[basename] = z.read(smi_file).decode('utf-8')

testing_set_data_txt = smi_files_dict['testing_set']
training_set_data_txt = smi_files_dict['training_set']

# Split the data by lines. Blank lines (for example, the one produced by a
# trailing newline at the end of the file) are dropped so that every entry is a
# real record; otherwise the shuffle below would scatter empty strings through
# the data.
testing_set_lines = [
    line for line in testing_set_data_txt.splitlines() if line.strip()
]
training_set_lines = [
    line for line in training_set_data_txt.splitlines() if line.strip()
]

# Randomly shuffle the data.
random.shuffle(testing_set_lines)
random.shuffle(training_set_lines)

In [None]:
# Preview the first five entries of the (shuffled) training data. Each line in
# the training data has two columns: the first is the SMILES string, the second
# is whether the molecule is active or inactive.

print(training_set_lines[:5])

In [None]:
# Preview the first five entries of the (shuffled) testing data. Each line in
# the testing data has two columns: the SMILES string and a unique identifier.
# The testing data does not indicate whether the molecule is active or
# inactive. That's what you'll predict.

print(testing_set_lines[:5])

# Extract information from the data

In [None]:
# Get the SMILES strings alone for the training and testing set.

training_set_smiles = YOUR CODE HERE
testing_set_smiles = YOUR CODE HERE

# training_set_smiles and testing_set_smiles look like:
#
# ['COc1ccc(S(=O)(=O)N(CC2CCCC2)C[C@@H](O)CN(CCc2ccccc2)C(=O)OCc2cccnc2)cc1',
#  'Cc1cc(SC2=C([O-])O[C@](CCc3ccc(O)cc3)(C(C)C)CC2=O)c(C(C)(C)C)cc1OS(C)(=O)=O',
#  'Cc1cc(SC2=C([O-])C[C@@](CCc3ccc(O)cc3)(C(C)C)OC2=O)c(C(C)(C)C)cc1OS(C)(=O)=O',
#  ...]

In [None]:
# Get the labels for the training and testing set (Active/Inactive in the case
# of the training set, and the compound id in the case of the testing set).

training_set_labels = YOUR CODE HERE
testing_set_labels = YOUR CODE HERE

# training_set_labels looks like ['Active', 'Active', 'Active', ...]
#
# testing_set_labels looks like ['Compound_10142', 'Compound_11090', 'Compound_11573', ...]

In [None]:
# In the case of the training data, the label must be a number, not a string.

training_set_labels_numeric = YOUR CODE HERE

# training_set_labels_numeric looks like [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...]

In [None]:
# Convert the SMILES strings of the training and testing set to rdkit molecule
# objects.

training_set_mols = YOUR CODE HERE
testing_set_mols = YOUR CODE HERE

# Create features (numeric vectors) to describe each molecule

In [None]:
# Create a function that returns a list of numeric features for a molecule. For
# example, you could use a Morgan Fingerprint. Alternatively, you could return a
# list of the four Lipinski descriptors.

def features(mol):
    # Exercise placeholder: replace the right-hand side below with code that
    # builds the list of numeric features describing `mol`. This function is
    # called once per molecule for both the training and the testing set, so
    # it should presumably return a list of the same length for every molecule
    # (confirm against the requirements of the classifier you choose).
    list_of_numeric_features = YOUR CODE HERE
    return list_of_numeric_features

# Compute the feature list of every molecule in the training and testing sets
# by applying features() to each molecule in turn.

training_set_features = list(map(features, training_set_mols))
testing_set_features = list(map(features, testing_set_mols))

# Divide the data into training and validation sets

In [None]:
# Divide the training set into a training set and a validation set. 50/50 split.

YOUR CODE HERE

validation_features = YOUR CODE HERE (values taken from training_set_features)
validation_labels = YOUR CODE HERE (values taken from training_set_labels_numeric)

training_features = YOUR CODE HERE (values taken from training_set_features)
training_labels = YOUR CODE HERE (values taken from training_set_labels_numeric)

# Train a classifier on the training data, test it on the validation data

In [None]:
# Train a classifier on the training set and evaluate it on the validation set.
# You should use the accuracy_score() function to calculate the accuracy between
# validation_labels and predicted_labels.

from sklearn.ensemble import YOUR CODE HERE
from sklearn.metrics import accuracy_score

classifier = YOUR CODE HERE
YOUR TRAINING CODE HERE

predicted_labels = YOUR CODE HERE
accuracy = accuracy_score(YOUR CODE HERE)

print(f'Validation accuracy: {accuracy:.2%}')

# Apply the classifier to the withheld test data, and upload results to Canvas

In [None]:
# Now apply the classifier to the testing set and save the results to a file.
# Upload this file, as well as a copy of your notebook, to Canvas. DO NOT CHANGE
# THIS CODE.

predicted_labels = classifier.predict(testing_set_features)

with open('predicted_labels.txt', 'w') as results_file:
    # The first line of the file records which dataset was used.
    results_file.write(f"{data_id}\n")
    for position, prediction in enumerate(predicted_labels):
        # Only molecules predicted to be active (label 1) are reported.
        if prediction != 1:
            continue
        compound_id = testing_set_labels[position]
        results_file.write(f'{compound_id} Active\n')
        print("Predicted active: ", compound_id)

# Download file to local machine (colab)

from google.colab import files
files.download('predicted_labels.txt')