Welcome, hopefully prospective employers!

Unlike LOKI, I'm doing this project exclusively to demonstrate my coding and commenting styles in a finished ML project.


My goal here is to build a quick-and-dirty ML model to predict the toxicity of biological compounds. The dataset is awesome: it's the ToxCast dataset from moleculenet.org. However, it only has 600 data points, which may be too few for a complex model like this. I'm going to add dropout layers to get the most out of the limited data, but in the end this is more of a showcase than a solution; my other project is the one that gets that kind of attention.



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import BinaryCrossentropy
import tensorflow.keras.callbacks as callbacks
from tensorflow.keras.regularizers import l2
from sklearn.metrics import hamming_loss, f1_score

# Load the ToxCast data into a pandas DataFrame.
toxcast_df = pd.read_csv('toxcast_data.csv')

# Every column except the SMILES string is treated as a label. Selecting by
# name instead of by position keeps this correct even if 'smiles' is not the
# first column of the CSV.
labels = [column for column in toxcast_df.columns if column != 'smiles']

# Fill missing label values with 0 and cast everything to int.
# NOTE: after this step a missing entry is indistinguishable from an explicit 0.
toxcast_df[labels] = toxcast_df[labels].fillna(0).astype(int)

# Convert to numpy arrays: the raw SMILES strings and the label matrix.
smiles = toxcast_df['smiles'].values
y = toxcast_df[labels].values

# Quick look at the new arrays.
print(f"y shape is {y.shape}")
print(y[:10])
print(f"smiles shape is: {smiles.shape}")
print(smiles[:2])

# Compare how many 0s and 1s the label matrix holds (the author observed a
# full order of magnitude difference). Because missing values were filled
# with 0 above, the 0 count also includes those filled-in entries.
counts = [np.sum(y == 0), np.sum(y == 1)]
plt.figure(figsize=(8, 6))
bars = plt.bar(['0s', '1s'], counts)
plt.title('Counts of 0s and 1s in y array')
plt.xlabel('Value')
plt.ylabel('Count')
# Write the exact count above each bar.
for bar, count in zip(bars, counts):
    plt.text(bar.get_x() + bar.get_width() / 2., bar.get_height(), count,
             ha='center', va='bottom', fontsize=10)
plt.show()




Time to tokenize the SMILES strings and build the binary toxicity target.

In [None]:
# Collapse the per-label matrix into a single binary target per compound:
# 1 if at least one label is positive, otherwise 0.
y_overall_toxicity = (y.sum(axis=1) > 0).astype(int)

# Character-level tokenization of the SMILES strings.
# lower=False matters: Tokenizer lowercases its input by default, but SMILES is
# case-sensitive (e.g. 'C' is an aliphatic carbon, 'c' an aromatic one), so
# lowercasing would merge distinct atoms into the same token.
tokenizer = Tokenizer(char_level=True, lower=False)
tokenizer.fit_on_texts(smiles)
sequences = tokenizer.texts_to_sequences(smiles)

# Pad every sequence with trailing zeros up to the length of the longest one.
# pad_sequences already returns a numpy array, so no further conversion is
# needed for X (or for y_overall_toxicity, which is a numpy array as well).
max_seq_length = max(len(seq) for seq in sequences)
X = pad_sequences(sequences, maxlen=max_seq_length, padding='post')

In [None]:
# Hold out 20% of the compounds as a test set; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y_overall_toxicity, test_size=0.2, random_state=42)

# Sanity check: show one randomly chosen compound in raw, tokenized and padded
# form. The index is drawn from the actual dataset size rather than a
# hard-coded bound, so it can neither run past the end of a small dataset nor
# ignore the tail of a large one.
a = np.random.randint(0, len(smiles))
print(a)
print(smiles[a])
print(f"Sample tokenized SMILES: {sequences[a]}")
print(f"Padded tokenized SMILES: {X[a]}")

In [None]:
# Token ids produced by the tokenizer start at 1; index 0 is the padding value
# used by pad_sequences, hence the +1 on the vocabulary size.
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 128

# Small 1-D CNN over the character embeddings, ending in a single sigmoid unit
# for the binary target. The input length is declared with an explicit Input
# layer; the Embedding `input_length` argument is deprecated in current Keras.
model = Sequential([
    Input(shape=(max_seq_length,)),
    Embedding(input_dim=vocab_size, output_dim=embedding_dim),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    # Heavy dropout before the output layer as regularization.
    Dropout(0.6),
    Dense(1, activation='sigmoid')
])

# Early stopping is kept here for reference only; it is currently disabled.
#callback = callbacks.EarlyStopping(monitor='loss', patience=3)

model.compile(optimizer=Adam(learning_rate=0.001), loss=BinaryCrossentropy(), metrics=['accuracy'])

In [None]:
# Train for 100 epochs in mini-batches of 64, with 20% of the training data
# set aside for validation. No callbacks are passed, so every epoch is run.
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    batch_size=64,
    epochs=100,
)


In [None]:
# One figure per tracked quantity: the training curve and the validation curve
# over the epochs, first for accuracy and then for loss.
curve_specs = (
    ('accuracy', 'val_accuracy', 'Model accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model loss', 'Loss'),
)
for train_key, val_key, figure_title, y_label in curve_specs:
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(figure_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')
    plt.show()


# Score the trained model on the held-out test split and report its accuracy.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy}')


In [None]:
# Predict on the test split, then turn the sigmoid outputs into hard 0/1
# labels: anything strictly above 0.5 counts as class 1.
y_pred = model.predict(X_test)
y_pred_binary = np.greater(y_pred, 0.5).astype(int)

# F1 score of the thresholded predictions against the true test labels.
f1 = f1_score(y_test, y_pred_binary)
print(f'F1 Score: {f1}')



