The ECFP-fingerprint (Extended Connectivity Fingerprint) is a type of molecular fingerprint that contains information about the connectivity of atoms in a molecule. This fingerprint is based on the molecule's structure and is commonly used in cheminformatics for various applications such as molecular recognition, virtual screening, and chemical diversity.

Information that can be included in an ECFP-fingerprint includes:

Structural connectivity: The fingerprint takes into account the connections between atoms in the molecule, including the bonds and how they are arranged.

Molecular fragments: The molecule is divided into different fragments or substructures, each of which is encoded as part of the fingerprint. These fragments can vary in size depending on the chosen radius for the fingerprint.

Spatial arrangement: The fingerprint may contain information about the spatial arrangement of atoms and bonds within certain distances of a reference atom. This helps capture three-dimensional aspects of the molecular structure.

Bits: Each bit in the fingerprint represents the presence or absence of a particular substructure or fragment in the molecule. A '1' at a particular position indicates that the corresponding fragment is present, while a '0' indicates that the fragment is absent.

Overall, the ECFP-fingerprint provides a structured representation of the molecular structure, allowing molecules to be compared based on their structural similarities and differences. This information is often used in machine learning models for molecular predictions such as ligand-receptor binding, activity predictions, and chemical properties.

In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
data=pd.read_csv('cleaned_descriptor_data',index_col=0)
molecules=list(data.index)
molecules

['C=C(C)c1nc(N)nc(N)n1',
 'C=C(Cl)COc1ccc2c(C)cc(=O)oc2c1',
 'C=CCNC(=O)CCCC(=O)NCC=C',
 'C=CCOn1c(=O)c(C)[n+]([O-])c2ccccc21',
 'C=CCn1cc(Cl)c(=O)n(CC=C)c1=O',
 'CC(=O)N(C)c1ccc(NC(=O)c2ccco2)cc1',
 'CC(=O)c1c(N(C)C)cc[nH+]c1C(F)(F)F',
 'CC(C)(N[O-])/C(=N/O)c1cccs1',
 'CC(C)C(=O)Nc1cc(=O)nc2nc[nH]n12',
 'CC(C)[NH+](Cc1nc2ccccc2[nH]1)C(C)C',
 'CC(C)c1ccc(NC(=O)c2cccnc2)cc1',
 'CC(Nc1ccccc1)=[NH+]c1ccccc1',
 'CCC(CC)C(=O)NCc1ccccc1OC',
 'CCC/C(=N\\O)c1c[nH]c2ccccc12',
 'CCCC(=O)Nc1cc(=O)nc2nc[nH]n12',
 'CCCCCCP(=O)([O-])c1ccccc1',
 'CCCCOC(=O)c1cc(=O)[nH]c2ccccc12',
 'CCCCP(=O)([O-])CCC(=O)[O-]',
 'CCCOc1ccc2ncc(C(=O)[O-])c(O)c2c1',
 'CCC[C@@](O)(CC[NH+](C)C)c1ccc(OC)cc1',
 'CCCc1ccc(OCCNC(=O)CC)cc1',
 'CCN(CC)S(=O)(=O)c1cccc2nonc12',
 'CCOC(=O)CCc1nc2ccccc2[nH]1',
 'CCOC(=O)N=[S@@](N)(=O)c1ccc(Cl)cc1',
 'CCOC(=O)NCc1ccc2c(c1)cc(C)n2C',
 'CCOC(=O)Nc1cccc2ccccc12',
 'CCOC(=O)c1cccc(C(=O)OCC)n1',
 'CCOc1ccc(C(=O)[O-])cc1OC',
 'CCOc1ccc(CN2CCC(=O)NC2=O)cc1',
 'CC[NH+]1CCN(Cc2nnnn2C(C)(C)C)

## create dataframe that contains a bit for every element for every molecule

In [3]:
fingerprints = []
for smiles in molecules:
    mol = Chem.MolFromSmiles(smiles)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2)
    fingerprints.append(fp)
fingerprints_int = [list(fp.ToBitString()) for fp in fingerprints]
df = pd.DataFrame(fingerprints_int, columns=[f'Bit_{i+1}' for i in range(len(fingerprints_int[0]))])
df.index = molecules
df.info()





<class 'pandas.core.frame.DataFrame'>
Index: 1116 entries, C=C(C)c1nc(N)nc(N)n1 to c1ccc(C2=Nn3c(nnc3-c3cc(-c4ccccc4)n[nH]3)SC2)cc1
Columns: 2048 entries, Bit_1 to Bit_2048
dtypes: object(2048)
memory usage: 17.4+ MB


## try random forest with these fingerprints

In [7]:
# Split the dataset into training and testing sets
y = data['ERK2_inhibition']
X = df
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(n_estimators=200)

# Train the model
nr_iterations = 10
for i in range(nr_iterations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    rf.fit(X_train, y_train)

# Make predictions
y_pred = rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(cm)
print("Classification Report:")
print(report)
print(y_pred)

KeyboardInterrupt: 

## try with cleaned fingerprint data

In [None]:
cleaned_data=pd.read_csv('cleaned_fingerprint_data',index_col=0)
cleaned_data.head()


Unnamed: 0,Bit_696,Bit_1200,Bit_936,Bit_1089,Bit_1918,Bit_379,Bit_842,Bit_1153,Bit_876,Bit_927,...,Bit_1454,Bit_675,Bit_1921,Bit_388,Bit_1565,Bit_1100,Bit_393,Bit_492,ERK2_inhibition,PKM2_inhibition
C=C(C)c1nc(N)nc(N)n1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
C=C(Cl)COc1ccc2c(C)cc(=O)oc2c1,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
C=CCNC(=O)CCCC(=O)NCC=C,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
C=CCOn1c(=O)c(C)[n+]([O-])c2ccccc21,1,1,1,1,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
C=CCn1cc(Cl)c(=O)n(CC=C)c1=O,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
y = cleaned_data['ERK2_inhibition']
X = cleaned_data.drop(columns='ERK2_inhibition').drop(columns='PKM2_inhibition').copy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(n_estimators=200)

# Train the model

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
rf.fit(X_train, y_train)

# Make predictions
y_pred = rf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(cm)
print("Classification Report:")
print(report)
print(y_pred)

Accuracy: 0.9375
Confusion Matrix:
[[210   0]
 [ 14   0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       210
           1       0.00      0.00      0.00        14

    accuracy                           0.94       224
   macro avg       0.47      0.50      0.48       224
weighted avg       0.88      0.94      0.91       224

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
