## Different machine learning models

### Import libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns

### Import the dataset

In [None]:
# Descriptor columns used as model inputs
selected_features = ['MDEC-33', 'VE1_Dzp', 'ATSC6e', 'minaaN', 'SpMax4_Bhm', 'nAtomLAC', 'VE3_Dzs']

dataset = pd.read_csv('/padel/selected_features.csv')

# Each piece is selected with a list of labels so it stays a DataFrame:
# the molecule identifier, the inputs (X) and the pEC50 target (y).
molecule = dataset.loc[:, ['Molecule']]
X = dataset.loc[:, selected_features]
y = dataset.loc[:, ['pEC50']]

# Single frame laid out as Molecule | features | pEC50, used for the train-test split
full_data = pd.concat((molecule, X, y), axis=1)

### Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Quantile-bin the continuous target (up to 5 bins) so it can drive a stratified split
y_binned = pd.qcut(y.squeeze(), q=5, duplicates='drop')

# 80/20 split of the combined frame, stratified on the binned target
train_data, test_data = train_test_split(
    full_data,
    test_size=0.2,
    random_state=42,
    stratify=y_binned,
)

# Write the training split to disk
train_data.to_csv(path_or_buf='/padel/train_data.csv', index=False)
print("Training data saved to 'train_data.csv'.")

# Write the test split to disk
test_data.to_csv(path_or_buf='/padel/test_data.csv', index=False)
print("Test data saved to 'test_data.csv'.")

In [None]:
# (rows, columns) of the training split, then of the test split, one per line
print(train_data.shape, test_data.shape, sep='\n')

In [None]:
# Overlaid histograms of pEC50 for the full, training and test sets
plt.figure(figsize=(10, 6))

# Draw the full dataset first and keep its bin edges: passing the same edges to
# the two subsets makes the overlaid bars line up and stay directly comparable
# (bins=20 on each series separately would give three different sets of edges).
bin_edges = plt.hist(full_data['pEC50'], bins=20, alpha=0.5, color='blue', label='Total Set')[1]

# Training set on the shared bins
plt.hist(train_data['pEC50'], bins=bin_edges, alpha=0.5, color='green', label='Training Set')

# Test set on the shared bins
plt.hist(test_data['pEC50'], bins=bin_edges, alpha=0.5, color='red', label='Test Set')

# Title and axis labels (the plotted column is pEC50)
plt.title('Distribution of pEC50 values')
plt.xlabel('pEC50')
plt.ylabel('Frequency')
plt.legend()

# Show plot
plt.show()

In [None]:
# Kernel density estimate of pEC50 for the full, training and test sets
plt.figure(figsize=(8, 6))
sns.set_style("ticks")

sns.kdeplot(full_data['pEC50'], color='blue', label='Total Set', fill=True, alpha=0.3, linewidth=2)
sns.kdeplot(train_data['pEC50'], color='green', label='Training Set', fill=True, alpha=0.3, linewidth=2)
sns.kdeplot(test_data['pEC50'], color='red', label='Test Set', fill=True, alpha=0.3, linewidth=2)

plt.xlabel('pEC50', fontsize=18, fontweight='bold', labelpad=15)
plt.ylabel('Probability', fontsize=18, fontweight='bold', labelpad=15)
plt.legend(title_fontsize=12, fontsize=12, loc='upper right')

plt.xticks(fontsize=14, fontweight='bold')
plt.yticks(fontsize=14, fontweight='bold')

ax = plt.gca()  # Get current axes
ax.xaxis.set_major_locator(ticker.MultipleLocator(1.0))  # Major x ticks every 1.0
# NOTE(review): AutoMinorLocator(1) asks for a single subdivision per major
# interval, and minorticks_on() further down presumably installs its own minor
# locator - confirm which minor spacing is actually wanted.
ax.xaxis.set_minor_locator(ticker.AutoMinorLocator(1))
ax.yaxis.set_major_locator(ticker.MultipleLocator(0.1))  # Major y ticks every 0.1

# Tick appearance, set once per tick kind (inward ticks, heavy lines)
plt.tick_params(axis='both', which='major', direction='in', length=8, width=2, labelsize=16)  # Major ticks
plt.tick_params(axis='both', which='minor', direction='in', length=4, width=2, labelsize=12)  # Minor ticks

# Thicken the frame around the plot
for spine in ax.spines.values():
    spine.set_linewidth(3)

# Add minor ticks
ax.minorticks_on()

# Save BEFORE showing: once plt.show() has displayed the figure, inline and
# non-interactive backends discard it, so a later savefig writes a blank image.
plt.savefig("/padel/dist_pEC50.jpg", format='jpg', dpi=600)

plt.show()

In [None]:
# Select inputs and target by column name instead of by position. A later cell
# appends a 'pEC50_pred' column to both frames; with positional slicing
# (1:-1 / -1), re-running this cell afterwards would silently put pEC50 among
# the inputs and use the predictions as the target.
X_train = train_data[selected_features].values
y_train = train_data['pEC50'].values
X_test = test_data[selected_features].values
y_test = test_data['pEC50'].values

### 1. Multiple linear regression

In [None]:
from sklearn.linear_model import LinearRegression

# Least-squares linear model of pEC50 on the selected descriptors
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [None]:
# Predictions of the fitted model on the training and test inputs
y_train_pred, y_test_pred = regressor.predict(X_train), regressor.predict(X_test)

In [None]:
from sklearn.metrics import r2_score

# R^2 on the training split, then on the held-out test split
print(r2_score(y_true=y_train, y_pred=y_train_pred))
print(r2_score(y_true=y_test, y_pred=y_test_pred))

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the current regressor on the training split
scores = cross_val_score(estimator=regressor, X=X_train, y=y_train, cv=5)
print("Cross-validation scores:", scores)
print("Mean performance:", scores.mean())

In [None]:
# Pull the fitted intercept and per-feature coefficients off the model
intercept, coefficients = regressor.intercept_, regressor.coef_

print(f"Intercept: {intercept}")
print(f"Coefficients: {coefficients}")

In [None]:
# Reuse the list that selected the model inputs instead of a second hard-coded
# copy, so the names cannot drift away from the columns the model was fitted on.
feature_names = list(selected_features)

# zip() stops at the shorter input, which would silently drop or mislabel
# coefficients; fail loudly if the two do not line up.
if len(feature_names) != len(coefficients):
    raise ValueError(
        f"Expected {len(feature_names)} coefficients, got {len(coefficients)}"
    )

# Create a mapping of feature names to coefficients
coef_mapping = dict(zip(feature_names, coefficients))
print("Feature Coefficients:")
print(coef_mapping)

In [None]:
# The split frames were sliced out of full_data. Take explicit copies before
# adding a column so the assignment below works on independent frames and
# cannot raise pandas' SettingWithCopyWarning.
train_data = train_data.copy()
test_data = test_data.copy()

# Add predicted values as a new column next to 'pEC50'
train_data['pEC50_pred'] = y_train_pred
test_data['pEC50_pred'] = y_test_pred

# Save the updated DataFrames to new CSV files
train_data.to_csv('/padel/train_data_pred.csv', index=False)
test_data.to_csv('/padel/test_data_pred.csv', index=False)

### 2. Polynomial regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Learn the degree-2 polynomial expansion on the training inputs and apply it
poly_reg = PolynomialFeatures(degree=2).fit(X_train)
X_poly = poly_reg.transform(X_train)

# Linear model fitted on the expanded training matrix
regressor = LinearRegression()
regressor.fit(X=X_poly, y=y_train)

In [None]:
# Expand each split with the fitted polynomial transformer, then predict
y_train_pred, y_test_pred = (
    regressor.predict(poly_reg.transform(split)) for split in (X_train, X_test)
)

In [None]:
from sklearn.metrics import r2_score

# Training R^2 on the first line, test R^2 on the second
print(r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred), sep='\n')

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

# `regressor` was fitted on the expanded matrix X_poly, so cross-validating it
# directly on the raw X_train would score a plain linear model rather than the
# polynomial one. Putting the expansion and the regression in one pipeline makes
# every fold learn the expansion on its own training part and then fit on it.
poly_pipeline = make_pipeline(
    PolynomialFeatures(degree=poly_reg.degree),  # same degree as the fitted expansion
    LinearRegression(),
)

scores = cross_val_score(poly_pipeline, X_train, y_train, cv=5)
print("Cross-validation scores:", scores)
print("Mean performance:", scores.mean())

### 3. Decision tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Regression tree with a fixed seed, fitted on the training split
regressor = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
regressor

In [None]:
# Run the fitted tree over the training inputs, then the test inputs
y_train_pred, y_test_pred = map(regressor.predict, (X_train, X_test))

In [None]:
from sklearn.metrics import r2_score

# One R^2 per line: training split first, test split second
print(
    *(
        r2_score(actual, predicted)
        for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
    ),
    sep='\n',
)

In [None]:
from sklearn.model_selection import cross_val_score

# Score the tree with 5-fold cross-validation on the training split
scores = cross_val_score(regressor, X_train, y_train, cv=5)

print("Cross-validation scores:", scores)
print("Mean performance:", np.mean(scores))

### 4. Random Forest 

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Ensemble of 10 trees with a fixed seed, fitted on the training split
regressor = RandomForestRegressor(
    n_estimators=10,
    random_state=42,
)
regressor.fit(X=X_train, y=y_train)

In [None]:
# Forest predictions for the training split and the test split, in that order
y_train_pred, y_test_pred = [regressor.predict(split) for split in (X_train, X_test)]

In [None]:
from sklearn.metrics import r2_score

# Training R^2 followed by test R^2, each on its own line
print(
    r2_score(y_true=y_train, y_pred=y_train_pred),
    r2_score(y_true=y_test, y_pred=y_test_pred),
    sep='\n',
)

In [None]:
from sklearn.model_selection import cross_val_score

# Five-fold cross-validation of the forest on the training split
scores = cross_val_score(regressor, X=X_train, y=y_train, cv=5)

print("Cross-validation scores:", scores)
print("Mean performance:", scores.mean(axis=0))