# Regression

In [1]:
# Essential libraries for data manipulation
import numpy as np #linear algebra
import pandas as pd # data processing
import matplotlib.pyplot as plt # graph plots


import seaborn as sns # plots
%matplotlib inline


# basic styling for graphs
plt.style.use('ggplot')

font = {"family": "Azeret Mono",
"weight": "bold",
"size": 14}

plt.rcParams.update({"font.family": font["family"], "font.weight": font["weight"], "font.size": font["size"]})

# loads data
df = pd.read_csv("data/01_regression/solubility.csv")


# take a look at the data
print(f"Data shape: {df.shape}")

df.head()

Data shape: (1144, 5)


Unnamed: 0,MolLogP,MolWt,NumRotatableBonds,AromaticProportion,logS
0,2.5954,167.85,0.0,0.0,-2.18
1,2.3765,133.405,0.0,0.0,-2.0
2,2.5938,167.85,1.0,0.0,-1.74
3,2.0289,133.405,1.0,0.0,-1.48
4,2.9189,187.375,1.0,0.0,-3.04


## 1.0 Data Cleaning

Ps: This DS made it very easy since the data is already cleaned. No null-values are present.

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1144 entries, 0 to 1143
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   MolLogP             1144 non-null   float64
 1   MolWt               1144 non-null   float64
 2   NumRotatableBonds   1144 non-null   float64
 3   AromaticProportion  1144 non-null   float64
 4   logS                1144 non-null   float64
dtypes: float64(5)
memory usage: 44.8 KB


In [3]:
df.describe()

Unnamed: 0,MolLogP,MolWt,NumRotatableBonds,AromaticProportion,logS
count,1144.0,1144.0,1144.0,1144.0,1144.0
mean,2.449133,204.631675,2.173951,0.364932,-3.057997
std,1.866003,102.6205,2.627398,0.343305,2.096502
min,-7.5714,16.043,0.0,0.0,-11.6
25%,1.4149,122.126,0.0,0.0,-4.33225
50%,2.3403,183.5945,1.0,0.375,-2.8705
75%,3.406475,270.71575,3.0,0.666667,-1.6
max,10.3886,780.949,23.0,1.0,1.58


## Data preparation

In [4]:
# separate X & y
X  = df.drop("logS", axis=1)

y = df["logS"]

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

X_train.shape

(915, 4)

## Model Building



In [6]:
from sklearn.linear_model import LinearRegression

linreg = LinearRegression()
linreg.fit(X_train,y_train)

y_hat = linreg.predict(X_test)

print(f"{linreg.score(X_test, y_test):.3f}")

0.788


In [7]:
from sklearn.metrics import mean_squared_error, r2_score

linreg_train_mse = mean_squared_error(y_train, linreg.predict(X_train))
linreg_test_mse = mean_squared_error(y_hat,y_test)

linreg_train_r2 = r2_score(linreg.predict(X_train),y_train)
linearreg_test_r2 = r2_score(y_hat, y_test)


print("Test")
print(f"MSE {linreg_test_mse:.3f}")
print(f"R2 {linearreg_test_r2:.3f}")


print("\n\nTrain")
print(f"MSE {linreg_train_mse:.3f}")
print(f"R2 {linreg_train_r2:.3f}")


Test
MSE 0.970
R2 0.724


Train
MSE 1.021
R2 0.693


In [8]:
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor(n_estimators=10, max_depth=2, )
forest.fit(X_train,y_train)

y_hat = forest.predict(X_test)

forest_train_mse = mean_squared_error(y_train, forest.predict(X_train))
forest_test_mse = mean_squared_error(y_hat,y_test)

forest_train_r2 = r2_score(forest.predict(X_train),y_train)
forest_test_r2 = r2_score(y_hat, y_test)


print("Test")
print(f"MSE {linreg_test_mse:.3f}")
print(f"R2 {linearreg_test_r2:.3f}")


print("\n\nTrain")
print(f"MSE {linreg_train_mse:.3f}")
print(f"R2 {linreg_train_r2:.3f}")



Test
MSE 0.970
R2 0.724


Train
MSE 1.021
R2 0.693
