In [None]:
import numpy as np
import pandas as pd

# Load the raw table and discard the two name columns
data = pd.read_csv('Islander_data.csv')
df = data.drop(columns=['first_name', 'last_name'])

# Keep only the rows whose Drug code is not 'S' (the placebo group)
not_placebo = df['Drug'] != 'S'
df = df.loc[not_placebo]
print(df)

Preprocessing

In [None]:
# Category codes of the two categorical columns. These two expressions are not
# the last statement of the cell, so their values are computed but not shown.
df['Happy_Sad_group'].unique()
df['Drug'].unique()

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

# One-hot encode the categorical columns; all remaining columns pass through unchanged
categorical_cols = ['Happy_Sad_group', 'Drug']
transformer = make_column_transformer(
    (OneHotEncoder(), categorical_cols),
    remainder='passthrough',
)

# Rebuild a labelled frame from the transformer output
transformed = transformer.fit_transform(df)
encoded_cols = transformer.get_feature_names_out()
encoded_df = pd.DataFrame(transformed, columns=encoded_cols)
encoded_df.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Work on a copy so that encoded_df keeps the unscaled values
scaled_df = encoded_df.copy()

# Columns are addressed by name rather than by position: positional indices
# silently point at the wrong columns as soon as the number of one-hot columns
# changes (e.g. when a different set of drug codes is kept).
BEFORE_COL = 'remainder__Mem_Score_Before'
AFTER_COL = 'remainder__Mem_Score_After'
DIFF_COL = 'remainder__Diff'

# Express the score difference as a percentage of the score before treatment.
# This must happen before the "before" column is standardised below.
scaled_df[DIFF_COL] = scaled_df[DIFF_COL] / scaled_df[BEFORE_COL] * 100

# Standardise the two memory-score columns only; the percentage difference is
# left unscaled.
# NOTE(review): the scaler is fitted on the full data set, i.e. before any
# train/test or cross-validation split - confirm this is acceptable.
score_cols = [BEFORE_COL, AFTER_COL]
scaled_df[score_cols] = StandardScaler().fit_transform(scaled_df[score_cols])

In [None]:
# Regression target: the 'remainder__Diff' column
data_label = scaled_df['remainder__Diff']

# Feature matrix: everything except the target and the post-treatment score
non_feature_cols = ['remainder__Mem_Score_After', 'remainder__Diff']
cleaned_df = scaled_df.drop(columns=non_feature_cols)

K-Fold Cross Validation

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import absolute
from numpy import sqrt

# Features and target shared by the cross-validated models
X = cleaned_df
y = data_label

# 10-fold shuffled splitter with a fixed seed
cv = KFold(n_splits=10, shuffle=True, random_state=1)

# Decision tree regressor with fixed hyper-parameters
parameters = dict(max_depth=4, max_features=None, min_samples_leaf=4, splitter='random')
model = DecisionTreeRegressor(**parameters)

# One negated-RMSE score per fold
scores = cross_val_score(
    model, X, y,
    scoring='neg_root_mean_squared_error',
    cv=cv,
    n_jobs=-1,
)

# Mean RMSE over the folds (absolute value undoes the negation)
np.abs(scores).mean()

In [None]:
from sklearn import datasets, ensemble
import math
from sklearn.metrics import mean_squared_error

# Gradient boosting regressor with fixed hyper-parameters
params = dict(learning_rate=0.007, max_depth=3, n_estimators=400, subsample=0.2)
model = ensemble.GradientBoostingRegressor(**params)

# One negated-RMSE score per fold of the shared splitter `cv`
scores = cross_val_score(
    model, X, y,
    scoring='neg_root_mean_squared_error',
    cv=cv,
    n_jobs=-1,
)

# Mean RMSE over the folds (absolute value undoes the negation)
np.abs(scores).mean()

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary multiple linear regression
model = LinearRegression()

# One negated-RMSE score per fold of the shared splitter `cv`
scores = cross_val_score(
    model, X, y,
    scoring='neg_root_mean_squared_error',
    cv=cv,
    n_jobs=-1,
)

# Mean RMSE over the folds (absolute value undoes the negation)
np.abs(scores).mean()

In [None]:
import seaborn as sns

# Encode the two categorical columns as 0/1 so they can enter the correlation
# matrix. The mapping is scoped per column so that an identical letter in any
# other column can never be rewritten by accident.
df = df.replace({
    "Happy_Sad_group": {"H": 0, "S": 1},
    "Drug": {"A": 0, "T": 1},
})

# Correlation matrix of all columns except the post-treatment score, rounded
# to two decimals. The rounded frame is assigned back; a bare round(cormat, 2)
# statement would compute the result and throw it away.
cormat = df.drop(columns=["Mem_Score_After"]).corr().round(2)

sns.heatmap(cormat, annot=True);

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The seed is fixed so that the split,
# and every metric and figure derived from it, is the same after
# Restart Kernel -> Run All.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    cleaned_df, data_label, test_size=0.20, random_state=42)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

# Exhaustive hyper-parameter search for the gradient boosting regressor,
# evaluated with 5-fold cross-validation on the training split only
GBR = GradientBoostingRegressor()

parameters = dict(
    learning_rate=np.arange(0.001, 0.01, 0.001),
    subsample=[0.9, 0.5, 0.2, 0.1],
    n_estimators=np.arange(400, 1000, 200),
    max_depth=np.arange(2, 5),
)

grid_GBR = GridSearchCV(estimator=GBR, param_grid=parameters, cv=5, n_jobs=-1)
grid_GBR.fit(Xtrain, Ytrain)

In [None]:
# Report the best parameter combination found by the gradient boosting search
best_gbr_params = grid_GBR.best_params_
print("\n The best parameters across ALL searched params:\n", best_gbr_params)

In [None]:
from sklearn import datasets, ensemble

# Gradient boosting regressor with hard-coded hyper-parameters
params = {
    'learning_rate': 0.007,
    'max_depth': 3,
    'n_estimators': 400,
    'subsample': 0.2,
}
model = ensemble.GradientBoostingRegressor(**params)

# Cross-validate on the full feature matrix; each fold yields a negated RMSE
cv_options = {'scoring': 'neg_root_mean_squared_error', 'cv': cv, 'n_jobs': -1}
scores = cross_val_score(model, X, y, **cv_options)

# Mean RMSE over the folds (absolute value undoes the negation)
np.mean(np.abs(scores))

In [None]:
# Exhaustive hyper-parameter search for the decision tree regressor,
# evaluated with 5-fold cross-validation on the training split only
DTR = DecisionTreeRegressor()

# "auto" is intentionally absent from max_features: for DecisionTreeRegressor
# it meant "use all features", which None already covers, and newer
# scikit-learn releases reject the value so every fit using it would fail.
parameters = {"splitter": ["best", "random"],
              "max_depth": np.arange(2, 6),
              "min_samples_leaf": np.arange(2, 5),
              "max_features": ["log2", "sqrt", None]}

grid_DTR = GridSearchCV(estimator=DTR, param_grid=parameters, cv=5, n_jobs=-1)
grid_DTR.fit(Xtrain, Ytrain)

In [None]:
# Report the best parameter combination found by the decision tree search
best_dtr_params = grid_DTR.best_params_
print("\n The best parameters across ALL searched params:\n", best_dtr_params)

In [None]:
# Full feature matrix and target for cross-validation
X = cleaned_df
y = data_label

# 10-fold shuffled splitter with a fixed seed
cv = KFold(n_splits=10, shuffle=True, random_state=1)

# Decision tree regressor with hard-coded hyper-parameters
parameters = {
    'max_depth': 4,
    'max_features': None,
    'min_samples_leaf': 4,
    'splitter': 'random',
}
model = DecisionTreeRegressor(**parameters)

# Each fold yields a negated RMSE
cv_options = {'scoring': 'neg_root_mean_squared_error', 'cv': cv, 'n_jobs': -1}
scores = cross_val_score(model, X, y, **cv_options)

# Mean RMSE over the folds (absolute value undoes the negation)
np.mean(np.abs(scores))

In [None]:
import matplotlib.pyplot as plt
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
import math

# Decision tree: fit on the training split, predict the held-out split.
# `parameters` is the module-level dict assigned in an earlier cell.
regr_1 = DecisionTreeRegressor(**parameters)
regr_1.fit(Xtrain, Ytrain)
y_1 = regr_1.predict(Xtest)

# Root mean squared error on the held-out split
mrse = math.sqrt(mean_squared_error(Ytest, y_1))
print("decision tree RMSE: {:.4f}".format(mrse))

# Gradient boosting: same procedure, using the module-level `params` dict
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(Xtrain, Ytrain)
y_2 = reg.predict(Xtest)

mrse = math.sqrt(mean_squared_error(Ytest, y_2))
print("gradient boosting RMSE: {:.4f}".format(mrse))

In [None]:
print(Xtrain)

# One scatter panel per numeric feature, each plotted against the training target
fig, axs = plt.subplots(1, 3)
panels = [
    ("remainder__age", "Age"),
    ("remainder__Mem_Score_Before", "Memory Before"),
    ("remainder__Dosage", "Dosage"),
]
for ax, (column, title) in zip(axs, panels):
    ax.scatter(Xtrain[column], Ytrain, s=6)
    ax.set_title(title)

plt.savefig("djfs.png", dpi=300)

# `files` is never imported in this notebook (presumably it is Colab's
# google.colab.files helper - confirm). Only call it when the name exists so
# the cell does not die with a NameError in other environments.
if "files" in globals():
    files.download("djfs.png")

In [None]:
# Decision tree: actual vs. predicted target values on the held-out split,
# drawn against the sample position
plt.figure()
x = np.arange(len(Ytest), dtype=int)

line_style = {"linewidth": 1}
plt.plot(x, Ytest, color="red", label="Actual", **line_style)
plt.plot(x, y_1, color="cornflowerblue", label="Predicted", **line_style)

plt.title("Decision Tree")
plt.ylabel("Percent difference in memory score")
plt.legend()

plt.tight_layout()
plt.savefig("decision_tree.png", dpi=300)

In [None]:
# Gradient boosting: actual vs. predicted target values on the held-out split,
# drawn against the sample position
plt.figure()
x = np.arange(len(Ytest), dtype=int)

line_style = {"linewidth": 1}
plt.plot(x, Ytest, color="red", label="Actual", **line_style)
plt.plot(x, y_2, color="green", label="Predicted", **line_style)

plt.title("Gradient Boosting")
plt.ylabel("Percent difference in memory score")
plt.legend()

plt.tight_layout()
plt.savefig("gradient_boosting.png", dpi=300)

In [None]:
# Linear regression: fit on the training split, predict the held-out split
linreg = LinearRegression().fit(Xtrain, Ytrain)
y_5 = linreg.predict(Xtest)

# Root mean squared error on the held-out split
mrse = math.sqrt(mean_squared_error(Ytest, y_5))
print("RMSE: {:.4f}".format(mrse))

# Actual vs. predicted target values, drawn against the sample position
plt.figure()
x = np.arange(0, len(Ytest), 1, dtype=int)

plt.plot(x, Ytest, color="red", label="Actual", linewidth=1)
plt.plot(x, y_5, color="orange", label="Predicted", linewidth=1)

plt.ylabel("Percent difference in memory score")
plt.title("Linear Regression")
plt.legend()

plt.tight_layout()
plt.savefig("lin_reg.png", dpi=300)

In [None]:
# Left panel: the full target series; middle panel: its negation; the third
# panel is left empty.
# The series is plotted against its own positions. The module-level `x` only
# spans the held-out split, while `y` holds every label, so plotting y against
# x fails with a length mismatch.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3)
label_positions = np.arange(len(y))
ax1.plot(label_positions, y)
ax2.plot(label_positions, -y)

In [None]:
# Overlay the actual test targets and the three models' predictions
plt.figure()
curves = [
    (Ytest, "red", "Actual"),
    (y_1, "cornflowerblue", "Decision Tree"),
    (y_2, "green", "Gradient Boosting"),
    (y_5, "orange", "Linear Regression"),
]
for values, colour, name in curves:
    plt.plot(x, values, color=colour, label=name, linewidth=1)

plt.ylabel("Percent difference in memory score")
plt.legend()

plt.tight_layout()
plt.savefig("all_vs_actual.png", dpi=300)

Plots

In [None]:
# One model-name label per prediction, in the order the predictions are later
# concatenated (decision tree, gradient boosting, linear regression). The
# repeat count follows the size of the held-out split instead of a hard-coded
# number, so the labels stay aligned if the split size changes.
model_names = ['Decision Tree', 'Gradient Boosting', 'Linear Regression']
g = np.repeat(model_names, len(Ytest))

In [None]:
import seaborn as sns

# Long-format frame: the measured targets repeated once per model, the
# matching predictions, and the model label in a column named ' ' (a single
# space).
DTR_data = pd.DataFrame({'Measured values': np.concatenate((Ytest, Ytest, Ytest), axis=0),
                         'Predicted values': np.concatenate((y_1, y_2, y_5), axis=0),
                         ' ': g
                         })

# Scatter plus regression line per model, measured vs. predicted
sns.lmplot(x="Measured values", y="Predicted values", hue=' ', data=DTR_data)

# No global figure-size override here: setting rcParams figsize to (200, 500)
# does not resize the lmplot figure that already exists, but it makes every
# figure created afterwards 200 x 500 inches.
plt.savefig("all_models.png", dpi=300)

# `files` is never imported in this notebook (presumably it is Colab's
# google.colab.files helper - confirm). Only call it when the name exists so
# the cell does not die with a NameError in other environments.
if "files" in globals():
    files.download("all_models.png")

In [None]:
import seaborn as sns

# Re-save only when a figure is actually open. With no open figure,
# plt.savefig creates a new blank one and would overwrite the previously
# saved all_models.png with an empty image.
if plt.get_fignums():
    plt.savefig("all_models.png", dpi=300)

# `files` is never imported in this notebook (presumably it is Colab's
# google.colab.files helper - confirm). Only call it when the name exists so
# the cell does not die with a NameError in other environments.
if "files" in globals():
    files.download("all_models.png")