<a href="https://colab.research.google.com/github/dderaad/Hackathon-Summer-2021/blob/main/clfs_mk1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Base location of the repository's raw files, followed by the relative path
# of every data set this notebook refers to.
PATH = "https://raw.githubusercontent.com/dderaad/Hackathon-Summer-2021/main/"
files = [
    "prediction/prediction.csv",
    "test_data/test_LABELS.csv",
    "test_data/test_feature_names.csv",
    "test_data/test_expression.csv.gz",
    "train_data/train_labels.csv",
    "train_data/train_feature_names.csv",
    "train_data/train_expression.csv.gz",
]

# One absolute URL per entry of `files`, in the same order.
urls = [f"{PATH}{relative_path}" for relative_path in files]
urls

['https://raw.githubusercontent.com/dderaad/Hackathon-Summer-2021/main/prediction/prediction.csv',
 'https://raw.githubusercontent.com/dderaad/Hackathon-Summer-2021/main/test_data/test_LABELS.csv',
 'https://raw.githubusercontent.com/dderaad/Hackathon-Summer-2021/main/test_data/test_feature_names.csv',
 'https://raw.githubusercontent.com/dderaad/Hackathon-Summer-2021/main/test_data/test_expression.csv.gz',
 'https://raw.githubusercontent.com/dderaad/Hackathon-Summer-2021/main/train_data/train_labels.csv',
 'https://raw.githubusercontent.com/dderaad/Hackathon-Summer-2021/main/train_data/train_feature_names.csv',
 'https://raw.githubusercontent.com/dderaad/Hackathon-Summer-2021/main/train_data/train_expression.csv.gz']

In [None]:
# Download every data set into a dictionary keyed by its relative path
# (the same strings that are listed in `files`).
data_sets = {file_name: pd.read_csv(url) for url, file_name in zip(urls, files)}

# Preview the first five rows of each data set.
for data_set_name, data_set in data_sets.items():
    print(data_set_name, "\n", data_set.head(5), "\n")

In [None]:
def sum_of_squares(x):
    """Return the sum of the squared entries of `x`."""
    squared = np.power(x, 2)
    return np.sum(squared)


# Join the training labels and the training expression table column-wise
# (rows are aligned on the index).
train_frames = [
    data_sets["train_data/train_labels.csv"],
    data_sets["train_data/train_expression.csv.gz"],
]
labelled_training_data = pd.concat(train_frames, axis=1)

# Identifier / label columns that are left out of the row-wise aggregates.
non_feature_columns = ["sample_id", "age", "group"]

# Row-wise total of the remaining columns.
labelled_training_data["sum"] = (
    labelled_training_data.drop(non_feature_columns, axis=1).sum(axis=1)
)

# Row-wise sum of squares. The "sum" column added above is not dropped here,
# so its square is part of this total.
labelled_training_data["squared_sum"] = (
    labelled_training_data.drop(non_feature_columns, axis=1)
    .apply(sum_of_squares, axis=1)
)
labelled_training_data.head(10)

In [None]:
# Join the test labels and the test expression table column-wise
# (rows are aligned on the index).
test_frames = [
    data_sets["test_data/test_LABELS.csv"],
    data_sets["test_data/test_expression.csv.gz"],
]
labelled_testing_data = pd.concat(test_frames, axis=1)

# Identifier / label columns that are left out of the row-wise aggregates.
test_non_feature_columns = ["sample_id", "group"]

# Row-wise total of the remaining columns.
labelled_testing_data["sum"] = (
    labelled_testing_data.drop(test_non_feature_columns, axis=1).sum(axis=1)
)

# Row-wise sum of squares. The "sum" column added above is not dropped here,
# so its square is part of this total.
labelled_testing_data["squared_sum"] = (
    labelled_testing_data.drop(test_non_feature_columns, axis=1)
    .apply(sum_of_squares, axis=1)
)
labelled_testing_data.head(10)

In [None]:
#labelled_testing_data.describe()

In [None]:
#labelled_training_data.describe()

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.dummy import DummyRegressor

# Baseline estimators, one per strategy.
# `strategy` is passed by keyword: it is a keyword-only parameter in
# scikit-learn >= 1.0, where `DummyClassifier("uniform")` raises a TypeError.
clf_strategies = ["most_frequent", "uniform"]
reg_strategies = ["mean", "median"]
reg_stratgies = reg_strategies  # original (misspelled) name, kept in case another cell uses it
dummy_classifiers = {f"Dummy Classifier ({i})": DummyClassifier(strategy=i) for i in clf_strategies}
dummy_regressors = {f"Dummy Regressor ({j})": DummyRegressor(strategy=j) for j in reg_strategies}
# After this call `dummy_classifiers` holds the regressor baselines as well.
dummy_classifiers.update(dummy_regressors)

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF

# Candidate models, keyed by display name. Only the random-forest regressor is
# enabled; the commented entries are alternatives that are currently switched off.
clfs = {
    #"Decision Tree": DecisionTreeClassifier(max_depth=3),
    "Random Forest": RandomForestRegressor(n_estimators=200, max_depth=6, max_features='sqrt', random_state=1, min_samples_leaf=2),
    #"AdaBoost": AdaBoostClassifier(),
    #"Linear SVM": SVC(kernel="linear", C=0.025),
    #"RBF SVM": SVC(gamma=2, C=1),
    #"Gaussian Process": GaussianProcessClassifier(1.0 * RBF(1.0))
}

# Estimators that later cells fit and evaluate. The dummy baselines are left
# out unless the commented `update` line is re-enabled.
classifiers = {}
#classifiers.update(dummy_classifiers)
classifiers.update(clfs)

In [None]:
# format: X, y
# X is every column except the identifier / label columns; y is the "age" column.
Xy_data_set = [
    labelled_training_data.drop(["sample_id", "group", "age"], axis=1),
    labelled_training_data["age"],
]

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(Xy_data_set[0])
# NOTE(review): the scaler is fitted on all rows, including those that end up
# in the held-out split below — confirm this is acceptable for the evaluation.

# Build a separate list for the scaled data. A plain assignment
# (`Xy_data_set_scaled = Xy_data_set`) would only alias the list, so storing
# the scaled features in it would also overwrite the unscaled frame held in
# `Xy_data_set`.
Xy_data_set_scaled = [scaler.transform(Xy_data_set[0]), Xy_data_set[1]]

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    Xy_data_set_scaled[0], Xy_data_set_scaled[1], test_size=0.15, random_state=1
)
X_train

In [None]:
from sklearn.metrics import mean_squared_error

# Fit each estimator on the training split, then print its mean squared error
# and its `.score()` value on the held-out split.
for clf_name, model in classifiers.items():
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    y_pred = model.predict(X_test)
    print(f'{clf_name}\nMSE: {mean_squared_error(y_test, y_pred)} \n{score}\n')

In [None]:
from pathlib import Path

# Feature columns of the test set (identifier / label columns removed).
test_features = labelled_testing_data.drop(["sample_id", "group"], axis=1)

# The model was fitted on features transformed by `scaler`, so the test
# features must go through the same transform before predicting.
# `.to_numpy()` keeps the columns matched by position, as the model (fitted on
# an array) already does; scikit-learn may warn about missing feature names.
prediction = classifiers["Random Forest"].predict(scaler.transform(test_features.to_numpy()))
df = pd.concat([pd.Series(prediction, name="age"), labelled_testing_data["sample_id"]], axis=1)

# Build the output path from its parts instead of the literal
# "prediction\prediction.csv": "\p" is an invalid escape sequence and a
# backslash is not a directory separator on Linux / Colab.
prediction_path = Path("prediction") / "prediction.csv"
prediction_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(prediction_path, index=False)

# Imported under an alias so the module does not replace the `files` list of
# data-set paths defined near the top of the notebook.
from google.colab import files as colab_files
colab_files.download(str(prediction_path))
df

In [None]:
# Histogram of the "age" column of the training data.
training_ages = labelled_training_data["age"]
training_ages.hist()

In [None]:
import statsmodels.api as sm

# Ordinary least squares with "age" as the response and the scaled features as
# the regressors. `sm.OLS(...)` only builds the model; `.fit()` returns the
# results object, which is what must be stored in `result` for
# `result.summary()` to exist.
# NOTE(review): no intercept column is added (see `sm.add_constant`) — confirm
# that a regression through the origin is intended.
result = sm.OLS(Xy_data_set_scaled[1], Xy_data_set_scaled[0]).fit()

In [None]:
# Print the text of `result.summary()`.
ols_summary = result.summary()
print(ols_summary)

In [None]:
# Display the training frame, which by this point also carries the "sum" and
# "squared_sum" columns added earlier in the notebook.
labelled_training_data