In [244]:
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import cross_val_score, train_test_split, KFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

import tensorflow as tf
from keras.utils import np_utils
from tensorflow.data import Dataset
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dropout, Dense, Activation
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import ModelCheckpoint

In [245]:
MAIN_PATH = 'input/msbd5001-spring-2022/'
TRAIN_PATH = MAIN_PATH + 'train_normalized.csv'
TEST_PATH = MAIN_PATH + 'test_normalized.csv'

In [246]:
df_train = pd.read_csv(TRAIN_PATH, index_col=0)
y_train = df_train[['label']].to_numpy().reshape(-1,)
x_train = df_train.drop(columns=['label']).to_numpy()

In [247]:
def grid_search(clf, X, y, param_grid, cv=5):
    grid = GridSearchCV(clf, param_grid, cv=cv, scoring="accuracy")
    grid.fit(X, y)
    return pd.DataFrame(grid.cv_results_)[['params', 'mean_test_score', 'std_test_score']].sort_values(by='mean_test_score', ascending=False)

In [72]:
lr = LogisticRegression()
scores = cross_val_score(lr, x_train, y_train, cv=5)
print(scores.mean(), scores)

0.8960784313725491 [0.83333333 0.94117647 1.         0.82352941 0.88235294]


In [104]:
svc = SVC(kernel='rbf', C=1)
scores = cross_val_score(svc, x_train, y_train, cv=5)
print(scores.mean(), scores)

0.8738562091503267 [0.72222222 0.88235294 0.94117647 0.82352941 1.        ]


In [98]:
xgb = XGBClassifier(objective='binary:logistic', use_label_encoder=False)
scores = cross_val_score(xgb, x_train, y_train, cv=5)
print(scores.mean(), scores)

0.872549019607843 [0.83333333 0.94117647 0.88235294 0.88235294 0.82352941]


In [105]:
lda = LinearDiscriminantAnalysis()
scores = cross_val_score(lda, x_train, y_train, cv=5)
print(scores.mean(), scores)

0.8483660130718954 [0.88888889 0.94117647 0.94117647 0.70588235 0.76470588]


In [143]:
rf = RandomForestClassifier(n_estimators=100, max_depth=6)
scores = cross_val_score(rf, x_train, y_train, cv=5)
print(scores.mean(), scores)

0.8836601307189543 [0.88888889 0.88235294 0.94117647 0.88235294 0.82352941]


In [127]:
knn = KNeighborsClassifier(n_neighbors=9)
scores = cross_val_score(knn, x_train, y_train, cv=5)
print(scores.mean(), scores)

0.8614379084967319 [0.77777778 0.88235294 0.94117647 0.82352941 0.88235294]


In [248]:
estimators = [
    ('lr', LogisticRegression()),
    ('svc', SVC(kernel='rbf', C=1)),
    ('lda', LinearDiscriminantAnalysis()),
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=6)),
    ('knn', KNeighborsClassifier(n_neighbors=9))
]

clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
scores = cross_val_score(clf, x_train, y_train, cv=5)
print(scores.mean(), scores)

0.8960784313725491 [0.83333333 0.88235294 1.         0.82352941 0.94117647]


In [249]:
rf_grid = {
    'n_estimators': [50, 75, 100],
    'max_depth': [3,6,9],
    'bootstrap': [True, False]
}
result = grid_search(RandomForestClassifier(), x_train, y_train, rf_grid, 10)
result.head()

Unnamed: 0,params,mean_test_score,std_test_score
11,"{'bootstrap': False, 'max_depth': 3, 'n_estimators': 100}",0.941667,0.058531
9,"{'bootstrap': False, 'max_depth': 3, 'n_estimators': 50}",0.930556,0.056928
12,"{'bootstrap': False, 'max_depth': 6, 'n_estimators': 50}",0.929167,0.080615
8,"{'bootstrap': True, 'max_depth': 9, 'n_estimators': 100}",0.919444,0.072648
1,"{'bootstrap': True, 'max_depth': 3, 'n_estimators': 75}",0.919444,0.052997


In [182]:
# {'bootstrap': False, 'max_depth': 3, 'n_estimators': 100}

{'bootstrap': False, 'max_depth': 3, 'n_estimators': 100}

In [196]:
rf = RandomForestClassifier(n_estimators=100, max_depth=3, bootstrap=False)
scores = cross_val_score(rf, x_train, y_train, cv=5)
print(scores.mean(), scores)

0.9424836601307189 [0.88888889 0.94117647 1.         0.94117647 0.94117647]


# Submission

In [263]:
x, x_val, y, y_val = train_test_split(x_train, y_train, test_size=0.2, shuffle=True)

rf = RandomForestClassifier(n_estimators=50, max_depth=3, bootstrap=False)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_val)
accuracy_score(y_val, y_pred)

1.0

In [264]:
df_test = pd.read_csv(TEST_PATH, index_col=0)
x_test = df_test.to_numpy()
submission_pred = rf.predict(x_test)
df_test['label'] = submission_pred

In [265]:
df_test[['label']].to_csv('submission/rf_n50_d3_bootFalse.csv')

# Balanced dataset

In [218]:
MAIN_PATH = 'input/msbd5001-spring-2022/'
TRAIN_PATH = MAIN_PATH + 'train_normalized.csv'
TEST_PATH = MAIN_PATH + 'test_normalized.csv'

In [219]:
df_train = pd.read_csv(TRAIN_PATH, index_col=0)
negative_mask = df_train['label'] == 0
positive_mask = df_train['label'] == 1

In [224]:
sample_size = 29
df_train_sampled = pd.concat([
        df_train[negative_mask].sample(sample_size),
        df_train[positive_mask].sample(sample_size)
    ])

In [229]:
df_train_sampled = df_train_sampled.sample(frac=1).reset_index(drop=True)

In [232]:
y_train_sampled = df_train_sampled[['label']].to_numpy().reshape(-1,)
x_train_sampled = df_train_sampled.drop(columns=['label']).to_numpy()

In [235]:
rf_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3,6,9],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}
result = grid_search(RandomForestClassifier(), x_train_sampled, y_train_sampled, rf_grid)
result.head()

Unnamed: 0,params,mean_test_score,std_test_score
11,"{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 150}",0.842424,0.070646
0,"{'bootstrap': True, 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 50}",0.825758,0.060226
9,"{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 3, 'n_estimators': 50}",0.825758,0.060226
19,"{'bootstrap': False, 'criterion': 'gini', 'max_depth': 3, 'n_estimators': 100}",0.825758,0.060226
5,"{'bootstrap': True, 'criterion': 'gini', 'max_depth': 6, 'n_estimators': 150}",0.825758,0.101187
