In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

data = pd.read_csv("data/38_feature.csv",index_col=0)
data_norm = pd.read_csv("data/38_feature_norm.csv",index_col=0)
data_std_scale = pd.read_csv("data/38_feature_std_scale.csv",index_col=0)

X = data.iloc[:,2:]
y2 = data['class2']
y4 = data['class4']
X_norm = data_norm.iloc[:,2:]
X_std_scale = data_std_scale.iloc[:,2:]

## 2-class

In [2]:
# define dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_norm, y2, stratify=y2 ,test_size=0.20, random_state=42)

In [3]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2']
# define grid search
grid = dict(n_estimators=n_estimators,max_features=max_features)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.849800 using {'max_features': 'sqrt', 'n_estimators': 100}


In [4]:
clf = RandomForestClassifier(max_features = 'sqrt',n_estimators=100)
clf.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
accuracy_score(clf.predict(X_test),y_test)

0.8913043478260869

#### Test PCA

In [4]:
PCA_2 = pd.read_csv('data/PCA_2_components_train.csv')
PCA_2_X = PCA_2.iloc[:,3:]
PCA_2_X

PCA_5 = pd.read_csv('data/PCA_5_components_train.csv')
PCA_5_X = PCA_5.iloc[:,3:]
PCA_5_X
PCA_5_X_norm = (PCA_5_X - PCA_5_X.min())/(PCA_5_X.max()-PCA_5_X.min())

PCA_10 = pd.read_csv('data/PCA_10_components_train.csv')
PCA_10_X = PCA_10.iloc[:,3:]
PCA_10_X
PCA_10_X_norm = (PCA_10_X - PCA_10_X.min())/(PCA_10_X.max()-PCA_10_X.min())

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    PCA_10_X_norm, y2, stratify = y2,test_size=0.20, random_state=42)

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2']
# define grid search
grid = dict(n_estimators=n_estimators,max_features=max_features)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.809885 using {'max_features': 'sqrt', 'n_estimators': 1000}


In [5]:
clf = RandomForestClassifier(max_features = 'sqrt',n_estimators=100)
clf.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
accuracy_score(clf.predict(X_test),y_test)

0.8369565217391305

## 4-class

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X_norm, y4, stratify=y4 ,test_size=0.20, random_state=42)


In [6]:

model = RandomForestClassifier()
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2']
# define grid search
grid = dict(n_estimators=n_estimators,max_features=max_features)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))


Best: 0.647172 using {'max_features': 'sqrt', 'n_estimators': 1000}


In [7]:
clf = RandomForestClassifier(max_features = 'sqrt',n_estimators=1000)
clf.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
accuracy_score(clf.predict(X_test),y_test)

0.6521739130434783

### Test PCA

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    PCA_10_X_norm, y4, stratify = y4,test_size=0.20, random_state=42)

model = RandomForestClassifier()
n_estimators = [10, 100, 1000]
max_features = ['sqrt', 'log2']
# define grid search
grid = dict(n_estimators=n_estimators,max_features=max_features)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=model, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X_train, y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Best: 0.622948 using {'max_features': 'sqrt', 'n_estimators': 1000}


In [7]:
clf = RandomForestClassifier(max_features = 'sqrt',n_estimators=1000)
clf.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
accuracy_score(clf.predict(X_test),y_test)

0.6304347826086957