In [2]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, StratifiedShuffleSplit
import sys
    
packed = np.genfromtxt('features/features_packed.csv', delimiter=',')
packed = np.delete(packed,0,0)
packed = np.delete(packed,0,1)
y1 = np.ones(len(packed))

packed_train,packed_test = np.split(packed,[int(0.7 * len(packed))])
y1_train,y1_test = np.split(y1,[int(0.7 * len(y1))])

notpacked = np.genfromtxt('features/features_notpacked.csv', delimiter=',')
notpacked = np.delete(notpacked,0,0)
notpacked = np.delete(notpacked,0,1)
y0 = np.zeros(len(notpacked))

notpacked_train,notpacked_test = np.split(notpacked,[int(0.7 * len(notpacked))])
y0_train,y0_test = np.split(y0,[int(0.7 * len(y0))])

X = np.append(packed_train,notpacked_train, axis = 0)
y = np.append(y1_train,y0_train, axis = 0)

X1 = np.append(packed_test,notpacked_test, axis = 0)
y1 = np.append(y1_test,y0_test, axis = 0)

In [2]:
param_grid =  {"criterion": ["gini","entropy"],"min_samples_leaf":[2,3,4,5,6,7,8,9,10,11,12],"max_depth":[1,2,3,4,5,6,7,8,9,10,11,12]}
dt = DecisionTreeClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=dt, param_grid=param_grid, cv = cv, scoring = "balanced_accuracy", n_jobs = -1)
search.fit(X, y)
print(
    "The best parameters are %s with a score of %0.5f in training and a score of %0.5f in testing"
    % (search.best_params_, search.best_score_,search.best_estimator_.score(X1,y1))
)

The best parameters are {'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 2} with a score of 0.99065 in training and a score of 0.99563 in testing


In [3]:
param_grid =  {"criterion": ["gini","entropy"],"min_samples_leaf":[2,3,4,5,6,7,8,9,10,11,12],"max_depth":[1,2,3,4,5,6,7,8,9,10,11,12],"n_estimators" : [i for i in range(10,40)]}
rf = RandomForestClassifier(random_state = 0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=rf, param_grid=param_grid, cv = cv, scoring = "balanced_accuracy", n_jobs = -1)
search.fit(X, y)
print(
    "The best parameters are %s with a score of %0.5f in training and a score of %0.5f in testing"
    % (search.best_params_, search.best_score_,search.best_estimator_.score(X1,y1))
)

The best parameters are {'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 2, 'n_estimators': 15} with a score of 1.00000 in training and a score of 1.00000 in testing


In [4]:
param_grid = {"criterion":["friedman_mse","squared_error"],"loss": ["log_loss","exponential"],"min_samples_leaf":[2,3,4,5,6,7,8,9,10,11,12],"max_depth":[1,2,3,4,5,6,7,8,9,10,11,12],"n_estimators" : [i for i in range(10,40)]}
gbdt = GradientBoostingClassifier(random_state = 0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=gbdt, param_grid=param_grid, cv = cv, scoring = "balanced_accuracy", n_jobs = -1)
search.fit(X, y)
print(
    "The best parameters are %s with a score of %0.5f in training and a score of %0.5f in testing"
    % (search.best_params_, search.best_score_,search.best_estimator_.score(X1,y1))
)

The best parameters are {'criterion': 'friedman_mse', 'loss': 'exponential', 'max_depth': 2, 'min_samples_leaf': 2, 'n_estimators': 36} with a score of 0.99583 in training and a score of 1.00000 in testing


In [5]:
packed = np.genfromtxt('features/static_features_packed.csv', delimiter=',')
packed = np.delete(packed,0,1)
y1 = np.ones(len(packed))

packed_train,packed_test = np.split(packed,[int(0.7 * len(packed))])
y1_train,y1_test = np.split(y1,[int(0.7 * len(y1))])

notpacked = np.genfromtxt('features/static_features_notpacked.csv', delimiter=',')
notpacked = np.delete(notpacked,0,1)
y0 = np.zeros(len(notpacked))

notpacked_train,notpacked_test = np.split(notpacked,[int(0.7 * len(notpacked))])
y0_train,y0_test = np.split(y0,[int(0.7 * len(y0))])

X = np.append(packed_train,notpacked_train, axis = 0)
y = np.append(y1_train,y0_train, axis = 0)

X1 = np.append(packed_test,notpacked_test, axis = 0)
y1 = np.append(y1_test,y0_test, axis = 0)

In [6]:
param_grid =  {"criterion": ["gini","entropy"],"min_samples_leaf":[2,3,4,5,6,7,8,9,10,11,12],"max_depth":[1,2,3,4,5,6,7,8,9,10,11,12]}
dt = DecisionTreeClassifier(random_state=0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=dt, param_grid=param_grid, cv = cv, scoring = "balanced_accuracy", n_jobs = -1)
search.fit(X, y)
print(
    "The best parameters are %s with a score of %0.5f in training and a score of %0.5f in testing"
    % (search.best_params_, search.best_score_,search.best_estimator_.score(X1,y1))
)

The best parameters are {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 4} with a score of 0.95605 in training and a score of 0.92959 in testing


In [None]:
param_grid =  {"criterion": ["gini","entropy"],"min_samples_leaf":[2,3,4,5,6,7,8,9,10,11,12],"max_depth":[1,2,3,4,5,6,7,8,9,10,11,12],"n_estimators" : [i for i in range(10,40)]}
rf = RandomForestClassifier(random_state = 0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=rf, param_grid=param_grid, cv = cv, scoring = "balanced_accuracy", n_jobs = -1)
search.fit(X, y)
print(
    "The best parameters are %s with a score of %0.5f in training and a score of %0.5f in testing"
    % (search.best_params_, search.best_score_,search.best_estimator_.score(X1,y1))
)

In [None]:
param_grid = {"criterion":["friedman_mse","squared_error"],"loss": ["log_loss","exponential"],"min_samples_leaf":[2,3,4,5,6,7,8,9,10,11,12],"max_depth":[1,2,3,4,5,6,7,8,9,10,11,12],"n_estimators" : [i for i in range(10,40)]}
gbdt = GradientBoostingClassifier(random_state = 0)
cv = StratifiedKFold(n_splits=10)
search = GridSearchCV(estimator=gbdt, param_grid=param_grid, cv = cv, scoring = "balanced_accuracy", n_jobs = -1)
search.fit(X, y)
print(
    "The best parameters are %s with a score of %0.5f in training and a score of %0.5f in testing"
    % (search.best_params_, search.best_score_,search.best_estimator_.score(X1,y1))
)