In [1]:
import numpy as np

save_file = 'data.npy'
with open(save_file, 'rb') as f:
    X = np.load(f)
    Y = np.load(f)

In [2]:
X = X.reshape(27000, -1)

In [3]:
Y.shape, X.shape

((27000,), (27000, 53248))

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

In [5]:
del X
del Y

In [14]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf.fit(X_train[:], Y_train[:])

DecisionTreeClassifier()

In [15]:
y_pred = clf.predict(X_test)

In [16]:
from sklearn import metrics

print("Accuracy:",metrics.accuracy_score(Y_test, y_pred))

Accuracy: 0.6403703703703704


In [20]:
import pickle

with open('model_decision_tree_new.pickle', 'wb') as f:
    pickle.dump(clf, f)

In [25]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

In [35]:
clf.fit(X_train[:], Y_train[:])

RandomForestClassifier()

In [36]:
y_pred = clf.predict(X_test)

In [37]:
print("Accuracy:",metrics.accuracy_score(Y_test, y_pred))

Accuracy: 0.8316666666666667


In [38]:
with open('model_random_forest.pickle', 'wb') as f:
    pickle.dump(clf, f)

In [39]:
with open('model_random_forest.pickle', 'rb') as f:
    forest = pickle.load(f)

In [40]:
y_pred = forest.predict(X_test)

In [41]:
print("Accuracy:", metrics.accuracy_score(Y_test, y_pred))

Accuracy: 0.8316666666666667


# Random search parameters

In [6]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

n_estimators = [50, 100, 150, 200, 250, 500, 1000]
max_features = ['auto', 'sqrt']
max_depth = [3, 5, 10, 20, 30, 50]
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf = RandomForestClassifier(random_state=42)

print(random_grid)

{'n_estimators': [50, 100, 150, 200, 250, 500, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [3, 5, 10, 20, 30, 50], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [7]:
r_search = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

In [8]:
r_search.fit(X_train[:1000], Y_train[:1000])

Fitting 3 folds for each of 100 candidates, totalling 300 fits


KeyboardInterrupt: 

In [11]:
r_search.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 30,
 'bootstrap': False}

In [22]:
clf = RandomForestClassifier(**r_search.best_params_, random_state=42, n_jobs = 11)

In [23]:
clf

RandomForestClassifier(bootstrap=False, max_depth=30, min_samples_leaf=4,
                       n_estimators=1000, n_jobs=11, random_state=42)

In [24]:
clf.fit(X_train[:], Y_train[:])

RandomForestClassifier(bootstrap=False, max_depth=30, min_samples_leaf=4,
                       n_estimators=1000, n_jobs=11, random_state=42)

In [25]:
y_pred = clf.predict(X_test)

In [27]:
from sklearn import metrics

print("Accuracy:",metrics.accuracy_score(Y_test, y_pred))

Accuracy: 0.845


In [29]:
import pickle

with open('model_random_forest_r_searched_84,5%.pickle', 'wb') as f:
    pickle.dump(clf, f)