# Machine Learning

This notebook shows sample code for logistic regression, random forest, and XGBoost. Cross-validated grid search is used to tune the random forest and XGBoost models.

## 1. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV

In [None]:
# Summarise the cross-validation scores stored on a fitted search object
def getTrainScores(gs):
    """Collect the per-candidate and the best scores of a fitted search.

    Despite the function name, the per-candidate values are read from
    ``cv_results_['mean_test_score']``.

    Parameters
    ----------
    gs : object
        Anything exposing ``cv_results_`` (with ``'mean_test_score'`` and
        ``'params'`` entries), ``best_score_`` and ``best_params_``,
        e.g. a fitted ``GridSearchCV``.

    Returns
    -------
    tuple of (dict, dict)
        ``results`` maps the candidate index (starting at 0) to the string
        ``'mean:<score>params<params>'``; ``best`` holds the keys
        ``'best_mean'`` and ``'best_param'``.
    """
    mean_scores = list(gs.cv_results_['mean_test_score'])
    candidates = gs.cv_results_['params']
    results = {
        idx: 'mean:' + str(score) + 'params' + str(param_set)
        for idx, (score, param_set) in enumerate(zip(mean_scores, candidates))
    }
    best = {'best_mean': gs.best_score_, "best_param": gs.best_params_}
    return results, best

In [None]:
# Baseline: one-vs-rest logistic regression with default hyperparameters
model = LogisticRegression()
# wrap the base model in the one-vs-rest strategy
ovr = OneVsRestClassifier(model)
# fit on the training split; the reshape flattens the labels to a 1-D array
ovr.fit(training_data['X_train'], training_data['Y_train'].reshape(training_data['Y_train'].shape[0],))
# predictions on the test split (`yhat` is reused by the metrics cell below)
yhat = ovr.predict(training_data['X_test'])
# Only the last expression of a cell is rendered, so the accuracy has to be
# printed explicitly; the confusion matrix is shown as the cell output.
print('Test Accuracy:' + str(accuracy_score(training_data['Y_test'], yhat)))
confusion_matrix(training_data['Y_test'], yhat)

In [None]:
# Compare the fit on the training split with the held-out test split
train_pred = ovr.predict(training_data['X_train'])
# print() stringifies each argument, so sep='' reproduces the concatenated output
print('Train Accuracy:', accuracy_score(training_data['Y_train'], train_pred), sep='')
print('Train F1-Score(Macro):', f1_score(training_data['Y_train'], train_pred, average='macro'), sep='')
print('------')
# `yhat` holds the test-split predictions computed in the previous cell
print('Test Accuracy:', accuracy_score(training_data['Y_test'], yhat), sep='')
print('Test F1-Score(Macro):', f1_score(training_data['Y_test'], yhat, average='macro'), sep='')

## 2. Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest with default hyperparameters and a fixed random_state
clf = RandomForestClassifier(n_jobs=None,random_state=27, verbose=1)
# fit on the training split; the reshape flattens the labels to a 1-D array
clf.fit(training_data['X_train'], training_data['Y_train'].reshape(training_data['Y_train'].shape[0],))
# predictions for both splits so train and test metrics can be compared
# (a bare mid-cell accuracy_score call was dropped: its value was never
# displayed and the same number is printed below)
predicted_labels = clf.predict(training_data['X_test'])
train_pred = clf.predict(training_data['X_train'])

print('Train Accuracy:'+str(accuracy_score(training_data['Y_train'], train_pred)))
print('Train F1-Score(Macro):'+str(f1_score(training_data['Y_train'], train_pred,average='macro')))
print('------')
print('Test Accuracy:'+str(accuracy_score(training_data['Y_test'], predicted_labels)))
print('Test F1-Score(Macro):'+str(f1_score(training_data['Y_test'], predicted_labels,average='macro')))

In [None]:
# CV Search: exhaustive grid search over the random forest hyperparameters
params = {
    'n_estimators'      : [100, 500, 900],
    'max_depth'         : [1, 5, 9],
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3; for a
    # classifier it was an alias of 'sqrt', so the search space is unchanged.
    'max_features': ['sqrt'],
    'criterion' :['gini']
}
#metrics to consider: f1_micro, f1_macro, roc_auc_ovr
gsearch1 = GridSearchCV(estimator = clf, param_grid = params, scoring='f1_micro',n_jobs=-1,verbose = 10, cv=10)
# the reshape flattens the labels to a 1-D array before fitting
gsearch1.fit(training_data['X_train'], training_data['Y_train'].reshape(training_data['Y_train'].shape[0],))

# Only the last expression of a cell is rendered, so intermediate results
# are printed explicitly instead of being silently discarded.
print(gsearch1.best_estimator_)
print(getTrainScores(gsearch1))
# test-split accuracy of the search's predictions, shown as the cell output
predicted_labels = gsearch1.predict(training_data['X_test'])
accuracy_score(training_data['Y_test'], predicted_labels)

In [None]:
#Feature Importance Plot
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): `TestSet5` and `final_clf` are not defined in the visible
# cells - confirm both exist before this cell runs on a fresh kernel.
features = TestSet5.columns

# one bar per feature, drawn explicitly on the axes created here
f, ax = plt.subplots(figsize=(10, 5))
plot = sns.barplot(x=features, y=final_clf.feature_importances_, ax=ax)
ax.set_title('Feature Importance')
# rotate the tick labels so the feature names do not overlap
ax.set_xticklabels(ax.get_xticklabels(), rotation='vertical')
plt.show()

## 3. XGBoost

In [None]:
# Fit an XGBoost sklearn-style model and print a short evaluation report
def fitXgb(sk_model, training_data=training_data, epochs=100):
    """Fit ``sk_model`` and print its accuracy and macro F1 on the test split.

    Besides fitting ``sk_model`` itself, a second run through the native
    ``xgb.train`` API is made with the model's own parameters so that the
    ``mlogloss`` / ``merror`` training metrics are logged.

    Parameters
    ----------
    sk_model : estimator
        Must provide ``fit``, ``predict`` and ``get_xgb_params``
        (e.g. an ``XGBClassifier``).
    training_data : mapping
        Must contain the keys ``'X_train'``, ``'Y_train'``, ``'X_test'`` and
        ``'Y_test'``. The default is the notebook-level ``training_data`` as
        it exists when this cell is executed.
    epochs : int
        Number of boosting rounds for the native training run.

    Returns
    -------
    None
        The report is printed, nothing is returned.
    """
    # Imported locally so the function does not rely on a module-level `xgb`
    # alias, which none of the visible cells creates.
    import xgboost as xgb

    y_train = training_data['Y_train']
    # the reshape flattens the labels to a 1-D array before fitting
    sk_model.fit(training_data['X_train'], y_train.reshape(y_train.shape[0], ))

    # Native-API run: its returned booster is not needed, only the metric
    # log lines that verbose_eval emits.
    train = xgb.DMatrix(training_data['X_train'], label=y_train)
    params = sk_model.get_xgb_params()
    params['eval_metric'] = ['mlogloss', 'merror']
    xgb.train(params, train, epochs, [(train, 'train')], verbose_eval=100)

    # Predict once and pass the arguments in scikit-learn's (y_true, y_pred) order
    y_test = training_data['Y_test']
    y_pred = sk_model.predict(training_data['X_test'])
    print('-- Model Report --')
    print('XGBoost Accuracy: ' + str(accuracy_score(y_test, y_pred)))
    print('XGBoost F1-Score (Macro): ' + str(f1_score(y_test, y_pred, average='macro')))


In [None]:
# XGBoost hyperparameter tuning with GridSearchCV
from xgboost.sklearn import XGBClassifier
#initial model
xgb1 = XGBClassifier(learning_rate=0.1,
                    n_estimators=100,
                    max_depth=7,
                    min_child_weight=7,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective='multi:softmax',
                    nthread=4,
                    num_class=9,
                    seed=27)

# report for the untuned starting model
fitXgb(xgb1, training_data)

# grid of tree depth, minimum child weight and number of boosting rounds
param_test1 = {
 'max_depth':[1, 5, 9],
 'min_child_weight':[1, 5, 9],
 'n_estimators': range(100, 1000, 100)
}
#metrics to consider: f1_micro, f1_macro, roc_auc_ovr
# NOTE: this rebinds `gsearch1`, replacing the random-forest search object from section 2
gsearch1 = GridSearchCV(estimator = xgb1, param_grid = param_test1, scoring='f1_macro',n_jobs=-1,verbose = 10, cv=3)
# flatten the labels to 1-D, consistent with every other fit call in this notebook
gsearch1.fit(training_data['X_train'], training_data['Y_train'].reshape(training_data['Y_train'].shape[0],))

getTrainScores(gsearch1)