In [5]:
import pandas as pd
import random
import numpy as np
import matplotlib.pyplot as plt
from random import seed
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 8
import sklearn 
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import sklearn.model_selection 
from sklearn import cross_validation, metrics

In [9]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [2]:
train_data = pd.read_csv('train_data.csv')
test_data = pd.read_csv('test_data.csv')

In [23]:
# print train_data.head()
print train_data.shape
print train_data['target'].value_counts()

(169307, 43)
0    98868
2    36854
1    33585
Name: target, dtype: int64


In [4]:
target = 'target'
id_col = 'connection_id'

In [13]:
def modelfit(alg, dtrain, dtest, predictors, performCV=True, printFeatureImportance=True, cv_folds=5):
    
    alg.fit(dtrain[predictors], dtrain['target'])
    dtrain_pred = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
    
    cv_score = cross_validation.cross_val_score(alg, dtrain[predictors], dtrain['target'], cv = cv_folds, scoring = 'roc_auc')
    cv_score = np.sqrt(np.abs(cv_score))
    
    print "\nModel Report"
    print "Accuracy : %.4g" % metrics.accuracy_score(dtrain['target'].values, dtrain_predicts)
    print "AUC Score (Train): %.4g" % metrics.roc_auc_score(dtrain['target'], dtrain_predprob)
    
    if performCV:
        print "CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" %(np.mean(cv_score), np.std(cv_score), np.min(cv_score), np.max(cv_score))
    
    if printFeatureImportance:
        coeff_rf_1 = pd.Series(alg_rf_1.feature_importances_, predictors).sort_values(ascending = False)
        coeff_rf_1.plot(kind = 'bar', title = 'Feature Importances')
               
    
#     idcol.append(target)
#     submission = pd.DataFrame({x : dtest[x] for x in idcol})
#     submission.to_csv(filename, index = False)

# Variable Importance plot (Random Forest Algorithm)

In [None]:
seed(100)
predictors = [x for x in train_data.columns if x not in [target, id_col]]
alg_rf_1 = RandomForestClassifier(n_estimators=200,max_depth=5, min_samples_leaf=100,n_jobs=4)
modelfit(alg_rf_1, train_data, test_data, predictors, target, id_col, 'alg_rf_1.csv')
coeff_rf_1 = pd.Series(alg_rf_1.feature_importances_, predictors).sort_values(ascending = False)
coeff_rf_1.plot(kind = 'bar', title = 'Feature Importances')

# Grid Search Cross-Validation 

In [6]:
from sklearn.model_selection import GridSearchCV

In [12]:
predictors = [x for x in train_data.columns if x not in [target, id_col]]
param_grids = {'max_depth' : range(5, 16, 2)}
gsearch1 = GridSearchCV(estimator = RandomForestClassifier(n_estimators = 200), param_grid = param_grids)

In [15]:
gsearch1.fit(train_data[predictors], train_data[target])

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [5, 7, 9, 11, 13, 15]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [18]:
print gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

[mean: 0.77835, std: 0.00020, params: {'max_depth': 5}, mean: 0.77981, std: 0.00018, params: {'max_depth': 7}, mean: 0.78051, std: 0.00007, params: {'max_depth': 9}, mean: 0.78075, std: 0.00011, params: {'max_depth': 11}, mean: 0.78068, std: 0.00008, params: {'max_depth': 13}, mean: 0.78050, std: 0.00017, params: {'max_depth': 15}] {'max_depth': 11} 0.780747399694




In [19]:
param_grids_2 = {'min_samples_split' : range(200, 2000, 200)}
gsearch2 = GridSearchCV(estimator = RandomForestClassifier(n_estimators = 200, max_depth = 11), param_grid = param_grids_2)
gsearch2.fit(train_data[predictors], train_data[target])

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=11, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'min_samples_split': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [20]:
print gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

[mean: 0.78028, std: 0.00015, params: {'min_samples_split': 200}, mean: 0.77957, std: 0.00012, params: {'min_samples_split': 400}, mean: 0.77886, std: 0.00041, params: {'min_samples_split': 600}, mean: 0.77861, std: 0.00014, params: {'min_samples_split': 800}, mean: 0.77834, std: 0.00021, params: {'min_samples_split': 1000}, mean: 0.77797, std: 0.00020, params: {'min_samples_split': 1200}, mean: 0.77735, std: 0.00060, params: {'min_samples_split': 1400}, mean: 0.77745, std: 0.00012, params: {'min_samples_split': 1600}, mean: 0.77659, std: 0.00078, params: {'min_samples_split': 1800}] {'min_samples_split': 200} 0.780280791698




In [21]:
param_grids_3 = {'max_features' : range(5, 40, 5)}
gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators = 200, max_depth = 11, min_samples_split = 200), param_grid = param_grids_3)
gsearch3.fit(train_data[predictors], train_data[target])

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=11, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=200, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_features': [5, 10, 15, 20, 25, 30, 35]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [22]:
print gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

[mean: 0.78006, std: 0.00002, params: {'max_features': 5}, mean: 0.78063, std: 0.00004, params: {'max_features': 10}, mean: 0.78069, std: 0.00004, params: {'max_features': 15}, mean: 0.78065, std: 0.00008, params: {'max_features': 20}, mean: 0.78063, std: 0.00006, params: {'max_features': 25}, mean: 0.78040, std: 0.00028, params: {'max_features': 30}, mean: 0.77989, std: 0.00041, params: {'max_features': 35}] {'max_features': 15} 0.780688335391




In [29]:
modelfit(gsearch3.best_estimator_, train_data, test_data, predictors, target, id_col, 'alg_rf_2.csv')


Model Report
RMSE : 0.9328
CV Score : Mean - 0.9329 | Std - 0.0004645 | Min - 0.9317 | Max - 0.9335


# Xtreme Boosting Classifier

In [33]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [38]:
target_le = le.transform(train_data['target'])
print target_le

[2 0 0 ..., 1 0 1]


In [30]:
test_results = pd.read_csv('test_data.csv')
def modelfit1(alg, dtrain, dtest, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
#         xgb_param['num_class'] = 3
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=target_le)
        xgtest = xgb.DMatrix(dtest[predictors].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='mlogloss', early_stopping_rounds=early_stopping_rounds, show_stdv=True)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['target'])
#     ,eval_metric='mlogloss')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #Print model report:
    print "\nModel Report"
    print "Accuracy : %.4g" % metrics.accuracy_score(dtrain['target'].values, dtrain_predictions)
    print "AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['target'], dtrain_predprob)
    
#     Predict on testing data:
    dtest['predprob'] = alg.predict_proba(dtest[predictors])[:,1]
    results = test_results.merge(dtest[['connection_id','predprob']], on='connection_id')
    print 'AUC Score (Test): %f' % metrics.roc_auc_score(results['target'], results['predprob'])
                
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [31]:
predictors = [x for x in train_data.columns if x not in [target, id_col]]
xgb1 = XGBClassifier(learning_rate = 0.1
                    , n_estimators = 100 
                    , objective = 'multi:softprob'
                    , max_depth = 5
                    , min_child_weight = 1
                    , gamma = 0
                    , subsample = 0.8
                    , colsample_bytree = 0.8
                    , nthread = 4
                    , scale_pos_weight = 1
                    , seed = 10)
modelfit1(xgb1, train_data, test_data, predictors)


Model Report
Accuracy : 0.7815


ValueError: multiclass format is not supported