In [3]:
import pandas as pd
import numpy as np

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn import cross_validation
from sklearn.grid_search import GridSearchCV

In [4]:
def get_data(X_train_file, y_train_file, X_test_file, y_test_file):
    """Load train/test features and labels from CSV files as numpy arrays.

    Each file is read with ``pd.read_csv`` using its default options.

    Parameters
    ----------
    X_train_file, X_test_file : path to a CSV of features (one row per sample).
    y_train_file, y_test_file : path to a CSV of labels (one row per sample).

    Returns
    -------
    (X_train, y_train, X_test, y_test)
        Feature arrays are 2-D ``(n_samples, n_features)``; label arrays are
        1-D ``(n_samples,)``.

    Raises
    ------
    ValueError
        From ``reshape`` when a label file does not hold exactly one value per
        feature row, or when the test features do not have the same number of
        columns as the training features.
    """
    sources = (X_train_file, y_train_file, X_test_file, y_test_file)
    features_train, labels_train, features_test, labels_test = [
        pd.read_csv(source) for source in sources
    ]

    # The training frame fixes the feature count expected of both feature sets.
    train_rows, feature_count = features_train.shape
    test_rows = features_test.shape[0]

    # The reshapes double as shape checks (see Raises above).
    return (
        np.array(features_train).reshape((train_rows, feature_count)),
        np.array(labels_train).reshape(train_rows,),
        np.array(features_test).reshape((test_rows, feature_count)),
        np.array(labels_test).reshape(test_rows,),
    )

In [6]:
def get_scores(y_test,y_pred):
    # Reads labels and predictions and gives accuracy, precision, recall & confusion matrix

    cm = confusion_matrix(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)

    prec = np.around(np.diag(cm).astype(float)*100/cm.sum(axis = 0), decimals =2)
    rec = np.around(np.diag(cm).astype(float)*100/cm.sum(axis = 1), decimals =2)

    cm_full = np.vstack((cm,prec))  # adding precision row 
    cm_full = np.hstack((cm_full,(np.append(rec,np.around(acc*100,decimals=2))).reshape(len(cm_full),1))) # adding recall column & total accuracy


    print 'Accuracy: ', np.around(acc*100,decimals=2)
    print 'Precision: ', np.around(prec*100,decimals=2)
    print 'Recall: ', np.around(rec*100,decimals=2)
    print 'Confusion Matrix (Activities: Walking, Upstairs, Downstairs, Standing, Sitting, Laying'
    print cm
    print 'Confusion Matrix & Scores (Actual Activities & Precision vs. Predicted Activies & Recall; Total Accuracy)'
    print cm_full  

    return acc, prec, rec, cm, cm_full

In [7]:
def do_grid_search(est, parameters, X_train, y_train):
    """Grid-search ``parameters`` for estimator ``est`` with 10-fold stratified CV.

    Parameters
    ----------
    est : estimator instance passed to ``GridSearchCV``.
    parameters : parameter grid passed to ``GridSearchCV``.
    X_train, y_train : training features and labels; ``y_train`` also defines
        the stratified folds.

    Returns
    -------
    (best_score, best_params)
        ``best_score_`` and ``best_params_`` of the fitted ``GridSearchCV``.
    """
    folds = cross_validation.StratifiedKFold(y_train, n_folds=10, random_state=42)
    search = GridSearchCV(est, parameters, cv=folds, n_jobs=-1)
    search.fit(X_train, y_train)
    return search.best_score_, search.best_params_

In [50]:
def do_svc(X_train, y_train, X_test, svc_parameters):
    # Read data and give SVC prediction for th best parameters

    svc = svm.SVC()
    svc_best_score, svc_best_params = do_grid_search(svc, svc_parameters, X_train, y_train)
    print 'SVC best score is: ', svc_best_score
    print 'SVC best parameters are: ', svc_best_params

    svc_opt = svm.SVC(C=svc_best_params['C'], kernel=svc_best_params['kernel'], degree=3, gamma=svc_best_params['gamma'], 
            coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False,
            max_iter=-1, random_state=None)
    svc_opt.fit(X_train, y_train)
    return svc_opt.predict(X_test)

In [5]:
 # Load the train/test feature and label arrays used by every cell below.
 X_train, y_train, X_test, y_test = get_data(
     'X_train_header.txt',
     'y_train_header.txt',
     'X_test_header.txt',
     'y_test_header.txt',
 )

In [9]:
# Coarse SVC grid: two values each for C and gamma, linear and RBF kernels.
C_range = [0.1, 10]
gamma_range = [1e-08, 1.0]
svc_parameters = dict(kernel=('linear', 'rbf'), C=C_range, gamma=gamma_range)

In [13]:
# Tune, refit and predict with the SVC using the current svc_parameters grid.
y_pred_svc = do_svc(X_train=X_train, y_train=y_train, X_test=X_test,
                    svc_parameters=svc_parameters)

SVC best score is:  0.947633297062
SVC best parameters are:  {'kernel': 'linear', 'C': 10, 'gamma': 1e-08}


NameError: global name 'svm_best_params' is not defined

In [16]:
# Best parameters copied by hand from the grid-search output printed above.
svc_best_params = dict(kernel='linear', C=10, gamma=1e-08)

In [19]:
    # Fit an SVC with the hand-entered svc_best_params and predict the test set.
    svc_opt = svm.SVC(
        C=svc_best_params['C'],
        kernel=svc_best_params['kernel'],
        degree=3,
        gamma=svc_best_params['gamma'],
        coef0=0.0,
        shrinking=True,
        probability=False,
        tol=0.001,
        cache_size=200,
        class_weight=None,
        verbose=False,
        max_iter=-1,
        random_state=None,
    )
    svc_opt.fit(X_train, y_train)
    y_pred_svc = svc_opt.predict(X_test)

In [21]:
    svc_acc, svc_prec, svc_rec, svc_cm, svc_cm_full = get_scores(y_test,y_pred_svc)
    print 'SVC Accuracy: ', np.around(svc_acc*100,decimals=2)
    print 'SVC Precision: ', np.around(svc_prec*100,decimals=2)
    print 'SVC Recall: ', np.around(svc_rec*100,decimals=2)
    print 'SVC Confusion Matrix (Activities: Walking, Upstairs, Downstairs, Standing, Sitting, Laying'
    print svc_cm
    print 'SVC Confusion Matrix & Scores (Actual Activities & Precision vs. Predicted Activies & Recall; Total Accuracy)'
    print svc_cm_full 

SVC Accuracy:  96.27
SVC Precision:  96.36
SVC Recall:  96.27
SVC Confusion Matrix (Activities: Walking, Upstairs, Downstairs, Standing, Sitting, Laying
[[492   1   3   0   0   0]
 [ 18 451   2   0   0   0]
 [  4   6 410   0   0   0]
 [  0   3   0 432  56   0]
 [  0   0   0  17 515   0]
 [  0   0   0   0   0 537]]
SVC Confusion Matrix & Scores (Actual Activities & Precision vs. Predicted Activies & Recall; Total Accuracy)
[[ 492.      1.      3.      0.      0.      0.     99.19]
 [  18.    451.      2.      0.      0.      0.     95.75]
 [   4.      6.    410.      0.      0.      0.     97.62]
 [   0.      3.      0.    432.     56.      0.     87.98]
 [   0.      0.      0.     17.    515.      0.     96.8 ]
 [   0.      0.      0.      0.      0.    537.    100.  ]
 [  95.72   97.83   98.8    96.21   90.19  100.     96.27]]


  sample_weight=sample_weight)
  sample_weight=sample_weight)


In [10]:
def do_rfc(X_train, y_train, X_test, rfc_parameters):
    # Read data and give RFC prediction for th best parameters

    rfc = RandomForestClassifier()
    rfc_best_score, rfc_best_params = do_grid_search(rfc, rfc_parameters, X_train, y_train)
    print 'RFC best parameters are: ', rfc_best_params

    rfc_opt = RandomForestClassifier(n_estimators=rfc_best_params['n_estimators'], criterion=rfc_best_params['criterion'], 
            max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, 
            max_features=rfc_best_params['max_features'], max_leaf_nodes=None, bootstrap=True, oob_score=False, n_jobs=1, 
            random_state=None, verbose=0, warm_start=False, class_weight=None)
    rfc_opt.fit(X_train, y_train)

    feature_importances = np.argsort(rfc_opt.feature_importances_) 
    print '10 most importantant features are with column numbers: ', feature_importances[-1:-11:-1] # reverse order

    return rfc_opt.predict(X_test)


In [45]:
    # Coarse random-forest grid: two values per tuned parameter.
    n_estimators_range = [5, 1000]
    # With roughly 561 features: log2 is about 9 and sqrt is about 24.
    max_features_range = [2, 9]
    rfc_parameters = dict(criterion=('gini', 'entropy'),
                          n_estimators=n_estimators_range,
                          max_features=max_features_range)

In [46]:
    # Tune, refit and predict with the random forest using the current rfc_parameters grid.
    y_pred_rfc = do_rfc(X_train=X_train, y_train=y_train, X_test=X_test,
                        rfc_parameters=rfc_parameters)

RFC best parameters are:  {'max_features': 9, 'n_estimators': 1000, 'criterion': 'gini'}
10 most importantant features are with column numbers:  [ 52  56 558  40  53  49  41  50 559  57]


In [48]:
    rfc_acc, rfc_prec, rfc_rec, rfc_cm, rfc_cm_full = get_scores(y_test,y_pred_rfc)
    print 'RFC Accuracy: ', np.around(rfc_acc*100,decimals=2)
    print 'RFC Precision: ', np.around(rfc_prec*100,decimals=2)
    print 'RFC Recall: ', np.around(rfc_rec*100,decimals=2)
    print 'RFC Confusion Matrix (Activities: Walking, Upstairs, Downstairs, Standing, Sitting, Laying'
    print rfc_cm
    print 'RFC Confusion Matrix & Scores (Actual Activities & Precision vs. Predicted Activies & Recall; Total Accuracy)'
    print rfc_cm_full

RFC Accuracy:  94.16
RFC Precision:  94.32
RFC Recall:  94.16
RFC Confusion Matrix (Activities: Walking, Upstairs, Downstairs, Standing, Sitting, Laying
[[485   2   9   0   0   0]
 [ 27 438   6   0   0   0]
 [ 22  47 351   0   0   0]
 [  0   0   0 444  46   1]
 [  0   0   0  12 520   0]
 [  0   0   0   0   0 537]]
RFC Confusion Matrix & Scores (Actual Activities & Precision vs. Predicted Activies & Recall; Total Accuracy)
[[ 485.      2.      9.      0.      0.      0.     97.78]
 [  27.    438.      6.      0.      0.      0.     92.99]
 [  22.     47.    351.      0.      0.      0.     83.57]
 [   0.      0.      0.    444.     46.      1.     90.43]
 [   0.      0.      0.     12.    520.      0.     97.74]
 [   0.      0.      0.      0.      0.    537.    100.  ]
 [  90.82   89.94   95.9    97.37   91.87   99.81   94.16]]


  sample_weight=sample_weight)
  sample_weight=sample_weight)


In [49]:
    # Finer SVC grid: eleven roughly log-spaced values each for C (0.1 .. 10000)
    # and gamma (1e-08 .. 1), for both the linear and the RBF kernel.
    C_range = [0.1, 0.316, 1, 3.16, 10, 31.6, 100, 316, 1000, 3160, 10000]
    gamma_range = [
        1.0000e-08, 6.3096e-08, 3.9811e-07, 2.5119e-06, 1.5849e-05, 0.0001,
        0.00063096, 0.0039811, 0.025119, 0.15849, 1.0000,
    ]
    svc_parameters = dict(kernel=('linear', 'rbf'), C=C_range, gamma=gamma_range)

In [51]:
# Tune, refit and predict with the SVC using the current svc_parameters grid.
y_pred_svc = do_svc(X_train=X_train, y_train=y_train, X_test=X_test,
                    svc_parameters=svc_parameters)

SVC best score is:  0.957698585419
SVC best parameters are:  {'kernel': 'rbf', 'C': 10, 'gamma': 0.025119}


In [52]:
    svc_acc, svc_prec, svc_rec, svc_cm, svc_cm_full = get_scores(y_test,y_pred_svc)
    print 'SVC Accuracy: ', np.around(svc_acc*100,decimals=2)
    print 'SVC Precision: ', np.around(svc_prec*100,decimals=2)
    print 'SVC Recall: ', np.around(svc_rec*100,decimals=2)
    print 'SVC Confusion Matrix (Activities: Walking, Upstairs, Downstairs, Standing, Sitting, Laying'
    print svc_cm
    print 'SVC Confusion Matrix & Scores (Actual Activities & Precision vs. Predicted Activies & Recall; Total Accuracy)'
    print svc_cm_full 

  sample_weight=sample_weight)
  sample_weight=sample_weight)


SVC Accuracy:  96.34
SVC Precision:  96.44
SVC Recall:  96.34
SVC Confusion Matrix (Activities: Walking, Upstairs, Downstairs, Standing, Sitting, Laying
[[486   5   5   0   0   0]
 [ 12 458   1   0   0   0]
 [  5  29 386   0   0   0]
 [  0   2   0 449  40   0]
 [  0   0   0   9 523   0]
 [  0   0   0   0   0 537]]
SVC Confusion Matrix & Scores (Actual Activities & Precision vs. Predicted Activies & Recall; Total Accuracy)
[[ 486.      5.      5.      0.      0.      0.     97.98]
 [  12.    458.      1.      0.      0.      0.     97.24]
 [   5.     29.    386.      0.      0.      0.     91.9 ]
 [   0.      2.      0.    449.     40.      0.     91.45]
 [   0.      0.      0.      9.    523.      0.     98.31]
 [   0.      0.      0.      0.      0.    537.    100.  ]
 [  96.62   92.71   98.47   98.03   92.9   100.     96.34]]


In [None]:
    # Random-forest grid that also tunes the tree depth.
    n_estimators_range = [5, 10, 25, 50, 100, 500, 1000]
    max_features_range = [2, 5, 9, 24]    # log2(nfeatures=561) = 9, sqrt(561) = 24
    max_depth_range = [6, 8, 10, 20]
    # Grid keys must be the estimator's own parameter names: the depth values
    # go under 'max_depth' ('max_depth_range' is not a RandomForestClassifier
    # parameter and is rejected by the grid search).
    rfc_parameters = {'criterion':('gini', 'entropy'), 'n_estimators': n_estimators_range, 'max_features': max_features_range,
                     'max_depth': max_depth_range}

In [8]:
# Random-forest grid with large forests and max_features between roughly
# log2 (about 9) and sqrt (about 24) of the ~561 features.
n_estimators_range = [1000, 5000, 10000]
max_features_range = [8, 9, 10, 15, 24]
rfc_parameters = dict(criterion=('gini', 'entropy'),
                      n_estimators=n_estimators_range,
                      max_features=max_features_range)

In [None]:
# Tune, refit and predict with the random forest using the current rfc_parameters grid.
y_pred_rfc = do_rfc(X_train=X_train, y_train=y_train, X_test=X_test,
                    rfc_parameters=rfc_parameters)