In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC
import csv
import timeit
import datetime
import sys

In [2]:
# Pickled feature matrices, loaded later with pd.read_pickle.
X_TRAIN_FILE, X_TEST_FILE = (
    '../X_train_without_biochem.pkl',
    '../X_test_without_biochem.pkl',
)
# Pickled label vectors matching the feature matrices above.
Y_TRAIN_FILE, Y_TEST_FILE = (
    '../y_train_without_biochem.pkl',
    '../y_test_without_biochem.pkl',
)
# Plain-text file that receives the evaluation metrics.
METRICS_FILE = 'metrics.txt'

In [3]:
# Remember the wall-clock start; the last cell subtracts it from the end time
# to report the total duration of the run.
script_start_time = datetime.datetime.now()
print(
    '{} started at {}'.format(sys.argv[0], script_start_time)
)

c:\users\richard\appdata\local\programs\python\python35\lib\site-packages\ipykernel_launcher.py started at 2018-06-05 15:37:08.169414


In [None]:
# Generate the rubric dictionary: maps the value in CSV column 4 (used as the
# lookup key, a read code) to the value in CSV column 3 (its rubric).
print('Generating rubric dictionary...', end='')
start_time = timeit.default_timer()
# NOTE(review): the previous version opened this same CSV twice (as "in" and
# "out") and merged the two identical dictionaries. A single pass yields the
# same mapping. If a second, different records file was intended, it still
# needs to be added here -- confirm with the data owner.
rubrics_path = '../input/large_anon_test_records_for_sharing.csv'
# `with` guarantees the handle is closed even if parsing raises;
# newline='' is what the csv module expects so quoted fields containing line
# breaks are parsed correctly.
with open(rubrics_path, mode='r', newline='') as rubrics_file:
    rubrics = {
        row[4]: row[3]
        for row in csv.reader(rubrics_file)
        # Blank lines come back as [] and would otherwise raise IndexError.
        if len(row) > 4
    }


def get_rubric(read_code):
    """Return the rubric recorded for ``read_code``.

    Args:
        read_code: Key to look up in the module-level ``rubrics`` dictionary.

    Returns:
        The rubric string for ``read_code``, or ``'unknown'`` when the code is
        not present in ``rubrics``.
    """
    return rubrics.get(read_code, 'unknown')
print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

In [4]:
# Load the train/test feature and label pickles named in the config cell.
# NOTE(review): pd.read_pickle executes arbitrary code from the file, so these
# pickles must come from a trusted source.
print('Reading data...', end='')
start_time = timeit.default_timer()
X_train = pd.read_pickle(X_TRAIN_FILE)
X_test = pd.read_pickle(X_TEST_FILE)
y_train = pd.read_pickle(Y_TRAIN_FILE)
y_test = pd.read_pickle(Y_TEST_FILE)
print(
    ' done in {:.2f}s'.format(timeit.default_timer() - start_time),
    flush=True,
)
# Show the class balance of the test labels.
print(y_test.value_counts())

Reading data... done in 0.05s
False    502
True     409
Name: label, dtype: int64


In [5]:
# Fit a linear-kernel SVC through a cross-validated grid search, score it on
# the train and test splits, and write the metrics to METRICS_FILE.
print('Performing grid search and training model...', end='')
start_time = timeit.default_timer()
# Wider C sweep, kept for reference:
# param_grid = [
#     {'C': np.logspace(-2, 10, 13), 'kernel': ['linear']}
# ]
param_grid = [
    {'C': [1.00], 'kernel': ['linear']}
]
grid = GridSearchCV(SVC(), param_grid=param_grid, n_jobs=16, verbose=50, cv=10)
grid.fit(X_train, y_train)
print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)
print('The best parameters are {} with a score of {}'.format(grid.best_params_, grid.best_score_))

print('Making predictions...', end='')
start_time = timeit.default_timer()
train_preds = grid.predict(X_train)
test_preds = grid.predict(X_test)
print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

train_accuracy = accuracy_score(y_train, train_preds)
test_accuracy = accuracy_score(y_test, test_preds)
# Format once so the console and the metrics file always agree.
accuracy_line = 'Train accuracy: {:.2f} Test accuracy: {:.2f}'.format(
    train_accuracy * 100, test_accuracy * 100.0)
print(accuracy_line)

# The results get their own names: assigning them to `classification_report`
# / `confusion_matrix` would overwrite the imported sklearn functions and make
# this cell fail when it is run a second time in the same kernel.
report_text = classification_report(y_test, test_preds)
print('Classification report:\n ', report_text)

conf_matrix = confusion_matrix(y_test, test_preds)
print('Confusion matrix:\n ', conf_matrix)

with open(METRICS_FILE, mode='w') as metrics_file:
    metrics_file.write(accuracy_line + '\n')
    metrics_file.write('Classification report: {}\n'.format(report_text))
    metrics_file.write('Confusion matrix: {}\n'.format(conf_matrix))
    metrics_file.write('Cross validation results: {}\n'.format(grid.cv_results_))
print('Metrics saved at {}'.format(METRICS_FILE))

Performing grid search and training model...Fitting 3 folds for each of 1 candidates, totalling 3 fits
[Parallel(n_jobs=16)]: Done   1 tasks      | elapsed:  1.1min
[Parallel(n_jobs=16)]: Done   3 out of   3 | elapsed:  1.2min remaining:    0.0s
[Parallel(n_jobs=16)]: Done   3 out of   3 | elapsed:  1.2min finished
 done in 128.21s
The best parameters are {'C': 1.0, 'kernel': 'linear'} with a score of 0.49366064202859455
Making predictions... done in 13.31s
Accuracy: 54.12
Classification report:
               precision    recall  f1-score   support

      False       0.58      0.62      0.60       502
       True       0.49      0.44      0.46       409

avg / total       0.54      0.54      0.54       911

Confusion matrix:
  [[313 189]
 [229 180]]
Metrics saved at metrics.txt


In [6]:
# Rank every feature by its coefficient in the fitted linear SVM and save the
# ranking as CSV and pickle.
print('Sorting importance...', end='')
start_time = timeit.default_timer()
svm = grid.best_estimator_
read_codes = list(X_test.columns)
# The previous `zip(columns, *svm.coef_)` only worked when coef_ had exactly
# one row, so taking row 0 pairs the same values with the same columns.
weights = svm.coef_[0]

# Build the frame row-per-feature instead of through a dict keyed by rubric:
# a dict silently dropped every feature whose rubric was already present
# (e.g. all codes that resolve to 'unknown' collapsed into a single row).
# The index is still the rubric and 'importance' is still the first column;
# 'read_code' is added so rows with the same rubric stay distinguishable.
importances = pd.DataFrame(
    {'importance': weights, 'read_code': read_codes},
    index=[get_rubric(code) for code in read_codes],
    columns=['importance', 'read_code'],
)

importances = importances.sort_values(by='importance', ascending=False)
importances.to_csv('importances_without_biochem.csv')
importances.to_pickle('importances_without_biochem.pkl')
# Finish the progress line started above, as the other cells do.
print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

Sorting importance...

In [7]:
# Report when the run finished and how long it took since script_start_time.
script_end_time = datetime.datetime.now()
print('{} completed at {}'.format(sys.argv[0], script_end_time))
print('Total time: {}'.format(script_end_time - script_start_time))

c:\users\richard\appdata\local\programs\python\python35\lib\site-packages\ipykernel_launcher.py completed at 2018-06-05 15:39:30.343357
Total time: 0:02:22.173943
