In [11]:
#!/bin/python

import numpy as np
import pandas as pd
import os
import pickle
import sys
import scipy
from pathlib import Path
from collections import Counter
import random
import copy

# Machine Learning libraries
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn import metrics
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import average_precision_score

s_trn_file_path = "../surf_bow/surf_400_trn.csv"
s_val_file_path = "../surf_bow/surf_400_val.csv"
s_test_file_path = "../surf_bow/surf_400_test.csv"

#s_trn_file_path = "../cnn_bow/cnn_trn.csv"
#s_val_file_path = "../cnn_bow/cnn_val.csv"
#s_test_file_path = "../cnn_bow/cnn_test.csv"

In [12]:
s_train_df = pd.read_csv(s_trn_file_path, index_col='Unnamed: 0')
s_train_df.drop(['name'], axis=1, inplace=True )

train_df = s_train_df
train_df.target.fillna('P000', inplace=True)
train_df.fillna(0.0, inplace=True)

##

### tf_idf conversion

# 1. Save target column, and drop if from dataframe
train_df_target = pd.DataFrame(train_df['target'], columns=['target'])
train_df.drop(['target'], axis=1, inplace=True )

# 2. Replace frequencies with tf_idf scores
tf_transformer = TfidfTransformer(use_idf=True).fit(train_df)
X_train_tf = tf_transformer.transform(train_df)
train_df = pd.DataFrame(X_train_tf.todense(), columns=train_df.columns.values)

# 3. Add back the target column
train_df = pd.concat([train_df, train_df_target], axis=1)

##

s_test_df = pd.read_csv(s_val_file_path, index_col='Unnamed: 0')
s_test_df.drop(['name'], axis=1, inplace=True )

test_df = s_test_df
test_df.target.fillna('P000', inplace=True)
test_df.fillna(0.0, inplace=True)

##

### tf_idf conversion

# 1. Save target column, and drop if from dataframe
test_df_target = pd.DataFrame(test_df['target'], columns=['target'])
test_df.drop(['target'], axis=1, inplace=True )

# 2. Replace frequencies with tf_idf scores
tf_transformer = TfidfTransformer(use_idf=True).fit(test_df)
X_train_tf = tf_transformer.transform(test_df)
test_df = pd.DataFrame(X_train_tf.todense(), columns=test_df.columns.values)

# 3. Add back the target column
test_df = pd.concat([test_df, test_df_target], axis=1)

##

# Machine Learning
prediction_var = list(train_df.columns)
prediction_var.remove('target')

# Get input training data
train_X = train_df[prediction_var]

# Get input target variable
train_y = train_df.target

print(train_X.shape)
print(train_X.head())
print(train_y.shape)
print(train_y.head())

(834, 400)
         S0        S1        S2        S3        S4        S5        S6  \
0  0.001223  0.001459  0.027367  0.018513  0.003287  0.001316  0.138583   
1  0.014347  0.003422  0.047592  0.063718  0.029106  0.010969  0.062388   
2  0.011718  0.006722  0.039280  0.019939  0.023727  0.010915  0.178425   
3  0.086588  0.002020  0.038083  0.179733  0.051087  0.000959  0.052355   
4  0.056484  0.000551  0.027366  0.156085  0.073481  0.015179  0.026794   

         S7        S8        S9  ...      S390      S391      S392      S393  \
0  0.017243  0.000000  0.003856  ...  0.013926  0.001387  0.000221  0.096645   
1  0.014330  0.000426  0.033317  ...  0.017063  0.003254  0.003020  0.068192   
2  0.036323  0.000863  0.018699  ...  0.021212  0.007629  0.002187  0.070927   
3  0.002350  0.001508  0.014987  ...  0.015350  0.008165  0.000509  0.023601   
4  0.016925  0.003840  0.064901  ...  0.017797  0.012577  0.004447  0.046960   

       S394      S395      S396      S397      S398  S399

In [13]:
# Machine Learning
prediction_var = list(test_df.columns)
prediction_var.remove('target')

# Get test data feature
test_X = test_df[prediction_var]

# Get test data target
test_y = test_df.target

print(test_X.shape)
print(test_X.head())
print(test_y.shape)
print(test_y.head())

(398, 400)
         S0        S1        S2        S3        S4        S5        S6  \
0  0.009888  0.004123  0.116588  0.094998  0.066686  0.007710  0.055892   
1  0.022243  0.002129  0.154429  0.055656  0.058430  0.008530  0.102337   
2  0.029053  0.003213  0.126304  0.172730  0.069670  0.003825  0.029915   
3  0.008119  0.007272  0.058020  0.025384  0.054530  0.002960  0.186647   
4  0.002420  0.022335  0.121208  0.031558  0.021832  0.001323  0.081232   

         S7        S8        S9  ...      S390      S391      S392      S393  \
0  0.024954  0.001430  0.270837  ...  0.009074  0.006835  0.001013  0.049247   
1  0.008162  0.002173  0.008829  ...  0.007937  0.013315  0.009805  0.028784   
2  0.007535  0.003086  0.025696  ...  0.004491  0.006194  0.002209  0.054789   
3  0.013849  0.003880  0.053611  ...  0.015061  0.002967  0.003969  0.092107   
4  0.016293  0.000289  0.017291  ...  0.011558  0.001658  0.000473  0.093540   

       S394      S395      S396      S397      S398  S399

In [27]:
my_dict = {'P000':0.0000001, 'P001': 97, 'P002': 24, 'P003': 44}

classifiers = [
    svm.SVC(gamma='scale', class_weight=my_dict,decision_function_shape = 'ovr', kernel='rbf'),
]

for model in classifiers:
    clf = model

    # Fit the model to training
    clf.fit(train_X,train_y)

    # Check prediction accuracy
    prediction = clf.predict(test_X)
    print(type(clf), metrics.f1_score(prediction,test_y,average=None))

<class 'sklearn.svm.classes.SVC'> [0.         0.13407821 0.36734694 0.16814159]


  'recall', 'true', average, warn_for)


In [28]:
# Get average precision scores

clf = svm.SVC(gamma='scale', probability=True, class_weight=my_dict,decision_function_shape = 'ovr')

# Fit the model to training
clf.fit(train_X,train_y)

# Check prediction accuracy
prediction = clf.decision_function(test_X)

prob_list = prediction[:,1]
x = np.array([test_y == 'P001'][0]).astype(int)
print('P001 &', round(average_precision_score(x,prob_list, pos_label=1),4))

prob_list = prediction[:,2]
x = np.array([test_y == 'P002'][0]).astype(int)
print('P002 &', round(average_precision_score(x,prob_list, pos_label=1),4))

prob_list = prediction[:,3]
x = np.array([test_y == 'P003'][0]).astype(int)
print('P003 &', round(average_precision_score(x,prob_list, pos_label=1),4))

P001 & 0.2867
P002 & 0.3892
P003 & 0.0917
