In [30]:
#!/bin/python

import numpy as np
import pandas as pd
import os
import pickle
import sys
import scipy
from pathlib import Path
from collections import Counter
import random
import copy

# Machine Learning libraries
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import svm
from sklearn import metrics
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import average_precision_score

#s_trn_file_path = "../surf_bow/surf_400_trn.csv"
#s_val_file_path = "../surf_bow/surf_400_val.csv"
#s_test_file_path = "../surf_bow/surf_400_test.csv"

s_trn_file_path = "../cnn_bow/cnn_trn.csv"
s_val_file_path = "../cnn_bow/cnn_val.csv"
s_test_file_path = "../cnn_bow/cnn_test.csv"

In [31]:
s_train_df = pd.read_csv(s_trn_file_path, index_col='Unnamed: 0')
s_train_df.drop(['name'], axis=1, inplace=True )

train_df = s_train_df
train_df.target.fillna('P000', inplace=True)
train_df.fillna(0.0, inplace=True)

##

### tf_idf conversion

# 1. Save target column, and drop if from dataframe
train_df_target = pd.DataFrame(train_df['target'], columns=['target'])
train_df.drop(['target'], axis=1, inplace=True )

# 2. Replace frequencies with tf_idf scores
tf_transformer = TfidfTransformer(use_idf=True).fit(train_df)
X_train_tf = tf_transformer.transform(train_df)
train_df = pd.DataFrame(X_train_tf.todense(), columns=train_df.columns.values)

# 3. Add back the target column
train_df = pd.concat([train_df, train_df_target], axis=1)

##

s_test_df = pd.read_csv(s_val_file_path, index_col='Unnamed: 0')
s_test_df.drop(['name'], axis=1, inplace=True )

test_df = s_test_df
test_df.target.fillna('P000', inplace=True)
test_df.fillna(0.0, inplace=True)

##

### tf_idf conversion

# 1. Save target column, and drop if from dataframe
test_df_target = pd.DataFrame(test_df['target'], columns=['target'])
test_df.drop(['target'], axis=1, inplace=True )

# 2. Replace frequencies with tf_idf scores
tf_transformer = TfidfTransformer(use_idf=True).fit(test_df)
X_train_tf = tf_transformer.transform(test_df)
test_df = pd.DataFrame(X_train_tf.todense(), columns=test_df.columns.values)

# 3. Add back the target column
test_df = pd.concat([test_df, test_df_target], axis=1)

##

# Machine Learning
prediction_var = list(train_df.columns)
prediction_var.remove('target')

# Get input training data
train_X = train_df[prediction_var]

# Get input target variable
train_y = train_df.target

print(train_X.shape)
print(train_X.head())
print(train_y.shape)
print(train_y.head())

(836, 512)
         C0        C1        C2        C3        C4        C5        C6  \
0  0.012177  0.020665  0.002715  0.011379  0.002106  0.008183  0.011365   
1  0.050662  0.014998  0.018357  0.063370  0.039172  0.020285  0.030536   
2  0.016230  0.028441  0.034494  0.057352  0.072758  0.015954  0.055472   
3  0.013791  0.065487  0.018861  0.034573  0.014245  0.063072  0.019853   
4  0.026996  0.082660  0.032480  0.059334  0.047574  0.027603  0.027391   

         C7        C8        C9  ...      C502      C503      C504      C505  \
0  0.016399  0.031824  0.050133  ...  0.006781  0.029680  0.081971  0.008358   
1  0.021552  0.027552  0.039417  ...  0.023543  0.012310  0.047073  0.027224   
2  0.046889  0.026508  0.087392  ...  0.020093  0.028061  0.065274  0.037863   
3  0.031549  0.056315  0.025867  ...  0.012239  0.026160  0.040487  0.052220   
4  0.019620  0.049927  0.049876  ...  0.012089  0.022369  0.017669  0.028769   

       C506      C507      C508      C509      C510      

In [32]:
# Machine Learning
prediction_var = list(test_df.columns)
prediction_var.remove('target')

# Get test data feature
test_X = test_df[prediction_var]

# Get test data target
test_y = test_df.target

print(test_X.shape)
print(test_X.head())
print(test_y.shape)
print(test_y.head())

(400, 512)
         C0        C1        C2        C3        C4        C5        C6  \
0  0.005508  0.016140  0.075379  0.032253  0.044768  0.053543  0.026177   
1  0.028625  0.052391  0.015350  0.043559  0.018316  0.014576  0.029624   
2  0.034180  0.092929  0.049218  0.047921  0.051233  0.027205  0.025017   
3  0.015915  0.044615  0.014435  0.028330  0.016788  0.036427  0.026045   
4  0.017253  0.022074  0.004148  0.024496  0.026041  0.042671  0.017511   

         C7        C8        C9  ...      C502      C503      C504      C505  \
0  0.011162  0.022530  0.031513  ...  0.008718  0.013667  0.021851  0.038773   
1  0.034718  0.021745  0.093738  ...  0.014001  0.018368  0.020672  0.018656   
2  0.016860  0.025233  0.040352  ...  0.035970  0.020212  0.022943  0.039593   
3  0.044271  0.034250  0.019638  ...  0.010226  0.037259  0.059388  0.017961   
4  0.019294  0.067897  0.034130  ...  0.012174  0.019160  0.060336  0.024862   

       C506      C507      C508      C509      C510      

In [33]:
my_dict = {'P000':0.0000001, 'P001': 97, 'P002': 24, 'P003': 54}

classifiers = [
    svm.SVC(gamma='scale', class_weight=my_dict,decision_function_shape = 'ovr', kernel='rbf'),
]

for model in classifiers:
    clf = model

    # Fit the model to training
    clf.fit(train_X,train_y)

    # Check prediction accuracy
    prediction = clf.predict(test_X)
    print(type(clf), metrics.f1_score(prediction,test_y,average=None))

<class 'sklearn.svm.classes.SVC'> [0.         0.1863354  0.4047619  0.20853081]


  'recall', 'true', average, warn_for)


In [34]:
# Get average precision scores

clf = svm.SVC(gamma='scale', probability=True, class_weight='balanced',decision_function_shape = 'ovr')

# Fit the model to training
clf.fit(train_X,train_y)

# Check prediction accuracy
prediction = clf.decision_function(test_X)

prob_list = prediction[:,1]
x = np.array([test_y == 'P001'][0]).astype(int)
print('P001 &', round(average_precision_score(x,prob_list, pos_label=1),4))

prob_list = prediction[:,2]
x = np.array([test_y == 'P002'][0]).astype(int)
print('P002 &', round(average_precision_score(x,prob_list, pos_label=1),4))

prob_list = prediction[:,3]
x = np.array([test_y == 'P003'][0]).astype(int)
print('P003 &', round(average_precision_score(x,prob_list, pos_label=1),4))

P001 & 0.725
P002 & 0.7675
P003 & 0.5106
