## Train classifier by layers

This notebook trains a classifier that operates in two layers:
- First, we use an SVM classifier to label the utterances it can classify with a high degree of certainty.
- Afterwards, we use heuristics to complete the labeling.

### Import and path definition

In [1]:
import os
import pandas as pd
import numpy as np
import random
import pickle
import sys
import matplotlib.pyplot as plt

# The project root is two directory levels above the current working
# directory; it is appended to sys.path so that `src` can be imported.
notebook_dir = os.path.abspath(os.getcwd())
root_path = os.path.dirname(os.path.dirname(notebook_dir))
sys.path.append(root_path)

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from src import phase_classification as pc

# Data lives under <root>/data; the tables live under <root>/data/tables.
data_path = os.path.join(root_path, 'data')
tables_path = os.path.join(root_path, 'data', 'tables')

### Load data

In [2]:
# Experiment configuration.
SEED = 10             # seeds `random`, the discussion split and the SVM
NUM_TOPICS = 60       # number of topics; part of the input file name
WITH_STEMMING = True  # stemming flag; part of the input file name
CLASS_W = False       # if True, the SVM is trained with per-class weights
random.seed(SEED)

In [3]:
# The table name encodes the stemming flag and the number of topics.
file_name = '[train]IBL_topic_distribution_by_utterance_with_phrase_before_and_after_time_utterance_minimum_0_words_with_stemming_{}_{}.xlsx'.format(WITH_STEMMING,NUM_TOPICS)
train_table_path = os.path.join(tables_path, 'train', file_name)
df_data = pd.read_excel(train_table_path)

### Data description

In [4]:
# Preview the first rows to sanity-check the loaded columns.
df_data.head()

Unnamed: 0,Topic before 1,Topic before 2,Topic before 3,Topic before 4,Topic before 5,Topic before 6,Topic before 7,Topic before 8,Topic before 9,Topic before 10,...,Topic after 60,phase,phase_1,phase_2,phase_3,phase_4,phase_5,utterance,length utterance,utterance_relative_time
0,0.022871,0.006985,0.015633,0.007367,0.005062,0.003185,0.008658,0.012719,0.003149,0.005543,...,0.003787,1,1,0,0,0,0,SATUNNAINEN,0.0,0.0
1,0.022871,0.006985,0.015633,0.007367,0.005062,0.003185,0.008658,0.012719,0.003149,0.005543,...,0.004338,1,1,0,0,0,0,siel QUESTION_SYMBOL onks oo palo,0.065574,0.010417
2,0.018513,0.005654,0.012654,0.005964,0.004097,0.002578,0.007008,0.010369,0.002549,0.004487,...,0.004043,1,1,0,0,0,0,no tää varm muutu,0.04918,0.017361
3,0.025611,0.006476,0.014495,0.006832,0.004693,0.002954,0.008028,0.011817,0.00292,0.00514,...,0.00302,1,1,0,0,0,0,aja T_ ATOMI ehtiny törm ton ver,0.098361,0.020833
4,0.020057,0.006037,0.013513,0.006368,0.004375,0.002753,0.007483,0.010995,0.002722,0.004792,...,0.005076,1,1,0,0,0,0,ton ver ai pitä muute kato sil taas oo lask to...,0.213115,0.027778


In [5]:
# Class balance: number of utterances per phase label and each phase's share.
# value_counts() makes a single pass over the column (instead of one full
# list scan per label) and ignores missing labels, so no zero-count class can
# appear; sort_index() makes the reporting order deterministic.
phase_counts = df_data['phase'].value_counts().sort_index()
the_keys = list(phase_counts.index)
class_samples = {key: int(count) for key, count in zip(phase_counts.index, phase_counts.values)}
total_samples = sum(class_samples.values())
print(total_samples)
# Share of each phase (rounded to 2 decimals), in the same order as the_keys.
proportions = []
for key in the_keys:
    proportion = round(class_samples[key]*1.0/total_samples,2)
    proportions.append(proportion)
    print("key {}, samples: {}, prop: {}".format(key,class_samples[key],proportion))

1600
key 1, samples: 403, prop: 0.25
key 2, samples: 175, prop: 0.11
key 3, samples: 406, prop: 0.25
key 4, samples: 62, prop: 0.04
key 5, samples: 554, prop: 0.35


### Split data

In [6]:
# Column positions handed to the project helpers.
# NOTE(review): presumably columns 0-179 are the topic distributions, 180 is
# `phase` (the label) and 187/188 are `length utterance` and
# `utterance_relative_time` — confirm against the table layout.
row_label = 180
filter_rows = list(range(row_label)) + [187, 188]

# Split with a 0.2 validation fraction (seeded), then build the feature
# vectors and labels for each part.
dfs_train, dfs_val = pc.split_df_discussions(df_data, 0.2, SEED)
X_train, y_train = pc.get_joined_data_from_df(dfs_train, filter_rows, row_label)
X_val, y_val = pc.get_joined_data_from_df(dfs_val, filter_rows, row_label)
len(X_train)

1328

In [7]:
# Same pipeline with a validation fraction of 0, so that every discussion
# ends up in the first part; the second part is discarded.
dfs_all, _ = pc.split_df_discussions(df_data, 0.0, SEED)
X_all, y_all = pc.get_joined_data_from_df(dfs_all, filter_rows, row_label)

### Classify first layer

In [8]:
# Length of the first training sample, i.e. the number of features per utterance.
len(X_train[0])

182

In [9]:
# Inverse-frequency weights: the fewer samples a phase has, the larger its weight.
class_weight = {key: 1000.0/class_samples[key] for key in the_keys}

Note: without `class_weight` (i.e. with `CLASS_W = False`), the accuracy is about 0.5.

In [10]:
# Linear SVM with probability estimates enabled. The per-class weights are
# only used when CLASS_W is set; class_weight=None is the estimator default.
svc = SVC(
    kernel='linear',
    random_state=SEED,
    probability=True,
    class_weight=class_weight if CLASS_W else None,
)
svc.fit(X_train, y_train)

# Mean accuracy on the training split and on the held-out validation split
# (the second message calls the validation split the "test set").
train_accuracy = svc.score(X_train, y_train)
val_accuracy = svc.score(X_val, y_val)
print('Accuracy of SVM classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of SVM classifier on test set: {:.2f}'.format(val_accuracy))

Accuracy of SVM classifier on training set: 0.51
Accuracy of SVM classifier on test set: 0.52


In [11]:
# Per-class metrics and confusion matrix on the validation split.
pred = svc.predict(X_val)
phase_ids = list(range(1,6))
labels = ["Phase {}".format(i) for i in phase_ids]
# Passing labels= pins the matrix to 5x5 even when a phase is missing from
# both y_val and pred; without it the matrix shrinks and no longer matches
# the five row/column names below.
df = pd.DataFrame(
    confusion_matrix(y_val, pred, labels=phase_ids),
    index=labels,
    columns=["Predicted {}".format(label) for label in labels],
)
print(classification_report(y_val, pred))
df

              precision    recall  f1-score   support

           1       0.65      0.55      0.60       101
           2       0.00      0.00      0.00         9
           3       0.80      0.06      0.12        63
           4       0.00      0.00      0.00         3
           5       0.45      0.84      0.58        96

   micro avg       0.52      0.52      0.52       272
   macro avg       0.38      0.29      0.26       272
weighted avg       0.59      0.52      0.46       272



  'precision', 'predicted', average, warn_for)


Unnamed: 0,Predicted Phase 1,Predicted Phase 2,Predicted Phase 3,Predicted Phase 4,Predicted Phase 5
Phase 1,56,0,1,0,44
Phase 2,0,0,0,0,9
Phase 3,15,0,4,0,44
Phase 4,0,0,0,0,3
Phase 5,15,0,0,0,81


In [12]:
# NOTE(review): the message says "training set", but the score is computed on
# X_all (every sample, i.e. training and validation splits together).
print('Accuracy of SVM classifier on training set: {:.2f}'.format(svc.score(X_all, y_all)))

Accuracy of SVM classifier on training set: 0.51


In [13]:
# Class-probability estimates for every sample in X_all (despite the "val"
# in the name, the input is the full data set, not the validation split).
pred_val = svc.predict_proba(X_all)

In [14]:
# First-layer label = the class with the highest estimated probability.
# The columns of predict_proba follow svc.classes_, so the winning column is
# mapped through classes_ instead of assuming the labels are exactly 1..n.
# Note: for SVC the probability estimates may disagree with svc.predict /
# svc.score, which is why the report built from these labels differs from
# the accuracy printed by svc.score on the same data.
output_first_layer = list(svc.classes_[np.argmax(pred_val, axis=1)])

In [15]:
# Store the first-layer prediction next to each utterance (adds a column to
# df_data in place).
# NOTE(review): assumes the rows of X_all are in the same order as df_data —
# confirm that pc.split_df_discussions / pc.get_joined_data_from_df keep the
# original row order.
df_data['first_layer'] = output_first_layer

In [16]:
# Preview again: the frame now carries the extra `first_layer` column.
df_data.head()

Unnamed: 0,Topic before 1,Topic before 2,Topic before 3,Topic before 4,Topic before 5,Topic before 6,Topic before 7,Topic before 8,Topic before 9,Topic before 10,...,phase,phase_1,phase_2,phase_3,phase_4,phase_5,utterance,length utterance,utterance_relative_time,first_layer
0,0.022871,0.006985,0.015633,0.007367,0.005062,0.003185,0.008658,0.012719,0.003149,0.005543,...,1,1,0,0,0,0,SATUNNAINEN,0.0,0.0,1
1,0.022871,0.006985,0.015633,0.007367,0.005062,0.003185,0.008658,0.012719,0.003149,0.005543,...,1,1,0,0,0,0,siel QUESTION_SYMBOL onks oo palo,0.065574,0.010417,1
2,0.018513,0.005654,0.012654,0.005964,0.004097,0.002578,0.007008,0.010369,0.002549,0.004487,...,1,1,0,0,0,0,no tää varm muutu,0.04918,0.017361,1
3,0.025611,0.006476,0.014495,0.006832,0.004693,0.002954,0.008028,0.011817,0.00292,0.00514,...,1,1,0,0,0,0,aja T_ ATOMI ehtiny törm ton ver,0.098361,0.020833,1
4,0.020057,0.006037,0.013513,0.006368,0.004375,0.002753,0.007483,0.010995,0.002722,0.004792,...,1,1,0,0,0,0,ton ver ai pitä muute kato sil taas oo lask to...,0.213115,0.027778,1


In [20]:
# Save the table with the first-layer predictions; the class-weight setting
# is encoded in the file-name prefix. The file goes directly into
# tables_path (not into the 'train' sub-folder the input was read from).
output_name = '[one_layer][cw_{}]'.format(CLASS_W)+file_name
df_data.to_excel(os.path.join(tables_path, output_name))

In [21]:
# Persist the fitted classifier; the file name records the class-weight setting.
model_path = os.path.join(data_path,'classifier_svm_one_layer_cw_{}.pickle'.format(CLASS_W))
with open(model_path,'wb') as model_file:
    pickle.dump(svc,model_file)

In [22]:
# Per-class metrics and confusion matrix of the first-layer labels on the
# full data set (rows: true phase, columns: predicted phase).
predicted_columns = ["Predicted {}".format(label) for label in labels]
df = pd.DataFrame(
    confusion_matrix(y_all, output_first_layer),
    index=labels,
    columns=predicted_columns,
)
print(classification_report(y_all, output_first_layer))
df

              precision    recall  f1-score   support

           1       0.54      0.65      0.59       403
           2       0.58      0.30      0.39       175
           3       0.70      0.42      0.53       406
           4       0.88      0.11      0.20        62
           5       0.53      0.74      0.61       554

   micro avg       0.56      0.56      0.56      1600
   macro avg       0.65      0.44      0.47      1600
weighted avg       0.60      0.56      0.55      1600



Unnamed: 0,Predicted Phase 1,Predicted Phase 2,Predicted Phase 3,Predicted Phase 4,Predicted Phase 5
Phase 1,261,17,20,0,105
Phase 2,45,52,10,0,68
Phase 3,80,3,172,0,151
Phase 4,1,5,3,7,46
Phase 5,93,12,39,1,409


In [23]:
# NOTE(review): labelled "training set", but evaluated on X_all (all samples).
# This uses svc.predict internally, not the argmax of predict_proba.
accuracy_message = 'Accuracy of SVM classifier on training set: {:.2f}'
print(accuracy_message.format(svc.score(X_all, y_all)))

Accuracy of SVM classifier on training set: 0.51
