## Train classifier by layers

This notebook trains a classifier that operates in two layers:
- First we use a SVM classifier to label utterances with high degree of certainty.
- Afterwards we use heuristics to complete the labeling

### Import and path definition

In [2]:
import os
import pandas as pd
import numpy as np
import random
import pickle
import sys
import matplotlib.pyplot as plt

root_path = os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd())))
sys.path.append(root_path)

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from src import phase_classification as pc

data_path = os.path.join(root_path,'data')
tables_path = os.path.join(data_path,'tables')

### Load data

In [3]:
WITH_STEMMING = True
#REMOVE_STOPWORDS = True
SEED = 10
NUM_TOPICS = 60
random.seed(SEED)

In [4]:
file_name = '[train]IBL_topic_distribution_by_utterance_minimum_5_words_with_stemming_{}_{}.xlsx'.format(WITH_STEMMING,NUM_TOPICS)
df_data = pd.read_excel(os.path.join(tables_path,'train',file_name))

### Data description

In [5]:
df_data.head()

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,...,Topic 60,phase,phase_1,phase_2,phase_3,phase_4,phase_5,utterance,length utterance,utterance_relative_time
0,0.009546,0.012683,0.006349,0.005774,0.003684,0.005008,0.004698,0.00868,0.012694,0.015815,...,0.011693,1,1,0,0,0,0,SATUNNAINEN,0.0,0.0
1,0.102344,0.009047,0.004529,0.004119,0.002628,0.003573,0.003351,0.006192,0.009057,0.011282,...,0.008342,1,1,0,0,0,0,siel QUESTION_SYMBOL onks oo palo,0.065574,0.010417
2,0.102352,0.009047,0.004529,0.004119,0.002628,0.003573,0.003351,0.006192,0.009055,0.011302,...,0.008342,1,1,0,0,0,0,no tää varm muutu,0.04918,0.017361
3,0.005717,0.007596,0.003803,0.077458,0.002207,0.003,0.002814,0.005199,0.007603,0.009472,...,0.007004,1,1,0,0,0,0,aja T_ ATOMI ehtiny törm ton ver,0.098361,0.020833
4,0.11164,0.004865,0.002435,0.002215,0.001413,0.001921,0.001802,0.003329,0.004869,0.006066,...,0.004486,1,1,0,0,0,0,ton ver ai pitä muute kato sil taas oo lask to...,0.213115,0.027778


In [6]:
the_keys = list(set(df_data['phase']))
total_samples = 0
class_samples = {}
for key in the_keys:
    n = list(df_data.phase.values).count(key)
    #print("key {}, total {}".format(key,n))
    total_samples += n
    class_samples[key] = n
print(total_samples)
proportions = []
for key in the_keys:
    proportion = round(class_samples[key]*1.0/total_samples,2)    
    print("key {}, samples: {}, prop: {}".format(key,class_samples[key],proportion))

1600
key 1, samples: 403, prop: 0.25
key 2, samples: 175, prop: 0.11
key 3, samples: 406, prop: 0.25
key 4, samples: 62, prop: 0.04
key 5, samples: 554, prop: 0.35


### Split data

In [7]:
filter_rows = list(range(60))+[67,68]
row_label = 60

In [8]:
dfs_train,dfs_val = pc.split_df_discussions(df_data,.2,SEED)
X_train,y_train = pc.get_joined_data_from_df(dfs_train,filter_rows,row_label)
X_val,y_val = pc.get_joined_data_from_df(dfs_val,filter_rows,row_label)

In [9]:
len(X_train)

1328

In [10]:
dfs_all,_ = pc.split_df_discussions(df_data,.0,SEED)
X_all,y_all = pc.get_joined_data_from_df(dfs_all,filter_rows,row_label)

### Classify first layer

In [11]:
import importlib
importlib.reload(pc)

<module 'src.phase_classification' from 'C:\\Users\\CATALINA ESPINOZA\\Documents\\ciae\\Classification_IBL\\src\\phase_classification.py'>

In [12]:
class_weight = {}
for key in the_keys:
    class_weight[key] = 1000.0/class_samples[key]

In [13]:
svc = SVC(kernel='linear',random_state=SEED,max_iter=5000,probability=True,class_weight=class_weight)
svc.fit(X_train, y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(svc.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svc.score(X_val, y_val)))

Accuracy of SVM classifier on training set: 0.40
Accuracy of SVM classifier on test set: 0.33


In [14]:
pred = svc.predict(X_val)
labels = ["Phase {}".format(i) for i in range(1,6)]
df = pd.DataFrame(confusion_matrix(y_val, pred),columns=["Predicted {}".format(i) for i in labels])
df.index = labels
#print(" ")
print(classification_report(y_val, pred))
df

              precision    recall  f1-score   support

           1       0.68      0.60      0.64       101
           2       0.09      0.56      0.15         9
           3       0.30      0.05      0.08        63
           4       0.04      1.00      0.08         3
           5       0.51      0.20      0.29        96

   micro avg       0.33      0.33      0.33       272
   macro avg       0.32      0.48      0.25       272
weighted avg       0.51      0.33      0.36       272



Unnamed: 0,Predicted Phase 1,Predicted Phase 2,Predicted Phase 3,Predicted Phase 4,Predicted Phase 5
Phase 1,61,20,3,10,7
Phase 2,0,5,0,3,1
Phase 3,16,15,3,19,10
Phase 4,0,0,0,3,0
Phase 5,13,18,4,42,19


In [15]:
pred_val = svc.predict_proba(X_all)

In [16]:
output_first_layer = [np.argmax(pred_val[i])+1 for i in range(len(pred_val))]

In [17]:
df_data['first_layer'] = output_first_layer

In [18]:
df_data.head()

Unnamed: 0,Topic 1,Topic 2,Topic 3,Topic 4,Topic 5,Topic 6,Topic 7,Topic 8,Topic 9,Topic 10,...,phase,phase_1,phase_2,phase_3,phase_4,phase_5,utterance,length utterance,utterance_relative_time,first_layer
0,0.009546,0.012683,0.006349,0.005774,0.003684,0.005008,0.004698,0.00868,0.012694,0.015815,...,1,1,0,0,0,0,SATUNNAINEN,0.0,0.0,1
1,0.102344,0.009047,0.004529,0.004119,0.002628,0.003573,0.003351,0.006192,0.009057,0.011282,...,1,1,0,0,0,0,siel QUESTION_SYMBOL onks oo palo,0.065574,0.010417,1
2,0.102352,0.009047,0.004529,0.004119,0.002628,0.003573,0.003351,0.006192,0.009055,0.011302,...,1,1,0,0,0,0,no tää varm muutu,0.04918,0.017361,1
3,0.005717,0.007596,0.003803,0.077458,0.002207,0.003,0.002814,0.005199,0.007603,0.009472,...,1,1,0,0,0,0,aja T_ ATOMI ehtiny törm ton ver,0.098361,0.020833,1
4,0.11164,0.004865,0.002435,0.002215,0.001413,0.001921,0.001802,0.003329,0.004869,0.006066,...,1,1,0,0,0,0,ton ver ai pitä muute kato sil taas oo lask to...,0.213115,0.027778,1


In [19]:
df_data.to_excel(os.path.join(tables_path,'[one_layer]'+file_name))

In [20]:
with open(os.path.join(data_path,'classifier_svm_one_layer_one_utterance.pickle'),'wb') as f:
    pickle.dump(svc,f)

In [21]:
df = pd.DataFrame(confusion_matrix(y_all, output_first_layer),columns=["Predicted {}".format(i) for i in labels])
df.index = labels
print(classification_report(y_all, output_first_layer))
df

              precision    recall  f1-score   support

           1       0.54      0.66      0.59       403
           2       0.54      0.04      0.07       175
           3       0.68      0.35      0.46       406
           4       0.67      0.03      0.06        62
           5       0.48      0.76      0.58       554

   micro avg       0.52      0.52      0.52      1600
   macro avg       0.58      0.37      0.36      1600
weighted avg       0.56      0.52      0.48      1600



Unnamed: 0,Predicted Phase 1,Predicted Phase 2,Predicted Phase 3,Predicted Phase 4,Predicted Phase 5
Phase 1,267,4,28,0,104
Phase 2,42,7,6,0,120
Phase 3,84,0,142,0,180
Phase 4,0,1,2,2,57
Phase 5,103,1,30,1,419


In [26]:
print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(svc.score(X_all, output_first_layer)))

Accuracy of SVM classifier on training set: 0.51
