In [1]:
import pandas as pd
import numpy as np

In [2]:
import pickle
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [3]:
from laserembeddings import Laser

laser = Laser()

In [4]:
dt = pd.read_csv('train.csv')
dt

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,2
2,3931fbe82a,Des petites choses comme celles-là font une di...,J'essayais d'accomplir quelque chose.,fr,French,0
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0
4,86aaa48b45,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร,th,Thai,1
...,...,...,...,...,...,...
12115,2b78e2a914,The results of even the most well designed epi...,All studies have the same amount of uncertaint...,en,English,2
12116,7e9943d152,But there are two kinds of the pleasure of do...,But there are two kinds of the pleasure of doi...,en,English,0
12117,5085923e6c,The important thing is to realize that it's wa...,"It cannot be moved, now or ever.",en,English,2
12118,fc8e2fd1fe,At the west end is a detailed model of the who...,The model temple complex is at the east end.,en,English,2


In [5]:
x = dt.premise.values +' '+ dt.hypothesis.values
y = dt.label.values
z = dt.lang_abv.values
# split train and test
x_train, x_test, y_train, y_test,z_train,z_test = train_test_split(x, y,z, test_size=0.3, random_state=0)

In [6]:
x_train

array(['เมื่อการตัดสินใจเหล่านี้ได้เกิดขึ้นไปแล้ว, องค์กร CIO ต้องให้การสนับสนุนที่มีประสิทธิภาพและพร้อมตอบสนองผ่านการจัดสรรทรัพยากรที่มีประสิทธิภาพและการดำเนินการตามหน้าที่ในแต่ละวัน องค์กร CIO มักจะมีบุคลากรเจ็ดคนที่คอยให้การสนับสนุนในกรณีเหล่านี้',
       "Do you know how long we've been here? he asked one morning as they sat facing each other at breakfast. They were sitting across from each other at breakfast.  ",
       "Look for these items in the picturesque open-air market of Sa Penya (Ibiza Town) or for a wider selection at the bustling, covered central market in the newer part of town (carrer d'Extremadura). You can't find anything at the open-air market.",
       ...,
       'Anfang Mai 1996 bekam die CIA mit, dass Bin Laden den Sudan vielleicht verlassen hatte. Die CIA war sich sicher, dass Bin Ladin mehrere Jahre im Sudan bleiben würde.',
       'Sie war es nun, die sich verteidigte, ihre Stimme zitterte vor Entrüstung. Die Frau war so erfreut, dass sie sprachlos war!',
  

In [7]:
embeddings = laser.embed_sentences([x for x in x_train], lang = z_train)

In [8]:
test_emb = laser.embed_sentences([x for x in x_test], lang = z_test)

In [9]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
model = RandomForestClassifier()
model.fit(embeddings,y_train)

predictions = model.predict(test_emb)
print('Classification report:')
print(classification_report(y_test, predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)

Classification report:
              precision    recall  f1-score   support

           0       0.38      0.44      0.41      1237
           1       0.35      0.28      0.32      1147
           2       0.49      0.50      0.49      1252

    accuracy                           0.41      3636
   macro avg       0.41      0.41      0.41      3636
weighted avg       0.41      0.41      0.41      3636

Confusion matrix:
[[544 336 357]
 [524 326 297]
 [371 257 624]]


In [11]:
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [12]:
# data preprocessing
from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler

scaler =  MinMaxScaler()
scaler.fit(embeddings)
X_train = scaler.transform(embeddings)
X_test = scaler.transform(test_emb)

In [14]:
param_grid = [{#'sel__k': [8, 10, 5],
    'learner__max_depth': [80, 100],
    'learner__max_features': [2, 3],
    'learner__min_samples_leaf': [3, 4],
    #'learner__min_samples_split': [8, 10],
    'learner__n_estimators': [100, 300]}]

pipe = Pipeline([
    #('sel', SelectKBest(chi2)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', RandomForestClassifier(bootstrap = True))  # learning algorithm
])

n_jobs = 3 #Number of jobs to run in parallel
pipe_search = GridSearchCV(pipe, param_grid, cv=5, n_jobs = n_jobs, verbose=True).fit(X_train,y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [20]:
pipe_search.best_params_

{'learner__max_depth': 100,
 'learner__max_features': 3,
 'learner__min_samples_leaf': 4,
 'learner__n_estimators': 300}

In [15]:
predictions = pipe_search.best_estimator_.predict(X_test)
print('Classification report:')
print(classification_report(y_test, predictions))
print('Confusion matrix:')
cm = confusion_matrix(y_test, predictions)
print(cm)

Classification report:
              precision    recall  f1-score   support

           0       0.33      0.41      0.36      1237
           1       0.33      0.27      0.30      1147
           2       0.41      0.38      0.39      1252

    accuracy                           0.35      3636
   macro avg       0.36      0.35      0.35      3636
weighted avg       0.36      0.35      0.35      3636

Confusion matrix:
[[505 346 386]
 [546 315 286]
 [488 294 470]]


In [16]:
# without pipeline
clf = LinearSVC(C=1.0, random_state=42)
clf.fit(embeddings, y_train)

y_pred = clf.predict(test_emb)

print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))
print(classification_report(y_test, y_pred))

Accuracy 0.5002750275027503
F1-score [0.49922958 0.44928826 0.54859967]
              precision    recall  f1-score   support

           0       0.48      0.52      0.50      1237
           1       0.46      0.44      0.45      1147
           2       0.57      0.53      0.55      1252

    accuracy                           0.50      3636
   macro avg       0.50      0.50      0.50      3636
weighted avg       0.50      0.50      0.50      3636



In [18]:
#with pipeline and grid serach
param_grid = [{'sel__k': [8, 10, 50,'all'], 'learner__C': [0.1, 0.5, 1, 10]}]

opt_pipeline = Pipeline([
    
    ('sel', SelectKBest(chi2)),  # feature selection
    ('tfidf', TfidfTransformer()),  # weighting
    ('learner', LinearSVC())  # learning algorithm
])

n_jobs = 3 #Number of jobs to run in parallel
SVM_opt_search = GridSearchCV(opt_pipeline, param_grid, n_jobs = n_jobs, verbose=True).fit(X_train,y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


In [19]:
opt_predictions = SVM_opt_search.best_estimator_.predict(X_test)

print('Classification report:')
print(classification_report(y_test, opt_predictions, zero_division = 1))
print('Confusion matrix:')
cm = confusion_matrix(y_test, opt_predictions)
print(cm)

Classification report:
              precision    recall  f1-score   support

           0       0.47      0.51      0.49      1237
           1       0.46      0.45      0.45      1147
           2       0.57      0.53      0.55      1252

    accuracy                           0.50      3636
   macro avg       0.50      0.50      0.50      3636
weighted avg       0.50      0.50      0.50      3636

Confusion matrix:
[[633 340 264]
 [397 513 237]
 [322 270 660]]
