# Demonstration of data loading and model training with BERT vectors

In [1]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score

In [4]:
# Directory holding the original CSV data, one level above this notebook
ORIGINAL_DATA_DIR = os.path.join(os.pardir, "Solution", "data")
# Directory holding the JSON-lines feature files read by the loading cells
BERT_FEATURE_DIR = "bert_output_data"

## Format training data

`X` will be a matrix with `N` rows for the `N` texts in the training data, and `M` columns for the `M` features generated by BERT.

`y` will be an array of `N` class labels for training.

In [5]:
# Load the training CSV from the original data directory
train_csv_path = os.path.join(ORIGINAL_DATA_DIR, "lang_id_train.csv")
train_df = pd.read_csv(train_csv_path)

In [6]:
# Sanity check: (number of rows, number of columns) of the training frame
train_df.shape

(6000, 2)

In [8]:
# Collect one [CLS] vector per JSON record, in file order, so that row i of the
# resulting list corresponds to row i of the training CSV.
bert_vectors = []
with open(os.path.join(BERT_FEATURE_DIR, "train.jsonlines"), "rt", encoding="utf-8") as infile:
    for line_no, line in enumerate(infile, start=1):
        if not line.strip():
            # A blank line carries no record, so skipping it cannot shift row alignment
            continue
        bert_data = json.loads(line)
        # Only extract the [CLS] vector used for classification.
        # layers[0] is the first layer stored for the token; NOTE(review): this is the
        # final network layer only if that layer was listed first when the features
        # were extracted -- confirm against the extraction settings.
        cls_vector = next(
            (t["layers"][0]["values"] for t in bert_data["features"] if t["token"] == "[CLS]"),
            None,
        )
        if cls_vector is None:
            # Silently skipping the record would misalign the feature rows and the labels
            raise ValueError(
                "No [CLS] token found in record on line {} of train.jsonlines".format(line_no)
            )
        bert_vectors.append(cls_vector)

In [9]:
# Number of [CLS] vectors collected; should equal the number of training rows
len(bert_vectors)

6000

In [10]:
# Feature matrix: one row per collected [CLS] vector
X = np.asarray(bert_vectors)
# Target labels taken from the training frame
label_column = train_df["native_language"]
y = label_column.values

## Train logistic regression model

In [11]:
# L2-penalised logistic regression fitted on the training features and labels.
# fit() returns the estimator itself, so the fitted model can be bound directly.
lr_model = LogisticRegression(C=1.0, penalty="l2").fit(X, y)
lr_model



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
# Load the test CSV from the original data directory
test_csv_path = os.path.join(ORIGINAL_DATA_DIR, "lang_id_test.csv")
test_df = pd.read_csv(test_csv_path)

In [14]:
# Collect one [CLS] vector per JSON record, in file order, so that row i of the
# resulting list corresponds to row i of the test CSV.
bert_vec_test = []
with open(os.path.join(BERT_FEATURE_DIR, "test.jsonlines"), "rt", encoding="utf-8") as infile:
    for line_no, line in enumerate(infile, start=1):
        if not line.strip():
            # A blank line carries no record, so skipping it cannot shift row alignment
            continue
        bert_data_test = json.loads(line)
        # Only extract the [CLS] vector used for classification.
        # layers[0] is the first layer stored for the token; NOTE(review): this is the
        # final network layer only if that layer was listed first when the features
        # were extracted -- confirm against the extraction settings.
        cls_vector = next(
            (t["layers"][0]["values"] for t in bert_data_test["features"] if t["token"] == "[CLS]"),
            None,
        )
        if cls_vector is None:
            # Silently skipping the record would misalign the feature rows and the labels
            raise ValueError(
                "No [CLS] token found in record on line {} of test.jsonlines".format(line_no)
            )
        bert_vec_test.append(cls_vector)

In [58]:
# Test feature matrix: one row per collected [CLS] vector
X_test = np.asarray(bert_vec_test)
# Gold labels taken from the test frame
test_label_column = test_df["native_language"]
y_test = test_label_column.values
# Display (rows, feature dimension) as a sanity check
X_test.shape

(2000, 768)

In [60]:
# Predicted class label for every row of the test feature matrix
y_pred = lr_model.predict(X_test)
# Show which distinct labels the model actually predicted
predicted_labels = np.unique(y_pred)
predicted_labels

array(['Arabic', 'Cantonese', 'Japanese', 'Korean', 'Mandarin', 'Polish',
       'Russian', 'Spanish', 'Thai', 'Vietnamese'], dtype=object)

In [31]:
# Score the fitted logistic regression model on the test features and labels
test_accuracy = lr_model.score(X_test, y_test)
accuracy_template = 'Accuracy of logistic regression classifier on test dataset: {:.4f}'
print(accuracy_template.format(test_accuracy))

Accuracy of logistic regression classifier on test dataset: 0.4560


In [20]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test labels against the logistic regression predictions
conf = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(conf)

In [23]:
# Classification report: per-class precision, recall, f1-score and support
from sklearn.metrics import classification_report

report_text = classification_report(y_test, y_pred)
print(report_text)

              precision    recall  f1-score   support

      Arabic       0.48      0.48      0.48       200
   Cantonese       0.31      0.29      0.30       200
    Japanese       0.47      0.53      0.50       200
      Korean       0.46      0.42      0.44       200
    Mandarin       0.31      0.32      0.31       200
      Polish       0.48      0.48      0.48       200
     Russian       0.51      0.57      0.54       200
     Spanish       0.52      0.50      0.51       200
        Thai       0.59      0.59      0.59       200
  Vietnamese       0.42      0.39      0.40       200

    accuracy                           0.46      2000
   macro avg       0.45      0.46      0.45      2000
weighted avg       0.45      0.46      0.45      2000



In [56]:
#CLASSIFICATION DATA WITH MISCLASSIFICATION RATE
# For each class, the rate is the percentage of ALL test samples that are involved
# in an error for that class: samples of the class predicted as something else
# (false negatives) plus samples of other classes predicted as this class
# (false positives).
test_df['y_pred'] = y_pred

# Derive every count from the confusion matrix instead of hard-coding the per-class
# support, the test-set size and the number of classes. This also stays aligned
# when a class is never predicted.
conf_counts = np.asarray(conf)
n_samples = conf_counts.sum()
true_pos = np.diag(conf_counts)
false_neg = conf_counts.sum(axis=1) - true_pos  # row sums are the true counts per class
false_pos = conf_counts.sum(axis=0) - true_pos  # column sums are the predicted counts per class
mcs = ((false_neg + false_pos) / n_samples * 100).tolist()

# By default confusion_matrix orders classes as the sorted union of true and predicted labels
l = sorted(set(test_df['native_language']) | set(test_df['y_pred']))
for class_name, rate in zip(l, mcs):
    print(class_name, '- {:.4f}'.format(rate))

Arabic - 10.4000
Cantonese - 13.6000
Japanese - 10.6000
Korean - 10.7000
Mandarin - 13.9000
Polish - 10.5000
Russian - 9.8000
Spanish - 9.7000
Thai - 8.2000
Vietnamese - 11.4000


In [64]:
#SUMMARY
# Count errors directly by comparing gold and predicted labels. Summing the
# per-class misclassification percentages and multiplying by 10 is only correct
# for a test set of exactly 2000 rows, and it reports the counts as floats.
n_total = test_df.shape[0]
n_incorrect = int(np.sum(np.asarray(y_test) != np.asarray(y_pred)))
print("Summary of Evaluation Findings:")
print("Total test data:", n_total)
print("Incorrect prediction:", n_incorrect)
print("Correct prediction:", n_total - n_incorrect)

Summary of Evaluation Findings:
Total test data: 2000
Incorrect prediction: 1088.0
Correct prediction: 912.0


In [26]:
#SVM 
from sklearn.svm import SVC

In [27]:
# Support vector classifier with an RBF kernel and fixed C / gamma settings
svc_rbf = SVC(
    kernel="rbf",
    C=1e3,
    gamma=0.1,
)

In [28]:
# Fit the SVM on the same training features and labels used for logistic regression
svc_rbf.fit(X,y)

SVC(C=1000.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [29]:
# Score the fitted SVM on the test features and labels, for comparison with the logistic regression result
svc_rbf.score(X_test,y_test)

0.3475