### This notebook shows a comparison of different machine learning methods
- the input features were BERT embeddings, K Nearest Neighbours was the most accurate

##### Bootstrapping 
- with 500 texts in each class in the training set


In [2]:
# imports
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# load data
datafile_path = "bc_cam_bert_embeddings.csv"

df = pd.read_csv(datafile_path)

df['cefr'] = df['filename'].str[:2]
# convert all C level labels to B2
df['cefr'] = df['cefr'].replace({'C1': 'B2', 'C2': 'B2'})
df

Unnamed: 0,filename,bert_1,bert_2,bert_3,bert_4,bert_5,bert_6,bert_7,bert_8,bert_9,...,bert_760,bert_761,bert_762,bert_763,bert_764,bert_765,bert_766,bert_767,bert_768,cefr
0,A1Movers_1_1,0.053010,0.293768,-0.097592,0.346060,0.383217,0.227773,0.167976,-0.140662,-0.051727,...,0.040110,-0.104577,-0.307515,0.023724,-0.186511,0.129780,0.211728,0.133709,-0.000617,A1
1,A1Movers_1_2,0.414724,0.265030,-0.254253,0.262461,0.390834,0.005777,0.160487,-0.017125,-0.163743,...,-0.006873,-0.131482,-0.299764,0.054476,-0.014990,0.220584,0.394695,-0.108314,0.111879,A1
2,A1Movers_1_3,0.311979,0.260253,-0.287606,0.144175,0.477552,0.144640,0.136279,-0.092433,-0.102352,...,-0.110906,-0.043273,-0.289649,0.075836,0.105506,0.341360,0.331848,-0.089021,0.036760,A1
3,A1Movers_1_4,0.168283,0.283034,-0.174079,0.160189,0.326133,0.182786,0.190903,0.019549,-0.031205,...,-0.019079,-0.116316,-0.387733,0.029947,-0.093259,0.178463,0.332097,0.003985,0.090877,A1
4,A1Movers_1_5,0.113609,0.013535,-0.062040,0.289461,0.302949,0.093426,0.060676,0.092458,-0.120968,...,0.057574,-0.120216,-0.294299,-0.204686,-0.083530,0.180238,0.424102,-0.012113,0.091106,A1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
723,C2Prof_16-20,0.401525,0.031742,-0.295002,-0.014102,0.473157,0.115989,-0.092625,0.104994,0.035862,...,0.069934,-0.223100,-0.089625,0.094113,0.173621,0.136563,0.221396,-0.035504,0.075902,B2
724,C2Prof_21-30,0.330500,0.121378,-0.341339,-0.041109,0.434514,0.115061,-0.054262,0.167414,-0.039859,...,0.060851,-0.299681,-0.128314,-0.010718,0.302664,0.241780,0.271533,-0.116985,0.036222,B2
725,C2Prof_3-4,0.201988,0.069796,-0.060673,0.190671,0.463188,0.168991,-0.028868,-0.002389,-0.154003,...,-0.029062,-0.339931,-0.423854,0.046176,0.072496,0.173474,0.287201,-0.003583,-0.003119,B2
726,C2Prof_5-6,0.288296,0.146299,-0.158673,0.102414,0.435514,0.109758,-0.014951,0.098363,0.190461,...,-0.090920,-0.452645,-0.381974,0.172661,0.119560,0.190162,0.227809,-0.041571,-0.037134,B2


I adapted the code, so the texts in the training set could be bootstrapped (using resample in sklearn)
- maybe this code is not DRY (I think there are 1 or 2 uneccesary steps and it could be cleaner, but it works)

In [7]:
import ast
from sklearn.utils import resample

# split data into train and test
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['cefr', 'filename'], axis=1), df['cefr'], test_size=0.2, random_state=160923
)

# Convert the training set lists to a DataFrame
df_train = pd.concat([X_train, y_train], axis=1)

# Separate the classes in the training set
class_A1 = df_train[df_train['cefr'] == "A1"]
class_A2 = df_train[df_train['cefr'] == "A2"]
class_B1 = df_train[df_train['cefr'] == "B1"]
class_B2 = df_train[df_train['cefr'] == "B2"]

# Bootstrap each class in the training set to have 500 samples
class_A1_sampled = resample(class_A1, replace=True, n_samples=500, random_state=160923)
class_A2_sampled = resample(class_A2, replace=True, n_samples=500, random_state=160923)
class_B1_sampled = resample(class_B1, replace=True, n_samples=500, random_state=160923)
class_B2_sampled = resample(class_B2, replace=True, n_samples=500, random_state=160923)

# Concatenate the bootstrapped classes back together
df_train_sampled = pd.concat([class_A1_sampled, class_A2_sampled, class_B1_sampled, class_B2_sampled])

# Now can use df_train_sampled for machine learning tasks
X_train_sampled = df_train_sampled.drop('cefr', axis=1)
y_train_sampled = df_train_sampled.cefr

RandomForestClassifier

In [8]:
# train random forest classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train_sampled, y_train_sampled)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

report = classification_report(y_test, preds)
print(report)

# print confusion matrix
cm = confusion_matrix(y_test, preds)
print("Confusion Matrix:")
print(cm)


              precision    recall  f1-score   support

          A1       0.57      0.50      0.53         8
          A2       0.50      0.38      0.43        13
          B1       0.59      0.52      0.55        44
          B2       0.78      0.86      0.82        81

    accuracy                           0.70       146
   macro avg       0.61      0.57      0.59       146
weighted avg       0.69      0.70      0.69       146

Confusion Matrix:
[[ 4  3  1  0]
 [ 2  5  4  2]
 [ 1  2 23 18]
 [ 0  0 11 70]]


Try Support Vector Machines

In [9]:
from sklearn import svm

# train SVM classifier
clf = svm.SVC(probability=True)
clf.fit(X_train_sampled, y_train_sampled)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

report = classification_report(y_test, preds)
print(report)

# print confusion matrix
cm = confusion_matrix(y_test, preds)
print("Confusion Matrix:")
print(cm)

              precision    recall  f1-score   support

          A1       0.45      0.62      0.53         8
          A2       0.43      0.46      0.44        13
          B1       0.60      0.57      0.58        44
          B2       0.82      0.80      0.81        81

    accuracy                           0.69       146
   macro avg       0.58      0.61      0.59       146
weighted avg       0.70      0.69      0.69       146

Confusion Matrix:
[[ 5  3  0  0]
 [ 4  6  3  0]
 [ 1  4 25 14]
 [ 1  1 14 65]]


K-Nearest Neighbors (KNN): This is a type of instance-based learning that classifies a data point based on how its neighbors are classified.


In [10]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=3)

clf.fit(X_train_sampled, y_train_sampled)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

report = classification_report(y_test, preds)
print(report)

# print confusion matrix
cm = confusion_matrix(y_test, preds)
print("Confusion Matrix:")
print(cm)


              precision    recall  f1-score   support

          A1       0.70      0.88      0.78         8
          A2       0.38      0.38      0.38        13
          B1       0.54      0.61      0.57        44
          B2       0.82      0.74      0.78        81

    accuracy                           0.68       146
   macro avg       0.61      0.65      0.63       146
weighted avg       0.69      0.68      0.68       146

Confusion Matrix:
[[ 7  1  0  0]
 [ 2  5  6  0]
 [ 1  3 27 13]
 [ 0  4 17 60]]


Neural Networks: Deep learning models, especially neural networks, can be very effective on tasks with large amounts of data and many input features. Scikit-learn provides simple neural networks models through the MLPClassifier class.


In [11]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(hidden_layer_sizes=(10, 10, 10))

clf.fit(X_train_sampled, y_train_sampled)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

report = classification_report(y_test, preds)
print(report)

# print confusion matrix
cm = confusion_matrix(y_test, preds)
print("Confusion Matrix:")
print(cm)

              precision    recall  f1-score   support

          A1       0.55      0.75      0.63         8
          A2       0.50      0.31      0.38        13
          B1       0.55      0.50      0.52        44
          B2       0.76      0.81      0.79        81

    accuracy                           0.67       146
   macro avg       0.59      0.59      0.58       146
weighted avg       0.66      0.67      0.66       146

Confusion Matrix:
[[ 6  2  0  0]
 [ 5  4  3  1]
 [ 0  2 22 20]
 [ 0  0 15 66]]


In [10]:
len(X_train[0])

1536