### This notebook shows a comparison of different machine learning methods
- the input features were OpenAI embeddings, Support Vector Machines was the most accurate

##### Bootstrapping 
- with 500 texts in each class in the training set


In [1]:
# imports
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# load data
datafile_path = "bc_cam_with_ada_002_embeddings.csv"

df = pd.read_csv(datafile_path)
# convert all C level labels to B2
df['cefr'] = df['cefr'].replace({'C': 'B2'})
df

Unnamed: 0.1,Unnamed: 0,filename,text,cefr,embedding
0,0,A1Movers_1_1,"Look, Grandpa. My friend's family are in the g...",A1,"[0.010332267731428146, -0.0009531814139336348,..."
1,1,A1Movers_1_2,"Come quickly, children. The train's waiting to...",A1,"[0.002182072727009654, -6.590186239918694e-05,..."
2,2,A1Movers_1_3,"Hello, Mrs Castle. Hello Sally, Oh I'm tired. ...",A1,"[-0.00018498786084819585, 0.013357731513679028..."
3,3,A1Movers_1_4,"Dad, come and watch this DVD with me. What's i...",A1,"[0.017183320596814156, -0.00948919914662838, 0..."
4,4,A1Movers_1_5,Can you colour this mountain picture now? Yes!...,A1,"[0.01187464315444231, 0.009958968497812748, 0...."
...,...,...,...,...,...
723,723,C2Prof_16-20,"Today, we're talking to marine biologists Gina...",B2,"[0.0013554414035752416, -0.0029449746944010258..."
724,724,C2Prof_21-30,I knew I'd be short of money if I didn't work ...,B2,"[-0.007415663916617632, -0.02614154852926731, ..."
725,725,C2Prof_3-4,"Last year, Tim Fitzgerald exhibited photograph...",B2,"[-0.009252717718482018, 0.008551654405891895, ..."
726,726,C2Prof_5-6,One of my own thoughts about this piece is the...,B2,"[-0.02017894573509693, -0.001436770660802722, ..."


I adapted the code, so the texts in the training set could be bootstrapped (using resample in sklearn)
- maybe this code is not DRY (I think there are 1 or 2 uneccesary steps and it could be cleaner, but it works)

In [2]:
import ast
from sklearn.utils import resample

# Use ast.literal_eval to safely evaluate the string and convert it into a list
df['embedding'] = df['embedding'].apply(ast.literal_eval)

# create a column for each embedding
df_embeddings = pd.DataFrame(df['embedding'].to_list(), columns=[f'embed_{i}' for i in range(len(df['embedding'][0]))])

# Add the labels back
df_embeddings = pd.concat([df_embeddings, df["cefr"]], axis=1)

# split data into train and test
X_train, X_test, y_train, y_test = train_test_split(
    df_embeddings.drop('cefr', axis=1), df_embeddings['cefr'], test_size=0.2, random_state=160923
)

# Convert the training set lists to a DataFrame
df_train = pd.concat([X_train, y_train], axis=1)

# Separate the classes in the training set
class_A1 = df_train[df_train['cefr'] == "A1"]
class_A2 = df_train[df_train['cefr'] == "A2"]
class_B1 = df_train[df_train['cefr'] == "B1"]
class_B2 = df_train[df_train['cefr'] == "B2"]

# Bootstrap each class in the training set to have 500 samples
class_A1_sampled = resample(class_A1, replace=True, n_samples=500, random_state=160923)
class_A2_sampled = resample(class_A2, replace=True, n_samples=500, random_state=160923)
class_B1_sampled = resample(class_B1, replace=True, n_samples=500, random_state=160923)
class_B2_sampled = resample(class_B2, replace=True, n_samples=500, random_state=160923)

# Concatenate the bootstrapped classes back together
df_train_sampled = pd.concat([class_A1_sampled, class_A2_sampled, class_B1_sampled, class_B2_sampled])

# Now can use df_train_sampled for machine learning tasks
X_train_sampled = df_train_sampled.drop('cefr', axis=1)
y_train_sampled = df_train_sampled.cefr

RandomForestClassifier

In [26]:
# train random forest classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train_sampled, y_train_sampled)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

report = classification_report(y_test, preds)
print(report)

# print confusion matrix
cm = confusion_matrix(y_test, preds)
print("Confusion Matrix:")
print(cm)


              precision    recall  f1-score   support

          A1       0.83      0.62      0.71         8
          A2       0.75      0.46      0.57        13
          B1       0.77      0.55      0.64        44
          B2       0.74      0.93      0.82        81

    accuracy                           0.75       146
   macro avg       0.78      0.64      0.69       146
weighted avg       0.76      0.75      0.74       146

Confusion Matrix:
[[ 5  1  0  2]
 [ 1  6  2  4]
 [ 0  0 24 20]
 [ 0  1  5 75]]


Try Support Vector Machines

In [4]:
from sklearn import svm

# train SVM classifier
# the default kernel is rbf (gaussian)
clf = svm.SVC(probability=True)
clf.fit(X_train_sampled, y_train_sampled)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

report = classification_report(y_test, preds)
print(report)

# print confusion matrix
cm = confusion_matrix(y_test, preds)
print("Confusion Matrix:")
print(cm)

              precision    recall  f1-score   support

          A1       0.64      0.88      0.74         8
          A2       0.59      0.77      0.67        13
          B1       0.79      0.68      0.73        44
          B2       0.86      0.85      0.86        81

    accuracy                           0.79       146
   macro avg       0.72      0.79      0.75       146
weighted avg       0.80      0.79      0.80       146

Confusion Matrix:
[[ 7  1  0  0]
 [ 2 10  1  0]
 [ 1  2 30 11]
 [ 1  4  7 69]]


K-Nearest Neighbors (KNN): This is a type of instance-based learning that classifies a data point based on how its neighbors are classified.


In [28]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=3)

clf.fit(X_train_sampled, y_train_sampled)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

report = classification_report(y_test, preds)
print(report)

# print confusion matrix
cm = confusion_matrix(y_test, preds)
print("Confusion Matrix:")
print(cm)


              precision    recall  f1-score   support

          A1       0.42      0.62      0.50         8
          A2       0.44      0.54      0.48        13
          B1       0.49      0.75      0.59        44
          B2       0.86      0.54      0.67        81

    accuracy                           0.61       146
   macro avg       0.55      0.61      0.56       146
weighted avg       0.69      0.61      0.62       146

Confusion Matrix:
[[ 5  3  0  0]
 [ 2  7  4  0]
 [ 3  1 33  7]
 [ 2  5 30 44]]


Neural Networks: Deep learning models, especially neural networks, can be very effective on tasks with large amounts of data and many input features. Scikit-learn provides simple neural networks models through the MLPClassifier class.


In [29]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(hidden_layer_sizes=(10, 10, 10))

clf.fit(X_train_sampled, y_train_sampled)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

report = classification_report(y_test, preds)
print(report)

# print confusion matrix
cm = confusion_matrix(y_test, preds)
print("Confusion Matrix:")
print(cm)

              precision    recall  f1-score   support

          A1       0.57      0.50      0.53         8
          A2       0.53      0.62      0.57        13
          B1       0.80      0.64      0.71        44
          B2       0.83      0.91      0.87        81

    accuracy                           0.78       146
   macro avg       0.68      0.67      0.67       146
weighted avg       0.78      0.78      0.78       146

Confusion Matrix:
[[ 4  4  0  0]
 [ 3  8  1  1]
 [ 0  2 28 14]
 [ 0  1  6 74]]


In [10]:
len(X_train[0])

1536