### This notebook shows a comparison of different machine learning methods
- the input features were linguistic features, Random Forest Classifier was the most accurate

##### Bootstrapping 
- with 500 texts in each class in the training set


In [1]:
# imports
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# load data
datafile_path = "bc_cam_5_taales_etc_variables.csv"

df = pd.read_csv(datafile_path)

df['cefr'] = df['filename'].str[:2]
# convert all C level labels to B2
df['cefr'] = df['cefr'].replace({'C1': 'B2', 'C2': 'B2'})
df

Unnamed: 0,filename,cefr,Dep_Clauses_per_Clause,lexical_density_types,B1,wpm,B2
0,A1_a_good_nights_sleep,A1,0.233333,0.728972,4.347826,110.973451,0.483092
1,A1_a_request_from_your_boss,A1,0.076923,0.642857,2.352941,135.000000,1.176471
2,A1_a_voicemail_message,A1,0.187500,0.666667,0.709220,103.235294,2.127660
3,A1_A2_episode_01_they_meet,A1,0.000000,0.614035,0.769231,49.390244,0.000000
4,A1_A2_episode_02_toms_party,A1,0.035714,0.539683,3.225806,57.200000,0.000000
...,...,...,...,...,...,...,...
723,C2Prof_16-20,B2,0.378947,0.793210,7.692308,201.115880,1.923077
724,C2Prof_21-30,B2,0.454545,0.772549,4.362416,218.282209,1.845638
725,C2Prof_3-4,B2,0.266667,0.660377,5.882353,182.686567,0.980392
726,C2Prof_5-6,B2,0.434783,0.660194,3.389831,178.000000,1.694915


Normalize the data
- the scales are all different for the TAALES, etc variables

In [2]:
from sklearn.preprocessing import StandardScaler

# Columns to normalize
columns_to_normalize = ['Dep_Clauses_per_Clause', 'lexical_density_types', 'B1', 'wpm', 'B2']

# Create a StandardScaler object
scaler = StandardScaler()

# Normalize the columns
df[columns_to_normalize] = scaler.fit_transform(df[columns_to_normalize])

df


Unnamed: 0,filename,cefr,Dep_Clauses_per_Clause,lexical_density_types,B1,wpm,B2
0,A1_a_good_nights_sleep,A1,-0.225103,0.322803,0.066933,-0.855838,-0.854962
1,A1_a_request_from_your_boss,A1,-1.378203,-1.038086,-0.768515,-0.220806,-0.307815
2,A1_a_voicemail_message,A1,-0.562999,-0.661819,-1.456898,-1.060361,0.442769
3,A1_A2_episode_01_they_meet,A1,-1.945302,-1.493566,-1.431766,-2.483510,-1.236170
4,A1_A2_episode_02_toms_party,A1,-1.682006,-2.668574,-0.402963,-2.277094,-1.236170
...,...,...,...,...,...,...,...
723,C2Prof_16-20,B2,0.848404,1.337967,1.467586,1.526667,0.281333
724,C2Prof_21-30,B2,1.405735,1.011459,0.073043,1.980380,0.220225
725,C2Prof_3-4,B2,0.020640,-0.761211,0.709586,1.039572,-0.462541
726,C2Prof_5-6,B2,1.260038,-0.764105,-0.334271,0.915704,0.101290


I adapted the code, so the texts in the training set could be bootstrapped (using resample in sklearn)
- maybe this code is not DRY (I think there are 1 or 2 uneccesary steps and it could be cleaner, but it works)

In [3]:
import ast
from sklearn.utils import resample

# split data into train and test
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['cefr', 'filename'], axis=1), df['cefr'], test_size=0.2, random_state=160923
)

# Convert the training set lists to a DataFrame
df_train = pd.concat([X_train, y_train], axis=1)

# Separate the classes in the training set
class_A1 = df_train[df_train['cefr'] == "A1"]
class_A2 = df_train[df_train['cefr'] == "A2"]
class_B1 = df_train[df_train['cefr'] == "B1"]
class_B2 = df_train[df_train['cefr'] == "B2"]

# Bootstrap each class in the training set to have 500 samples
class_A1_sampled = resample(class_A1, replace=True, n_samples=500, random_state=160923)
class_A2_sampled = resample(class_A2, replace=True, n_samples=500, random_state=160923)
class_B1_sampled = resample(class_B1, replace=True, n_samples=500, random_state=160923)
class_B2_sampled = resample(class_B2, replace=True, n_samples=500, random_state=160923)

# Concatenate the bootstrapped classes back together
df_train_sampled = pd.concat([class_A1_sampled, class_A2_sampled, class_B1_sampled, class_B2_sampled])

# Now can use df_train_sampled for machine learning tasks
X_train_sampled = df_train_sampled.drop('cefr', axis=1)
y_train_sampled = df_train_sampled.cefr

RandomForestClassifier

##### This is the best performing for linguistic features

In [4]:
# train random forest classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train_sampled, y_train_sampled)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

report = classification_report(y_test, preds)
print(report)

# print confusion matrix
cm = confusion_matrix(y_test, preds)
print("Confusion Matrix:")
print(cm)


              precision    recall  f1-score   support

          A1       0.60      0.75      0.67         8
          A2       0.64      0.69      0.67        13
          B1       0.54      0.57      0.56        44
          B2       0.78      0.73      0.75        81

    accuracy                           0.68       146
   macro avg       0.64      0.68      0.66       146
weighted avg       0.68      0.68      0.68       146

Confusion Matrix:
[[ 6  2  0  0]
 [ 2  9  2  0]
 [ 1  1 25 17]
 [ 1  2 19 59]]


Try Support Vector Machines

In [5]:
from sklearn import svm

# train SVM classifier
clf = svm.SVC(probability=True)
clf.fit(X_train_sampled, y_train_sampled)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

report = classification_report(y_test, preds)
print(report)

# print confusion matrix
cm = confusion_matrix(y_test, preds)
print("Confusion Matrix:")
print(cm)

              precision    recall  f1-score   support

          A1       0.56      0.62      0.59         8
          A2       0.39      0.69      0.50        13
          B1       0.51      0.55      0.53        44
          B2       0.82      0.68      0.74        81

    accuracy                           0.64       146
   macro avg       0.57      0.64      0.59       146
weighted avg       0.67      0.64      0.65       146

Confusion Matrix:
[[ 5  3  0  0]
 [ 2  9  2  0]
 [ 1  7 24 12]
 [ 1  4 21 55]]


K-Nearest Neighbors (KNN): This is a type of instance-based learning that classifies a data point based on how its neighbors are classified.


In [6]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=3)

clf.fit(X_train_sampled, y_train_sampled)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

report = classification_report(y_test, preds)
print(report)

# print confusion matrix
cm = confusion_matrix(y_test, preds)
print("Confusion Matrix:")
print(cm)


              precision    recall  f1-score   support

          A1       0.33      0.25      0.29         8
          A2       0.42      0.62      0.50        13
          B1       0.40      0.48      0.44        44
          B2       0.70      0.59      0.64        81

    accuracy                           0.54       146
   macro avg       0.46      0.48      0.47       146
weighted avg       0.56      0.54      0.55       146

Confusion Matrix:
[[ 2  6  0  0]
 [ 2  8  2  1]
 [ 1  2 21 20]
 [ 1  3 29 48]]


Neural Networks: Deep learning models, especially neural networks, can be very effective on tasks with large amounts of data and many input features. Scikit-learn provides simple neural networks models through the MLPClassifier class.


In [7]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(hidden_layer_sizes=(10, 10, 10))

clf.fit(X_train_sampled, y_train_sampled)
preds = clf.predict(X_test)
probas = clf.predict_proba(X_test)

report = classification_report(y_test, preds)
print(report)

# print confusion matrix
cm = confusion_matrix(y_test, preds)
print("Confusion Matrix:")
print(cm)

              precision    recall  f1-score   support

          A1       0.75      0.75      0.75         8
          A2       0.38      0.69      0.49        13
          B1       0.52      0.50      0.51        44
          B2       0.83      0.74      0.78        81

    accuracy                           0.66       146
   macro avg       0.62      0.67      0.63       146
weighted avg       0.69      0.66      0.67       146

Confusion Matrix:
[[ 6  2  0  0]
 [ 2  9  2  0]
 [ 0 10 22 12]
 [ 0  3 18 60]]




In [10]:
len(X_train[0])

1536