In [72]:
import pandas as pd

df = pd.read_csv('analyzed.csv')
df = df.drop(['text', 'lemmatized', 'lemmatized_clean'], axis=1)
df.groupby('label').mean().head()

Unnamed: 0_level_0,word_freq_std,word_freq_mean,cefr_A1,cefr_A2,cefr_B1,cefr_B2,cefr_C1
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
A1,0.000284,0.00017,30.559028,8.288194,4.597222,3.142361,1.75
A2,0.000325,0.000189,67.643382,20.150735,11.915441,8.742647,3.459559
B1,0.000367,0.000219,102.736585,41.473171,25.707317,23.253659,8.42439
B2,0.000357,0.000209,107.332168,51.674825,32.884615,33.482517,14.517483
C1,0.000328,0.00018,132.556017,76.406639,50.850622,52.912863,23.53112


In [73]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

X = df.drop('label', axis=1)
y = df['label']

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

# Initialize and train the XGBoost classifier
#classifier = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

# Make predictions
y_pred = classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)

print(f'Accuracy: {accuracy:.2f}')
print('Classification Report:')
print(report)

Accuracy: 0.57
Classification Report:
              precision    recall  f1-score   support

          A1       0.73      0.85      0.79        67
          A2       0.47      0.50      0.49        52
          B1       0.48      0.36      0.41        36
          B2       0.50      0.50      0.50        54
          C1       0.48      0.41      0.44        51
          C2       0.61      0.64      0.62        39

    accuracy                           0.57       299
   macro avg       0.55      0.54      0.54       299
weighted avg       0.56      0.57      0.56       299



In [74]:
import numpy as np

# Custom evaluation metrics
def custom_metrics(y_true, y_pred):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    correct = np.abs(y_true - y_pred) <= 1
    
    tp = np.sum(correct)
    fp = np.sum(~correct)
    fn = fp  # Since every incorrect prediction is a false positive and a false negative in this context

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    accuracy = tp / len(y_true)
    
    return accuracy, precision, recall, f1

metrics = custom_metrics(y_test, y_pred)
print(metrics)

(0.9163879598662207, 0.9163879598662207, 0.9163879598662207, 0.9163879598662207)


In [75]:
import joblib
# Save the model to a file
joblib.dump(classifier, 'cefr_classifier.joblib')
joblib.dump(label_encoder, 'cefr_label_encoder.joblib')

['cefr_label_encoder.joblib']