In [None]:
import pandas as pd
import csv

In [None]:
!pip install nltk spacy textstat scikit-learn
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import pandas as pd
import nltk
import spacy
import textstat
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Download NLTK data used by the feature extractors below.
nltk.download('punkt')
# Newer NLTK releases resolve word_tokenize() against the 'punkt_tab' resource
# rather than the pickled 'punkt' models. NLTK is installed unpinned above, so
# fetch both to keep the notebook runnable on either generation.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')

# Load the small English spaCy pipeline; `nlp` is used by noun_chunks().
nlp = spacy.load('en_core_web_sm')

# Define feature extraction functions
def word_count(text):
    """Return how many tokens ``nltk.word_tokenize`` produces for ``text``."""
    tokens = nltk.word_tokenize(text)
    return len(tokens)

def syllable_count(text):
    """Return the syllable total that textstat reports for ``text``."""
    total_syllables = textstat.syllable_count(text)
    return total_syllables

def character_count(text):
    """Return the length of ``text`` in characters (whitespace included)."""
    n_chars = len(text)
    return n_chars

def complex_word_count(text):
    """Return the number of complex ("difficult") words in ``text``.

    The count is taken directly from ``textstat.difficult_words``.

    The previous implementation returned
    ``lexicon_count(text) - difficult_words(text)``, i.e. the number of words
    that are *not* difficult, which is the opposite of what this function and
    the 'Complex Word Count' feature column are named for.

    Args:
        text: The passage to analyse.

    Returns:
        The difficult-word count reported by textstat.
    """
    return textstat.difficult_words(text)

def vocab_size(text):
    """Return the number of distinct tokens ``nltk.word_tokenize`` finds in ``text``."""
    distinct_tokens = {token for token in nltk.word_tokenize(text)}
    return len(distinct_tokens)

def lexical_diversity(text):
    """Return the type/token ratio of ``text``.

    The ratio is the number of distinct tokens divided by the total number of
    tokens, both taken from ``nltk.word_tokenize``.

    Args:
        text: The passage to analyse.

    Returns:
        A float in [0, 1]. Text that yields no tokens (empty or
        whitespace-only) returns 0.0 instead of raising ZeroDivisionError.
    """
    words = nltk.word_tokenize(text)
    if not words:
        # No tokens: the ratio is undefined, so report zero diversity.
        return 0.0
    return len(set(words)) / len(words)

def noun_chunks(text):
    """Return how many noun chunks the loaded spaCy pipeline finds in ``text``."""
    parsed = nlp(text)
    return sum(1 for _ in parsed.noun_chunks)

def flesch_kincaid_score(text):
    """Return textstat's Flesch-Kincaid grade for ``text``."""
    grade = textstat.flesch_kincaid_grade(text)
    return grade

def dale_chall_score(text):
    """Return textstat's Dale-Chall readability score for ``text``."""
    score = textstat.dale_chall_readability_score(text)
    return score

def gunning_fog_index(text):
    """Return textstat's Gunning fog value for ``text``."""
    fog = textstat.gunning_fog(text)
    return fog

def coleman_liau_index(text):
    """Return textstat's Coleman-Liau index for ``text``."""
    index_value = textstat.coleman_liau_index(text)
    return index_value

def automated_readability_index(text):
    """Return textstat's automated readability index (ARI) for ``text``."""
    ari = textstat.automated_readability_index(text)
    return ari

# create df with features
def extract_features(df, text_column):
    """Build a feature table from the text column of ``df``.

    Each extractor below is applied row by row to ``df[text_column]`` and its
    result stored under the paired column name. Columns are added in the order
    listed, and the returned frame carries the same index as ``df``.

    Args:
        df: DataFrame that holds the raw texts.
        text_column: Name of the column in ``df`` containing the text.

    Returns:
        A new DataFrame with one column per feature.
    """
    # Column name -> per-text extractor, in output column order.
    extractors = {
        'Word Count': word_count,
        'Syllable Count': syllable_count,
        'Character Count': character_count,
        'Complex Word Count': complex_word_count,
        'Vocab Size': vocab_size,
        'Lexical Diversity': lexical_diversity,
        'Noun Chunks': noun_chunks,
        'Flesch Kincaid Score': flesch_kincaid_score,
        'Dale Chall Score': dale_chall_score,
        'Gunning Fog Index': gunning_fog_index,
        'Coleman Liau Index': coleman_liau_index,
        'Automated Readability Index': automated_readability_index,
    }

    texts = df[text_column]
    features = pd.DataFrame()
    for column_name, extractor in extractors.items():
        features[column_name] = texts.apply(extractor)
    return features


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# Load the labelled texts, derive zero-based labels from 'cefr_numeric',
# extract the hand-crafted features and train / evaluate a logistic-regression
# baseline. Later cells reuse X_train_imputed, X_test_imputed, y_train, y_test.
from sklearn.impute import SimpleImputer

balanced_data = pd.read_csv('/content/efcamdat_sub.csv')

# Optional: uncomment to work on a reproducible 15k-row subsample.
#balanced_data = balanced_data.sample(n=15000, random_state=42)

# Class distribution of the raw levels.
print(balanced_data['cefr_numeric'].value_counts())

# Shift the levels down by one so labels start at 0, stored as a categorical.
balanced_data['label'] = (balanced_data['cefr_numeric'] - 1).astype('category')
print(balanced_data['label'].cat.categories)

features = extract_features(balanced_data, 'text')

# Join features and labels on their shared index. Resetting only the label's
# index (as this cell did before) pairs features with the wrong labels whenever
# balanced_data does not carry a default RangeIndex, e.g. after enabling the
# .sample() line above.
data_with_features = pd.concat([features, balanced_data['label']], axis=1)

# Drop rows with missing values in the label column
data_with_features = data_with_features.dropna(subset=['label'])

X = data_with_features.drop(columns=['label'])
y = data_with_features['label']
# Stratify so every class keeps its share in both splits; the class counts
# printed above are heavily skewed and a plain random split can starve the
# test set of the rarest levels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Handle missing values using imputer for features only. The imputer is fitted
# on the training split and merely applied to the test split, so no test
# statistics leak into training.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Verify the lengths of the training and testing sets
print(f"X_train_imputed length: {len(X_train_imputed)}, y_train length: {len(y_train)}")
print(f"X_test_imputed length: {len(X_test_imputed)}, y_test length: {len(y_test)}")

# Train logistic regression model
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_imputed, y_train)

# Predict on test set
y_pred = log_reg.predict(X_test_imputed)

# Print classification report
print(classification_report(y_test, y_pred))

cefr_numeric
3    100000
2    100000
1    100000
4     61329
5     14698
6      1940
Name: count, dtype: int64
Index([0, 1, 2, 3, 4, 5], dtype='int64')


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Fit a linear-kernel support vector classifier, with probability estimates
# enabled, on the imputed training features.
# NOTE(review): libsvm-based SVC training can be very slow on large training
# sets, and probability=True adds extra fitting work; confirm the runtime is
# acceptable for this dataset.
svm_model = SVC(kernel='linear', probability=True).fit(X_train_imputed, y_train)

# Label the held-out rows with the fitted SVM.
y_pred_svm = svm_model.predict(X_test_imputed)

# Show the per-class metrics under a heading.
print("SVM Classification Report:", classification_report(y_test, y_pred_svm), sep="\n")

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the imputed training features.
gnb_model = GaussianNB().fit(X_train_imputed, y_train)

# Label the held-out rows with the fitted model.
y_pred_gnb = gnb_model.predict(X_test_imputed)

# Show the per-class metrics under a heading.
gnb_report = classification_report(y_test, y_pred_gnb)
print("\n".join(["Gaussian Naive Bayes Classification Report:", gnb_report]))