In [1]:
import os
import math
import json
import time
import random
import scipy
import regex as re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import datetime
# import warnings
# import spacy
from itertools import combinations


from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD, PCA

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, cohen_kappa_score, mean_squared_error, make_scorer, mean_absolute_error
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from IPython.display import clear_output
from xgboost import XGBClassifier
from xgboost import XGBRegressor

In [2]:
# Load the pre-computed essay feature tables for the train and test splits.
# NOTE(review): the train file is named "scaled_..." while the test file is
# named "basic_..." -- confirm both splits are on the same feature scale,
# because the pipelines below fit their StandardScaler on the train split only.
basic_feature_train_df = pd.read_csv("scaled_feature_train_data.csv")
test_data = pd.read_csv("basic_test_data.csv")


In [3]:
# Print the shape and column names of each loaded split, train first.
for split_title, split_df in (('Training data', basic_feature_train_df),
                              ('Test data', test_data)):
    print(split_title)
    print(split_df.shape)
    print(split_df.columns)

Training data
(28436, 25)
Index(['essay_id', 'essay_set', 'text_set', 'text', 'word_tokens',
       'sent_tokens', 'word_tokens_clean', 'word_count', 'sent_count',
       'char_count', 'sent_length', 'spell_err_count', 'syllabus_count',
       'FleKin_score', 'DalCha_score', 'unique_word_count', 'neg', 'neu',
       'pos', 'compound', 'cohesion', 'AoA_score', 'rescaled_score',
       'low_med_hi', 'low_med_hi_numeric'],
      dtype='object')
Test data
(7108, 25)
Index(['essay_id', 'essay_set', 'text_set', 'text', 'word_tokens',
       'sent_tokens', 'word_tokens_clean', 'word_count', 'sent_count',
       'char_count', 'sent_length', 'spell_err_count', 'syllabus_count',
       'FleKin_score', 'DalCha_score', 'unique_word_count', 'neg', 'neu',
       'pos', 'compound', 'cohesion', 'AoA_score', 'rescaled_score',
       'low_med_hi', 'low_med_hi_numeric'],
      dtype='object')


In [4]:
# Columns of the feature tables that are passed to the models as predictors
# (this list is handed to data_selection() as `feature_cols`).
feature_cols = [
    'word_count',
    'sent_count',
    'char_count',
    'sent_length',
    'spell_err_count',
    'syllabus_count',
    'FleKin_score',
    'DalCha_score',
    'unique_word_count',
    'neg',
    'neu',
    'pos',
    'compound',
    'cohesion',
    'AoA_score',
]

def data_selection(data, feature_cols, essay_set, text_set, is_test = False, is_regression = False, labelencoder = False):
    """Select the feature matrix and target vector for one experiment.

    A row is kept when its ``essay_set`` value is in ``essay_set`` and both its
    ``rescaled_score`` and ``cohesion`` values are present. For non-test data
    the row with ``essay_id == 10001`` is also dropped (its text is
    "NO IMAGE"). The remaining rows are then restricted by ``text_set``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the columns 'essay_set', 'rescaled_score', 'cohesion',
        'essay_id', 'text_set' and every column named in ``feature_cols``.
        It is not modified.
    feature_cols : list of str
        Columns returned as the feature matrix.
    essay_set : list
        Essay-set identifiers to keep.
    text_set : {'original', 'subset'}
        'original' keeps rows whose ``text_set`` column equals
        'text_original'; 'subset' keeps every other row.
    is_test : bool, default False
        When true, the essay_id 10001 row is NOT removed.
    is_regression : bool, default False
        When false, the target scores are rounded with ``numpy.round``;
        when true they are returned unrounded.
    labelencoder : bool, default False
        When true, the target is passed through a freshly fitted
        ``LabelEncoder``. The encoder is fitted on this call's labels only, so
        the value-to-index mapping of two separate calls (e.g. train and test)
        agrees only if both contain the same set of label values.

    Returns
    -------
    X : pandas.DataFrame
        The selected rows restricted to ``feature_cols``.
    y : numpy.ndarray
        The matching target values.

    Raises
    ------
    ValueError
        If ``text_set`` is neither 'original' nor 'subset'.
    """
    # Fail early with a clear message; previously an unknown value fell through
    # both branches and crashed later with an UnboundLocalError on X / y.
    if text_set not in ('subset', 'original'):
        raise ValueError(
            "text_set must be 'subset' or 'original', got {!r}".format(text_set)
        )

    # Boolean masks are combined instead of filtering a full copy of the frame
    # step by step; the resulting selection is the same.
    keep = (
        data['essay_set'].isin(essay_set)
        & data['rescaled_score'].notna()
        & data['cohesion'].notna()
    )
    if not is_test:
        keep = keep & (data['essay_id'] != 10001)  # the text is "NO IMAGE"

    is_original = data['text_set'] == 'text_original'
    keep = keep & (is_original if text_set == 'original' else ~is_original)

    X = data.loc[keep, feature_cols]
    y = np.array(data.loc[keep, 'rescaled_score'])
    if not is_regression:
        # Classification targets are the scores rounded to whole numbers.
        y = y.round()

    if labelencoder:
        y = LabelEncoder().fit_transform(y)

    print('X:', X.shape, 'y:', y.shape)

    return X, y

In [5]:
def test_randforest_classifier(text_set_, essay_set_):
    """Fit the Random Forest pipeline on the train split and score it on the test split.

    Parameters
    ----------
    text_set_ : str
        'subset' or 'original'; forwarded to ``data_selection``.
    essay_set_ : list
        Essay-set identifiers to include, e.g. ``[1, 3, 4, 5, 6]``.

    Returns
    -------
    y_pred : numpy.ndarray
        Predicted (rounded) scores for the selected test rows.
    qwk : float
        Quadratic-weighted Cohen's kappa between predictions and test targets.
    """
    overall_params = dict(model='Random Forest', text_set=text_set_, essay_set=essay_set_)
    print("#######", overall_params)

    train_X, train_y = data_selection(basic_feature_train_df, feature_cols, essay_set_, text_set_)
    test_X, test_y = data_selection(test_data, feature_cols, essay_set_, text_set_, is_test = True)

    forest = RandomForestClassifier(
        n_estimators = 150,
        max_depth = 20,
        min_samples_split = 8,
        min_samples_leaf = 2,
        random_state = 42,
    )
    model = Pipeline([
        ('scaler', StandardScaler()),      # standardize the features
        # a float n_components keeps as many components as needed to explain
        # that fraction (here 90%) of the variance -- not a fixed count
        ('pca', PCA(n_components=0.9)),
        ('rf', forest),
    ])

    # Fit on the train split, then predict the held-out rows.
    model.fit(train_X, train_y)
    predictions = model.predict(test_X)

    agreement = cohen_kappa_score(predictions, test_y, weights = 'quadratic')
    return predictions, agreement

In [6]:
def test_xgb_classifier(text_set_, essay_set_):
    """Fit the XGBoost classifier pipeline on the train split and score it on the test split.

    The classifier is trained on label-encoded class indices. A single
    ``LabelEncoder`` is fitted on the training targets and used to map the
    predicted indices back to the score scale before scoring. Previously the
    train and test targets were encoded by two independently fitted encoders,
    so whenever the two splits did not contain exactly the same set of scores
    the class indices referred to different scores and the kappa was computed
    on misaligned labels.

    Parameters
    ----------
    text_set_ : str
        'subset' or 'original'; forwarded to ``data_selection``.
    essay_set_ : list
        Essay-set identifiers to include, e.g. ``[1, 3, 4, 5, 6]``.

    Returns
    -------
    y_pred : numpy.ndarray
        Predicted scores for the selected test rows, on the same (rounded)
        score scale as the test targets.
    qwk : float
        Quadratic-weighted Cohen's kappa between predictions and test targets.
    """
    overall_params = {
        'model': 'XGB Classifier',
        'text_set': text_set_,      # 'subset', or 'original'
        'essay_set': essay_set_,          # [1, 3, 4, 5, 6]
        }

    print("#######", overall_params)

    # Fetch the raw rounded scores for both splits; encoding is done below with
    # ONE encoder so that train and test share the same score <-> index mapping.
    X_train, y_train_scores = data_selection(basic_feature_train_df, feature_cols, overall_params['essay_set'], overall_params['text_set'], is_test = False, is_regression = False, labelencoder = False)
    X_test, y_test = data_selection(test_data, feature_cols, overall_params['essay_set'], overall_params['text_set'], is_test = True, is_regression = False, labelencoder = False)

    # The classifier is trained on consecutive class indices 0..k-1.
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train_scores)

    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Step 1: Standardize features
        ('pca', PCA(n_components=0.9)),  # Step 2: PCA keeping 90% of the variance
        # NOTE(review): no eval_set is passed to fit(), so eval_metric is
        # presumably never evaluated here -- confirm whether it is needed.
        ('xgb', XGBClassifier(eval_metric=mean_absolute_error,
                                random_state=45,
                                colsample_bytree = 0.8,
                                gamma = 1,
                                learning_rate = 0.3,
                                max_depth = 3,
                                n_estimators = 100))  # Step 3: XGBoost classifier
                                ])

    # Train the pipeline
    pipeline.fit(X_train, y_train)

    # Predict class indices and translate them back to scores with the encoder
    # fitted on the training targets.
    y_pred = label_encoder.inverse_transform(pipeline.predict(X_test))

    qwk  = cohen_kappa_score(y_pred, y_test, weights = 'quadratic')

    return y_pred, qwk


In [7]:
def test_xgb_regressor(text_set_, essay_set_):
    """Fit the XGBoost regressor pipeline on the train split and score it on the test split.

    The regression target is the ordinal index of each (unrounded) score as
    assigned by a ``LabelEncoder``. One encoder, fitted on the score values of
    both splits, is used for train and test. Previously each split was encoded
    by its own independently fitted encoder, so the same index could stand for
    different scores in train and test whenever the two splits did not contain
    exactly the same set of score values. Only the set of distinct score values
    is shared between the splits; no test features or test row labels reach the
    model.

    Parameters
    ----------
    text_set_ : str
        'subset' or 'original'; forwarded to ``data_selection``.
    essay_set_ : list
        Essay-set identifiers to include, e.g. ``[1, 3, 4, 5, 6]``.

    Returns
    -------
    y_pred : numpy.ndarray
        Predictions rounded to integers, in the encoded index space.
    qwk : float
        Quadratic-weighted Cohen's kappa between predictions and the encoded
        test targets.
    """
    overall_params = {
        'model': 'XGB Regressor',
        'text_set': text_set_,      # 'subset', or 'original'
        'essay_set': essay_set_,          # [1, 3, 4, 5, 6]
        }

    print("#######", overall_params)

    # Fetch the raw scores for both splits; encoding is done below with ONE
    # encoder so that train and test share the same score -> index mapping.
    X_train, y_train_scores = data_selection(basic_feature_train_df, feature_cols, overall_params['essay_set'], overall_params['text_set'], is_test = False, is_regression = True, labelencoder = False)
    X_test, y_test_scores = data_selection(test_data, feature_cols, overall_params['essay_set'], overall_params['text_set'], is_test = True, is_regression = True, labelencoder = False)

    # Fit on the union of score values so a value present in only one split
    # still gets a consistent index (and transform() cannot hit unseen labels).
    label_encoder = LabelEncoder().fit(np.concatenate([y_train_scores, y_test_scores]))
    y_train = label_encoder.transform(y_train_scores)
    y_test = label_encoder.transform(y_test_scores)

    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Step 1: Standardize features
        ('pca', PCA(n_components=0.9)),  # Step 2: PCA keeping 90% of the variance
        ('xgb', XGBRegressor(random_state=42,
                            gamma=1, learning_rate=0.2, max_depth=2, n_estimators=100, reg_alpha=0.1, reg_lambda=1))
                                ])

    # Train the pipeline
    pipeline.fit(X_train, y_train)

    # Test the pipeline; continuous predictions are rounded to the nearest
    # index so they can be compared with the discrete targets.
    y_pred = pipeline.predict(X_test)
    y_pred = y_pred.round().astype(int)
    qwk  = cohen_kappa_score(y_pred, y_test, weights = 'quadratic')

    return y_pred, qwk


In [17]:
# Evaluate every model on every (text_set, essay_set) combination.
# Results are collected in three parallel lists: entry i of each list belongs
# to the same run.
ypred_ls = []
qwk_ls = []
label_ls = []


text_set_selection = ['original', 'subset']
essay_set_selection = [[1], [3], [4], [5], [6], [1, 3, 4, 5, 6]]

# (label suffix, evaluation function) for each model, in reporting order.
# The ';' in the suffix separates the model tag from the essay-set part of the
# label and is what the results table later splits on.
model_runs = [
    ('_rfc;', test_randforest_classifier),
    ('_xgbcla;', test_xgb_classifier),
    ('_xgbreg;', test_xgb_regressor),
]

for label_suffix, evaluate in model_runs:
    for text_set_ in text_set_selection:
        for essay_set_ in essay_set_selection:
            y_pred, qwk = evaluate(text_set_, essay_set_)
            ypred_ls.append(y_pred)
            qwk_ls.append(qwk)
            label_ls.append(text_set_ + label_suffix + str(essay_set_))
        

####### {'model': 'Random Forest', 'text_set': 'original', 'essay_set': [1]}
X: (1426, 15) y: (1426,)
X: (357, 15) y: (357,)
####### {'model': 'Random Forest', 'text_set': 'original', 'essay_set': [3]}
X: (1381, 15) y: (1381,)
X: (344, 15) y: (344,)
####### {'model': 'Random Forest', 'text_set': 'original', 'essay_set': [4]}
X: (1416, 15) y: (1416,)
X: (352, 15) y: (352,)
####### {'model': 'Random Forest', 'text_set': 'original', 'essay_set': [5]}
X: (1443, 15) y: (1443,)
X: (361, 15) y: (361,)
####### {'model': 'Random Forest', 'text_set': 'original', 'essay_set': [6]}
X: (1440, 15) y: (1440,)
X: (358, 15) y: (358,)
####### {'model': 'Random Forest', 'text_set': 'original', 'essay_set': [1, 3, 4, 5, 6]}
X: (7106, 15) y: (7106,)
X: (1772, 15) y: (1772,)
####### {'model': 'Random Forest', 'text_set': 'subset', 'essay_set': [1]}
X: (4270, 15) y: (4270,)
X: (1069, 15) y: (1069,)
####### {'model': 'Random Forest', 'text_set': 'subset', 'essay_set': [3]}
X: (4075, 15) y: (4075,)
X: (1011, 1

In [35]:
# Tabulate the QWK of every run. Each entry of label_ls has the form
# '<text_set>_<model>;<essay_set>', so splitting on ';' yields the run label
# and the essay-set description as separate columns.
label_parts = pd.Series(label_ls).str.split(';', expand=True)
output_df = pd.DataFrame({
    'label': label_parts[0],
    'essay_set': label_parts[1],
    'qwk': qwk_ls,
})
output_df

Unnamed: 0,label,essay_set,qwk
0,original_rfc,[1],0.741647
1,original_rfc,[3],0.632297
2,original_rfc,[4],0.696754
3,original_rfc,[5],0.74657
4,original_rfc,[6],0.664556
5,original_rfc,"[1, 3, 4, 5, 6]",0.683907
6,subset_rfc,[1],0.712643
7,subset_rfc,[3],0.616641
8,subset_rfc,[4],0.654456
9,subset_rfc,[5],0.739422


In [37]:
# Persist the QWK table; the row index is written as the first (unnamed) column.
qwk_csv_path = 'test_basic_features_qwk.csv'
output_df.to_csv(qwk_csv_path)

In [38]:
# Map each run label to its prediction array.
data_dict = dict(zip(label_ls, ypred_ls))

# The runs have different numbers of test rows; wrapping each array in a Series
# lets pandas align them by position and pad the shorter columns with NaN.
ypred_columns = {run_label: pd.Series(run_preds) for run_label, run_preds in data_dict.items()}
ypred_df = pd.DataFrame(ypred_columns)
ypred_df.to_csv('test_basic_features_ypred.csv')