In [7]:
import os
import math
import json
import time
import random
import scipy
import regex as re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import datetime
# import warnings
# import spacy
from itertools import combinations


from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD, PCA

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, cohen_kappa_score, mean_squared_error, make_scorer
from sklearn.base import BaseEstimator, TransformerMixin
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder

In [2]:
# Intermediate feature table written by the simple_feature_extraction script.
basic_feature_train_df = pd.read_csv('basic_feature_train_data.csv')

# Raw training essays: a tab-separated file decoded as latin1.
raw_training_set = pd.read_csv(
    'asap-aes/training_set_rel3.tsv',
    sep='\t',
    encoding='latin1',
)


In [3]:
# Columns of the feature table that are used as model inputs, in the order
# they appear in X. The names must match the CSV headers exactly.
feature_cols = [
    'word_count',
    'sent_count',
    'char_count',
    'sent_length',
    'spell_err_count',
    'syllabus_count',
    'FleKin_score',
    'DalCha_score',
    'unique_word_count',
    'neg',
    'neu',
    'pos',
    'compound',
    'cohesion',
    'AoA_score',
]

def data_selection(basic_feature_train_df, feature_cols, essay_set, text_set):
    """Select the feature matrix and score vector for the requested essays.

    A row is kept when its ``essay_set`` is one of ``essay_set``, its
    ``essay_id`` is not 10001 (that essay's text is "NO IMAGE"), and its
    ``cohesion`` value is not missing. The surviving rows are then split on
    the ``text_set`` column.

    Parameters
    ----------
    basic_feature_train_df : pandas.DataFrame
        Feature table. Must contain the columns ``essay_set``, ``essay_id``,
        ``cohesion``, ``text_set``, ``domain1_score`` and every name in
        ``feature_cols``. The frame is not modified.
    feature_cols : list of str
        Columns returned as the feature matrix.
    essay_set : list-like
        Essay-set ids to keep (passed to ``Series.isin``).
    text_set : {'original', 'subset'}
        ``'original'`` keeps rows whose ``text_set`` equals
        ``'text_original'``; ``'subset'`` keeps every other row.

    Returns
    -------
    X_train : pandas.DataFrame
        The selected rows restricted to ``feature_cols``.
    y_train : numpy.ndarray
        The ``domain1_score`` values of the same rows.

    Raises
    ------
    ValueError
        If ``text_set`` is neither ``'original'`` nor ``'subset'``.
    """
    # Fail fast with a clear message; previously an unknown value fell through
    # both branches and surfaced later as an UnboundLocalError.
    if text_set not in ('subset', 'original'):
        raise ValueError(f"text_set must be 'subset' or 'original', got {text_set!r}")

    keep = (
        basic_feature_train_df['essay_set'].isin(essay_set)
        & (basic_feature_train_df['essay_id'] != 10001)  # the text is "NO IMAGE"
        & basic_feature_train_df['cohesion'].notna()
    )
    # Boolean indexing already returns a new frame, so the caller's frame is untouched.
    df = basic_feature_train_df[keep]

    is_original = df['text_set'] == 'text_original'
    rows = is_original if text_set == 'original' else ~is_original

    X_train = df.loc[rows, feature_cols]
    y_train = np.array(df.loc[rows, 'domain1_score'])

    print('X_train:', X_train.shape)
    print('y_train:', y_train.shape)

    return X_train, y_train

In [4]:
def output_to_file(results_filename, grid_search, overall_params, param_grid):
    """Write a grid-search run to ``results_filename`` as a single JSON object.

    The JSON object has three keys: ``'overall_params'``, ``'param_grid'`` and
    ``'cv_results'``. NumPy arrays in ``grid_search.cv_results_`` are converted
    to plain lists first.

    Parameters
    ----------
    results_filename : str or os.PathLike
        Destination file. Missing parent directories are created.
    grid_search : object
        Any object exposing a ``cv_results_`` mapping.
    overall_params : dict
        Run-level settings stored verbatim.
    param_grid : dict
        The searched parameter grid, stored verbatim.

    Raises
    ------
    TypeError
        If a value cannot be represented in JSON.
    """
    def _jsonable(value):
        # Fallback used by json for objects it cannot encode natively, e.g.
        # NumPy integers or arrays nested inside param_grid / params.
        if isinstance(value, np.ndarray):
            return value.tolist()
        if isinstance(value, np.generic):
            return value.item()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    # cv_results_ values that are arrays (including masked arrays) become lists.
    cv_results_serializable = {key: (value.tolist() if isinstance(value, np.ndarray) else value)
                               for key, value in grid_search.cv_results_.items()}

    # Serialize before touching the filesystem so that a serialization error
    # cannot leave a truncated or empty results file behind.
    serialized = json.dumps({
        'overall_params': overall_params,
        'param_grid': param_grid,
        'cv_results': cv_results_serializable
    }, default=_jsonable)

    parent_dir = os.path.dirname(results_filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(results_filename, 'w') as f:
        f.write(serialized)

In [5]:
def main_gridsearch(text_set_, essay_set_):
    """Grid-search a StandardScaler -> PCA -> XGBClassifier pipeline and save the results.

    Training data is taken from the module-level ``basic_feature_train_df``
    and ``feature_cols`` via ``data_selection``. The search uses 5-fold
    cross-validation scored with quadratic-weighted Cohen's kappa. Results are
    written with ``output_to_file`` to a timestamped file under ``output/``
    and then read back from that file.

    Parameters
    ----------
    text_set_ : str
        ``'subset'`` or ``'original'``; forwarded to ``data_selection``.
    essay_set_ : list
        Essay-set ids to train on, e.g. ``[1]`` or ``[1, 3, 4, 5, 6]``.

    Returns
    -------
    pandas.DataFrame
        The cross-validation results reloaded from the saved file, sorted by
        ``rank_test_score`` and limited to the parameter and score columns.
    """
    mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)  # Minimize MSE
    qwk_scorer = make_scorer(cohen_kappa_score, weights="quadratic", greater_is_better=True)  # Maximize QWK

    overall_params = {
        'model': 'XGB',
        'text_set': text_set_,      # 'subset', or 'original'
        'essay_set': essay_set_,    # e.g. [1, 3, 4, 5, 6]
        'scorer': 'qwk'             # key into `scorers`: 'qwk', 'mse', or 'accuracy'
    }

    scorers = {'accuracy': 'accuracy',
               'qwk': qwk_scorer,
               'mse': mse_scorer}

    # Pipeline: scaling, then PCA, then the XGBoost classifier.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),  # Step 1: Standardize features
        ('pca', PCA()),                # Step 2: Perform PCA
        ('xgb', XGBClassifier(eval_metric='logloss', random_state=42))  # Step 3: XGBoost model
    ])

    # Parameter grid searched by GridSearchCV.
    param_grid = {
        'pca__n_components': [0.9],                   # A float here is the fraction of variance PCA must retain
        'xgb__n_estimators': [50, 100, 200],          # Number of trees
        'xgb__max_depth': [3, 10, 20],                # Depth of trees
        'xgb__learning_rate': [0.01, 0.1, 0.3],       # Learning rate
        'xgb__subsample': [0.8, 1.0],                 # Fraction of samples used per tree
        'xgb__colsample_bytree': [0.8, 1.0]           # Fraction of features used per tree
    }

    X_train, y_train = data_selection(basic_feature_train_df, feature_cols, overall_params['essay_set'], overall_params['text_set'])

    # XGBClassifier rejects labels that are not exactly 0..n_classes-1, so the
    # raw scores are re-encoded to that range.
    # NOTE(review): the encoding is fitted on the full training set. If a CV
    # training fold happens to miss a rare score entirely, that fold may still
    # see non-contiguous labels and fail -- confirm no candidate reports NaN.
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)

    # Make sure the destination exists before the long-running search, so the
    # results cannot be lost to a missing directory afterwards.
    output_dir = 'output'
    os.makedirs(output_dir, exist_ok=True)

    start_time = time.time()

    # Perform grid search
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=scorers[overall_params['scorer']], verbose=1)
    grid_search.fit(X_train, y_train)

    print(f"Time taken for grid search: {(time.time() - start_time):.5f} seconds")

    ts = datetime.datetime.now().strftime("%Y%m%d_%H%M")
    results_filename = output_dir+'/grid_search_'+overall_params['model']+'_'+overall_params['text_set']+'_essay'+str(essay_set_)+'_'+ts+'_.txt'
    output_to_file(results_filename, grid_search, overall_params, param_grid)

    print('output_generated:', results_filename)

    # Read the file back so the returned table reflects exactly what was saved.
    with open(results_filename, 'r') as f:
        data = json.load(f)

    cv_results_df = pd.DataFrame(data['cv_results'])
    summary_cols = ['param_pca__n_components',
                    'param_xgb__n_estimators', 'param_xgb__max_depth', 'param_xgb__learning_rate',
                    'param_xgb__subsample', 'param_xgb__colsample_bytree',
                    'mean_test_score', 'rank_test_score']

    # Previously this expression was evaluated and discarded; return it so the
    # caller can display or collect the ranked results.
    return cv_results_df.sort_values('rank_test_score')[summary_cols]

In [6]:
text_set_selection = ['original', 'subset']
# Every entry is a list (even a single essay set) because data_selection
# filters with `.isin(essay_set)`.
essay_set_selection = [[1], [3], [4], [5], [6], [1, 3, 4, 5, 6]]

# Every (text set, essay set) combination, text set varying slowest.
run_configs = [
    (text_choice, essay_choice)
    for text_choice in text_set_selection
    for essay_choice in essay_set_selection
]

for text_set_, essay_set_ in run_configs:
    print('#########', text_set_, essay_set_)
    main_gridsearch(text_set_, essay_set_)

######### original [1]
X_train: (1783, 15)
y_train: (1783,)
Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV 1/5] END pca__n_components=0.9, xgb__colsample_bytree=0.8, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__n_estimators=50, xgb__subsample=0.8;, score=nan total time=   0.0s
[CV 2/5] END pca__n_components=0.9, xgb__colsample_bytree=0.8, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__n_estimators=50, xgb__subsample=0.8;, score=nan total time=   0.0s
[CV 3/5] END pca__n_components=0.9, xgb__colsample_bytree=0.8, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__n_estimators=50, xgb__subsample=0.8;, score=nan total time=   0.0s
[CV 4/5] END pca__n_components=0.9, xgb__colsample_bytree=0.8, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__n_estimators=50, xgb__subsample=0.8;, score=nan total time=   0.0s
[CV 5/5] END pca__n_components=0.9, xgb__colsample_bytree=0.8, xgb__learning_rate=0.01, xgb__max_depth=3, xgb__n_estimators=50, xgb__subsample=0.8;, score=nan total 



[CV 3/5] END pca__n_components=0.9, xgb__colsample_bytree=0.8, xgb__learning_rate=0.01, xgb__max_depth=20, xgb__n_estimators=200, xgb__subsample=1.0;, score=nan total time=   0.0s
[CV 4/5] END pca__n_components=0.9, xgb__colsample_bytree=0.8, xgb__learning_rate=0.01, xgb__max_depth=20, xgb__n_estimators=200, xgb__subsample=1.0;, score=nan total time=   0.0s
[CV 5/5] END pca__n_components=0.9, xgb__colsample_bytree=0.8, xgb__learning_rate=0.01, xgb__max_depth=20, xgb__n_estimators=200, xgb__subsample=1.0;, score=nan total time=   0.0s
[CV 1/5] END pca__n_components=0.9, xgb__colsample_bytree=0.8, xgb__learning_rate=0.1, xgb__max_depth=3, xgb__n_estimators=50, xgb__subsample=0.8;, score=nan total time=   0.0s
[CV 2/5] END pca__n_components=0.9, xgb__colsample_bytree=0.8, xgb__learning_rate=0.1, xgb__max_depth=3, xgb__n_estimators=50, xgb__subsample=0.8;, score=nan total time=   0.0s
[CV 3/5] END pca__n_components=0.9, xgb__colsample_bytree=0.8, xgb__learning_rate=0.1, xgb__max_depth=3, x

ValueError: 
All the 540 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
432 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/yli/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/yli/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/yli/Library/Python/3.9/lib/python/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/Users/yli/Library/Python/3.9/lib/python/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/Users/yli/Library/Python/3.9/lib/python/site-packages/xgboost/sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [ 0  1  2  3  4  5  6  7  8  9 10], got [ 2  3  4  5  6  7  8  9 10 11 12]

--------------------------------------------------------------------------------
108 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/yli/Library/Python/3.9/lib/python/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/yli/Library/Python/3.9/lib/python/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/Users/yli/Library/Python/3.9/lib/python/site-packages/sklearn/pipeline.py", line 473, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/Users/yli/Library/Python/3.9/lib/python/site-packages/xgboost/core.py", line 726, in inner_f
    return func(**kwargs)
  File "/Users/yli/Library/Python/3.9/lib/python/site-packages/xgboost/sklearn.py", line 1491, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3 4 5 6 7 8 9], got [ 2  4  5  6  7  8  9 10 11 12]


X_train: (1800, 15)
y_train: (1800,)
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Time taken for basic statistical feature extraction: 7.63527 seconds
output_generated: output/grid_search_XGB_original_essay[1]_20241117_0951_.txt


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_pca__n_components,param_xgb__n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.236456,0.00874,0.001773,0.000123,0.9,50,"{'pca__n_components': 0.9, 'xgb__n_estimators'...",0.651176,0.661786,0.652038,0.643137,0.697945,0.661216,0.019294,1
1,0.434588,0.001843,0.001955,6.1e-05,0.9,100,"{'pca__n_components': 0.9, 'xgb__n_estimators'...",0.668541,0.66344,0.656832,0.634253,0.671501,0.658913,0.013298,2
2,0.800569,0.003453,0.002874,0.000195,0.9,200,"{'pca__n_components': 0.9, 'xgb__n_estimators'...",0.655222,0.650218,0.660524,0.629317,0.66528,0.652112,0.012466,3


In [None]:
# `cv_results_df` is a local variable of main_gridsearch, so it is not defined
# at notebook level on a fresh kernel. Rebuild it from the most recently
# written results file instead.
results_dir = 'output'
saved_runs = sorted(
    (os.path.join(results_dir, name)
     for name in os.listdir(results_dir)
     if name.startswith('grid_search_') and name.endswith('_.txt')),
    key=os.path.getmtime,
)
if not saved_runs:
    raise FileNotFoundError(f"no grid_search_*_.txt result files found in {results_dir!r}")

with open(saved_runs[-1], 'r') as f:
    cv_results_df = pd.DataFrame(json.load(f)['cv_results'])

cv_results_df.sort_values('rank_test_score')