In [7]:
import os
import math
import json
import time
import random
import scipy
import regex as re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import datetime
# import warnings
# import spacy
from itertools import combinations


from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD, PCA

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, cohen_kappa_score, mean_squared_error, make_scorer
from sklearn.base import BaseEstimator, TransformerMixin
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

In [2]:
# Feature table written by the simple_feature_extraction script.
basic_feature_train_df = pd.read_csv('basic_feature_train_data.csv')

# Raw training set: tab-separated, decoded as latin-1.
raw_training_set = pd.read_csv(
    'asap-aes/training_set_rel3.tsv',
    sep='\t',
    encoding='latin1',
)


In [3]:
# Columns of basic_feature_train_df that are passed to the model as input features
# (main_gridsearch hands this list to data_selection, which selects exactly these columns).
# NOTE(review): the names must match the CSV headers exactly; 'syllabus_count' is
# presumably a syllable count, but renaming it here would break the column lookup.
feature_cols = ['word_count', 'sent_count', 'char_count', 'sent_length', 'spell_err_count', 'syllabus_count',
                'FleKin_score', 'DalCha_score', 'unique_word_count', 'neg', 'neu', 'pos', 'compound', 'cohesion', 'AoA_score']

def data_selection(basic_feature_train_df, feature_cols, essay_set, text_set):
    """Select the feature matrix and target scores for one experiment.

    Parameters
    ----------
    basic_feature_train_df : pandas.DataFrame
        Feature table. Must contain the columns 'essay_set', 'essay_id',
        'cohesion', 'text_set' and 'domain1_score', plus every name in
        ``feature_cols``. The frame is not modified.
    feature_cols : list of str
        Columns returned as the feature matrix.
    essay_set : scalar or list-like
        Essay-set id(s) to keep. A single scalar id is treated as a
        one-element list.
    text_set : {'subset', 'original'}
        'original' keeps rows whose 'text_set' column equals 'text_original';
        'subset' keeps all other rows.

    Returns
    -------
    X_train : pandas.DataFrame
        Selected rows restricted to ``feature_cols``.
    y_train : numpy.ndarray
        The 'domain1_score' values of the same rows.

    Raises
    ------
    ValueError
        If ``text_set`` is neither 'subset' nor 'original'.
    """
    # Fail fast: previously an unknown value skipped both branches and only
    # surfaced later as an UnboundLocalError on X_train.
    if text_set not in ('subset', 'original'):
        raise ValueError(
            f"text_set must be 'subset' or 'original', got {text_set!r}"
        )

    # Series.isin needs an iterable; accept a bare id for convenience.
    if np.isscalar(essay_set):
        essay_set = [essay_set]

    df = basic_feature_train_df
    keep = (
        df['essay_set'].isin(essay_set)
        & (df['essay_id'] != 10001)      # the text is "NO IMAGE"
        & df['cohesion'].notna()
    )

    is_original = df['text_set'] == 'text_original'
    keep &= is_original if text_set == 'original' else ~is_original

    X_train = df.loc[keep, feature_cols]
    y_train = np.array(df.loc[keep, 'domain1_score'])

    print('X_train:', X_train.shape)
    print('y_train:', y_train.shape)

    return X_train, y_train

In [4]:
def _json_default(obj):
    """Fallback used by json for objects it cannot encode natively (NumPy types)."""
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


def output_to_file(results_filename, grid_search, overall_params, param_grid):
    """Write the grid-search configuration and results to a JSON file.

    Parameters
    ----------
    results_filename : str
        Destination path. Missing parent directories are created.
    grid_search : object
        Anything exposing a ``cv_results_`` mapping (e.g. a fitted GridSearchCV).
    overall_params : dict
        Experiment-level settings, stored under the key 'overall_params'.
    param_grid : dict
        The searched grid, stored under the key 'param_grid'.

    The file holds one JSON object with the keys 'overall_params',
    'param_grid' and 'cv_results'.
    """
    # NumPy arrays are not JSON-serializable; convert them to plain lists.
    cv_results_serializable = {
        key: (value.tolist() if isinstance(value, np.ndarray) else value)
        for key, value in grid_search.cv_results_.items()
    }

    # Serialize before touching the filesystem so a failure here does not
    # leave an empty or truncated results file behind.
    payload = json.dumps(
        {
            'overall_params': overall_params,
            'param_grid': param_grid,
            'cv_results': cv_results_serializable,
        },
        default=_json_default,
    )

    parent_dir = os.path.dirname(results_filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(results_filename, 'w') as f:
        f.write(payload)

In [13]:
def main_gridsearch(text_set_, essay_set_, scorer='qwk'):
    """Grid-search a StandardScaler -> PCA -> LogisticRegression pipeline.

    Selects the training data with ``data_selection``, runs a 5-fold
    ``GridSearchCV``, writes the configuration and ``cv_results_`` to a
    timestamped JSON file under ``output/`` and returns a ranked summary.

    Parameters
    ----------
    text_set_ : {'subset', 'original'}
        Forwarded to ``data_selection``.
    essay_set_ : list
        Essay-set ids forwarded to ``data_selection``; also embedded in the
        output file name via ``str(essay_set_)``.
    scorer : {'qwk', 'mse', 'accuracy'}, default 'qwk'
        Metric used to rank candidates.

    Returns
    -------
    pandas.DataFrame
        One row per candidate, sorted by 'rank_test_score', with the searched
        parameters, 'mean_test_score' and 'rank_test_score'.

    Raises
    ------
    ValueError
        If ``scorer`` is not one of the supported names.
    """
    mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)  # Minimize MSE
    qwk_scorer = make_scorer(cohen_kappa_score, weights="quadratic", greater_is_better=True)  # Maximize QWK

    scorers = {'accuracy': 'accuracy',
               'qwk': qwk_scorer,
               'mse': mse_scorer}
    if scorer not in scorers:
        raise ValueError(f"scorer must be one of {sorted(scorers)}, got {scorer!r}")

    overall_params = {
        'model': 'Logistic Regression',
        'text_set': text_set_,      # 'subset', or 'original'
        'essay_set': essay_set_,    # e.g. [1] or [1, 3, 4, 5, 6]
        'scorer': scorer            # 'qwk', 'mse', or 'accuracy'
    }

    # Pipeline: scaling, PCA, then logistic regression.
    pipeline = Pipeline([
        ('scaler', StandardScaler()),                    # Step 1: Standardize features
        ('pca', PCA()),                                  # Step 2: Perform PCA
        ('logreg', LogisticRegression(random_state=42))  # Step 3: Logistic regression classifier
    ])

    # Define the parameter grid
    param_grid = {
        'pca__n_components': [0.9],                # Fraction of variance PCA must retain
        'logreg__penalty': ['l2'],                 # Regularization type
        'logreg__C': [0.01, 0.1, 1, 10, 100],      # Inverse of regularization strength
        'logreg__solver': ['lbfgs', 'saga'],       # Solvers to use (both support L2 penalty)
        'logreg__max_iter': [100, 200, 500, 1000]  # Max number of iterations for solver
    }

    X_train, y_train = data_selection(basic_feature_train_df, feature_cols,
                                      overall_params['essay_set'], overall_params['text_set'])
    # Transform labels to range [0, n_classes-1]
    y_train = LabelEncoder().fit_transform(y_train)

    start_time = time.perf_counter()

    # Perform grid search
    grid_search = GridSearchCV(pipeline, param_grid, cv=5,
                               scoring=scorers[overall_params['scorer']], verbose=1)
    grid_search.fit(X_train, y_train)

    # This interval covers the grid search, not feature extraction.
    print(f"Time taken for grid search: {(time.perf_counter() - start_time):.5f} seconds")

    ts = datetime.datetime.now().strftime("%Y%m%d_%H%M")
    results_filename = 'output/grid_search_'+overall_params['model']+'_'+overall_params['text_set']+'_essay'+str(essay_set_)+'_'+ts+'_.txt'
    # Make sure the target directory exists so the results are not lost
    # after a long search.
    os.makedirs(os.path.dirname(results_filename), exist_ok=True)
    output_to_file(results_filename, grid_search, overall_params, param_grid)

    print('output_generated:', results_filename)

    # Read the file back so the summary reflects exactly what was persisted.
    with open(results_filename, 'r') as f:
        saved = json.load(f)

    cv_results_df = pd.DataFrame(saved['cv_results'])
    summary_cols = ['param_pca__n_components',
                    'param_logreg__penalty', 'param_logreg__C',
                    'param_logreg__solver', 'param_logreg__max_iter',
                    'mean_test_score', 'rank_test_score']
    # Previously this expression was evaluated and discarded; return it so
    # the caller can display or collect it.
    return cv_results_df.sort_values('rank_test_score')[summary_cols]

In [14]:
# Experiment matrix: every text variant is crossed with every essay-set group.
text_set_selection = ['original', 'subset']
# Each entry is a list because data_selection filters with Series.isin;
# the last entry pools all five essay sets into one run.
essay_set_selection = [[1], [3], [4], [5], [6], [1, 3, 4, 5, 6]]

run_plan = [
    (text_set_, essay_set_)
    for text_set_ in text_set_selection
    for essay_set_ in essay_set_selection
]

for text_set_, essay_set_ in run_plan:
    print('#########', text_set_, essay_set_)
    main_gridsearch(text_set_, essay_set_)

######### original [1]
X_train: (1783, 15)
y_train: (1783,)
Fitting 5 folds for each of 40 candidates, totalling 200 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Time taken for basic statistical feature extraction: 13.47246 seconds
output_generated: output/grid_search_Logistic Regression_original_essay[1]_20241117_1959_.txt
######### subset [1]
X_train: (5346, 15)
y_train: (5346,)
Fitting 5 folds for each of 40 candidates, totalling 200 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Time taken for basic statistical feature extraction: 76.67940 seconds
output_generated: output/grid_search_Logistic Regression_subset_essay[1]_20241117_2000_.txt
