In [6]:
import os
import math
import json
import time
import random
import scipy
import regex as re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import datetime
# import warnings
# import spacy
from itertools import combinations


from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD, PCA

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, cohen_kappa_score, mean_squared_error, make_scorer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the intermediate feature table written by the simple_feature_extraction script.
basic_feature_train_df = pd.read_csv(
    'basic_feature_train_data.csv',
)

# Load the raw training file (tab-separated, latin1-encoded).
raw_training_set = pd.read_csv(
    'asap-aes/training_set_rel3.tsv',
    sep='\t',
    encoding='latin1',
)


In [3]:
# Columns of the basic-feature table used as model inputs; this list is passed
# to data_selection() as `feature_cols`.
feature_cols = [
    'word_count',
    'sent_count',
    'char_count',
    'sent_length',
    'spell_err_count',
    'syllabus_count',
    'FleKin_score',
    'DalCha_score',
    'unique_word_count',
    'neg',
    'neu',
    'pos',
    'compound',
    'cohesion',
    'AoA_score',
]

def data_selection(basic_feature_train_df, feature_cols, essay_set, text_set):
    """Select the training rows and split them into features and target.

    Rows are kept when their ``essay_set`` is in ``essay_set``, their
    ``essay_id`` is not 10001 (the text is "NO IMAGE") and ``cohesion`` is not
    missing. The ``text_set`` column then decides which of the remaining rows
    are returned.

    Parameters
    ----------
    basic_feature_train_df : pandas.DataFrame
        Feature table; must contain ``essay_set``, ``essay_id``, ``cohesion``,
        ``text_set``, ``domain1_score`` and every column in ``feature_cols``.
        It is not modified.
    feature_cols : list of str
        Columns returned as the feature matrix.
    essay_set : list-like
        Essay-set identifiers to keep.
    text_set : {'subset', 'original'}
        ``'original'`` keeps rows whose ``text_set`` equals
        ``'text_original'``; ``'subset'`` keeps all other rows.

    Returns
    -------
    X_train : pandas.DataFrame
        The selected rows restricted to ``feature_cols``.
    y_train : numpy.ndarray
        ``domain1_score`` of the selected rows.

    Raises
    ------
    ValueError
        If ``text_set`` is neither ``'subset'`` nor ``'original'``.
    """
    # Fail early with a clear message; previously an unknown value fell through
    # both branches and crashed later with an UnboundLocalError.
    if text_set not in ('subset', 'original'):
        raise ValueError(
            f"text_set must be 'subset' or 'original', got {text_set!r}"
        )

    df = basic_feature_train_df  # only read below, so no defensive copy is needed

    keep = (
        df['essay_set'].isin(essay_set)
        & (df['essay_id'] != 10001)  # the text is "NO IMAGE"
        & df['cohesion'].notna()
    )

    if text_set == 'original':
        in_text_set = df['text_set'] == 'text_original'
    else:
        in_text_set = df['text_set'] != 'text_original'

    # One mask shared by X and y guarantees the two stay row-aligned.
    rows = keep & in_text_set
    X_train = df.loc[rows, feature_cols]
    y_train = np.array(df.loc[rows, 'domain1_score'])

    print('X_train:', X_train.shape)
    print('y_train:', y_train.shape)

    return X_train, y_train

In [4]:
def output_to_file(results_filename, grid_search, overall_params, param_grid):
    """Write the run configuration and grid-search results to a JSON file.

    The file holds a single JSON object with the keys ``overall_params``,
    ``param_grid`` and ``cv_results``.

    Parameters
    ----------
    results_filename : str or path-like
        Destination file. Missing parent directories are created.
    grid_search : object
        Object exposing a ``cv_results_`` mapping (as read below).
    overall_params : dict
        Run-level settings to store alongside the results.
    param_grid : dict
        The parameter grid to store alongside the results.

    Raises
    ------
    TypeError
        If a value cannot be represented in JSON.
    """
    def _to_builtin(value):
        # Fallback used by json only for objects it cannot encode itself;
        # lets NumPy scalars/arrays inside the two dicts be saved as well.
        if isinstance(value, np.ndarray):
            return value.tolist()
        if isinstance(value, np.generic):
            return value.item()
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable"
        )

    # cv_results_ stores its columns as NumPy arrays; turn them into plain lists.
    cv_results_serializable = {key: (value.tolist() if isinstance(value, np.ndarray) else value)
                               for key, value in grid_search.cv_results_.items()}

    # Serialise before touching the file system so that a failure here does
    # not leave an empty or truncated results file behind.
    payload = json.dumps({
        'overall_params': overall_params,
        'param_grid': param_grid,
        'cv_results': cv_results_serializable
    }, default=_to_builtin)

    parent_dir = os.path.dirname(results_filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(results_filename, 'w') as f:
        f.write(payload)

In [18]:
# Random Forest grid search on the 'original' texts.

# Custom scorers. greater_is_better=False makes the MSE scorer one to minimise.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)  # Minimize MSE
qwk_scorer = make_scorer(cohen_kappa_score, weights="quadratic", greater_is_better=True) # maximize QWK

# Run-level settings; they select the data and the scorer and are saved with the results.
overall_params = {
    'model': 'Random Forest', 
    'text_set': 'original',      # 'subset', or 'original' 
    'essay_set': [1, 3, 4, 5, 6],          # [1, 3, 4, 5, 6]
    'scorer': 'qwk' # key into `scorers` below: 'qwk', 'mse', or 'accuracy'
}

scorers = {'accuracy': 'accuracy', 
          'qwk': qwk_scorer, 
          'mse': mse_scorer}

# Define the pipeline with scaling, PCA, and a Random Forest classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Step 1: Standardize features
    ('pca', PCA()),                # Step 2: Perform PCA
    ('rfc', RandomForestClassifier(random_state=42))                 # Step 3: Fit Random Forest
])

# Define the parameter grid (keys are '<pipeline step>__<parameter>')
param_grid = {
    'pca__n_components': [0.9],                   # Number of PCA components to keep
    'rfc__n_estimators': [50, 100, 200],      # Number of trees in the forest
    'rfc__max_depth': [None, 10, 20],         # Maximum depth of the tree
    'rfc__min_samples_split': [2, 5, 10],         # Minimum samples to split a node
    'rfc__min_samples_leaf': [1, 2, 5],          # Minimum samples per leaf
}

X_train, y_train = data_selection(basic_feature_train_df, feature_cols, overall_params['essay_set'], overall_params['text_set'])

start_time = time.time()

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=scorers[overall_params['scorer']], verbose=1)
grid_search.fit(X_train, y_train)

# The message used to say "basic statistical feature extraction", which is not
# what is being timed here.
print(f"Time taken for grid search: {(time.time() - start_time):.5f} seconds")

# NOTE(review): the file name has no separator after 'grid_search' and inherits
# the space in 'Random Forest'; left unchanged so existing result files keep matching.
ts = datetime.datetime.now().strftime("%Y%m%d_%H%M")
results_filename = 'output/grid_search'+overall_params['model']+'_'+overall_params['text_set']+'_'+ts+'_.txt'
output_to_file(results_filename, grid_search, overall_params, param_grid)

print('output_generated:', results_filename)

# Read the file back so the table below shows exactly what was saved.
with open(results_filename, 'r') as f:
    data = json.load(f)

# Extract the stored overall_params, param_grid and cv_results_
overall_params, param_grid, cv_results = data['overall_params'], data['param_grid'], data['cv_results']

# Rank the candidates, best first, showing only the tuned parameters and scores.
cv_results_df = pd.DataFrame(cv_results)
cv_results_df.sort_values('rank_test_score')[['param_pca__n_components', 
                                              'param_rfc__n_estimators','param_rfc__max_depth','param_rfc__min_samples_split','param_rfc__min_samples_leaf',
                                              'mean_test_score', 'rank_test_score']]

X_train: (5386, 15)
y_train: (5386,)
Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV 1/5] END pca__n_components=0.9, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=50;, score=0.633 total time=   0.4s
[CV 2/5] END pca__n_components=0.9, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=50;, score=0.641 total time=   0.4s
[CV 3/5] END pca__n_components=0.9, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=50;, score=0.647 total time=   0.4s
[CV 4/5] END pca__n_components=0.9, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=50;, score=0.644 total time=   0.4s
[CV 5/5] END pca__n_components=0.9, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_samples_split=2, rfc__n_estimators=50;, score=0.641 total time=   0.4s
[CV 1/5] END pca__n_components=0.9, rfc__max_depth=None, rfc__min_samples_leaf=1, rfc__min_

Unnamed: 0,param_pca__n_components,param_rfc__n_estimators,param_rfc__max_depth,param_rfc__min_samples_split,param_rfc__min_samples_leaf,mean_test_score,rank_test_score
61,0.9,100,20.0,10,1,0.649069,1
62,0.9,200,20.0,10,1,0.648863,2
1,0.9,100,,2,1,0.648724,3
17,0.9,200,,10,2,0.648495,4
16,0.9,100,,10,2,0.647597,5
...,...,...,...,...,...,...,...
49,0.9,100,10.0,5,5,0.630941,76
46,0.9,100,10.0,2,5,0.630941,76
51,0.9,50,10.0,10,5,0.630589,79
48,0.9,50,10.0,5,5,0.630589,79


In [8]:
# Random Forest grid search on the 'subset' texts.

# Custom scorers. greater_is_better=False makes the MSE scorer one to minimise.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)  # Minimize MSE
qwk_scorer = make_scorer(cohen_kappa_score, weights="quadratic", greater_is_better=True) # maximize QWK

# Run-level settings; they select the data and the scorer and are saved with the results.
overall_params = {
    'model': 'Random Forest', 
    'text_set': 'subset',      # 'subset', or 'original' 
    'essay_set': [1, 3, 4, 5, 6],          # [1, 3, 4, 5, 6]
    'scorer': 'qwk' # key into `scorers` below: 'qwk', 'mse', or 'accuracy'
}

scorers = {'accuracy': 'accuracy', 
          'qwk': qwk_scorer, 
          'mse': mse_scorer}

# Define the pipeline with scaling, PCA, and a Random Forest classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Step 1: Standardize features
    ('pca', PCA()),                # Step 2: Perform PCA
    ('rfc', RandomForestClassifier(random_state=42))                 # Step 3: Fit Random Forest
])

# Define the parameter grid (keys are '<pipeline step>__<parameter>')
param_grid = {
    'pca__n_components': [0.9],                   # Number of PCA components to keep
    'rfc__n_estimators': [50, 100, 200],      # Number of trees in the forest
    'rfc__max_depth': [None, 10, 20],         # Maximum depth of the tree
    'rfc__min_samples_split': [2, 5, 10],         # Minimum samples to split a node
    'rfc__min_samples_leaf': [1, 2, 5],          # Minimum samples per leaf
}

X_train, y_train = data_selection(basic_feature_train_df, feature_cols, overall_params['essay_set'], overall_params['text_set'])

start_time = time.time()

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=scorers[overall_params['scorer']], verbose=1)
grid_search.fit(X_train, y_train)

# The message used to say "basic statistical feature extraction", which is not
# what is being timed here.
print(f"Time taken for grid search: {(time.time() - start_time):.5f} seconds")

# NOTE(review): the file name has no separator after 'grid_search' and inherits
# the space in 'Random Forest'; left unchanged so existing result files keep matching.
ts = datetime.datetime.now().strftime("%Y%m%d_%H%M")
results_filename = 'output/grid_search'+overall_params['model']+'_'+overall_params['text_set']+'_'+ts+'_.txt'
output_to_file(results_filename, grid_search, overall_params, param_grid)

print('output_generated:', results_filename)

# Read the file back so the table below shows exactly what was saved.
with open(results_filename, 'r') as f:
    data = json.load(f)

# Extract the stored overall_params, param_grid and cv_results_
overall_params, param_grid, cv_results = data['overall_params'], data['param_grid'], data['cv_results']

# Rank the candidates, best first, showing only the tuned parameters and scores.
cv_results_df = pd.DataFrame(cv_results)
cv_results_df.sort_values('rank_test_score')[['param_pca__n_components', 
                                              'param_rfc__n_estimators','param_rfc__max_depth','param_rfc__min_samples_split','param_rfc__min_samples_leaf',
                                              'mean_test_score', 'rank_test_score']]

Unnamed: 0,param_pca__n_components,param_rfc__n_estimators,param_rfc__max_depth,param_rfc__min_samples_split,param_rfc__min_samples_leaf,mean_test_score,rank_test_score
59,0.8,200,10.0,5,2,0.784240,1
50,0.8,200,10.0,2,1,0.783744,2
53,0.8,200,10.0,5,1,0.783608,3
70,0.8,100,20.0,5,2,0.783281,4
49,0.8,100,10.0,2,1,0.783258,5
...,...,...,...,...,...,...,...
23,0.9,200,10.0,5,2,0.758585,68
16,0.9,100,10.0,5,1,0.757276,69
18,0.9,50,10.0,2,2,0.756698,70
17,0.9,200,10.0,5,1,0.755324,71


In [None]:
# 'param_rfc__n_estimators','param_rfc__max_depth','param_rfc__min_samples_split','param_rfc__min_samples_leaf','param_rfc__bootstrap'