In [1]:
import os
import math
import json
import time
import random
import scipy
import regex as re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import datetime
# import warnings
# import spacy
from itertools import combinations


from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD, PCA

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, cohen_kappa_score, mean_squared_error, make_scorer
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Intermediate feature table produced by the simple_feature_extraction script.
basic_feature_train_df = pd.read_csv('basic_feature_train_data.csv')

# Raw training essays: a tab-separated file decoded as latin1.
raw_training_set = pd.read_csv(
    'asap-aes/training_set_rel3.tsv',
    encoding='latin1',
    sep='\t',
)


In [4]:
# Columns of the feature table that are passed to the model as predictors
# (15 in total); the order here is the column order of X_train.
feature_cols = [
    'word_count',
    'sent_count',
    'char_count',
    'sent_length',
    'spell_err_count',
    'syllabus_count',
    'FleKin_score',
    'DalCha_score',
    'unique_word_count',
    'neg',
    'neu',
    'pos',
    'compound',
    'cohesion',
    'AoA_score',
]

def data_selection(basic_feature_train_df, feature_cols, essay_set, text_set):
    """Select the feature matrix and target scores for one experiment.

    Rows are kept when their 'essay_set' is in ``essay_set``, their
    'essay_id' is not 10001, and their 'cohesion' value is not missing.
    The remaining rows are then split on the 'text_set' column.

    Parameters
    ----------
    basic_feature_train_df : pandas.DataFrame
        Feature table. Must contain the columns 'essay_set', 'essay_id',
        'cohesion', 'text_set', 'domain1_score' and everything listed in
        ``feature_cols``. It is not modified.
    feature_cols : list of str
        Columns returned as predictors.
    essay_set : list-like
        Values of 'essay_set' to keep.
    text_set : {'subset', 'original'}
        'original' keeps rows whose 'text_set' equals 'text_original';
        'subset' keeps every other row.

    Returns
    -------
    X_train : pandas.DataFrame
        The selected rows restricted to ``feature_cols``.
    y_train : numpy.ndarray
        The 'domain1_score' values of the same rows.

    Raises
    ------
    ValueError
        If ``text_set`` is neither 'subset' nor 'original'. (Previously this
        surfaced as an UnboundLocalError at the first print.)
    """
    if text_set not in ('subset', 'original'):
        raise ValueError(
            f"text_set must be 'subset' or 'original', got {text_set!r}"
        )

    df = basic_feature_train_df
    keep = (
        df['essay_set'].isin(essay_set)
        & (df['essay_id'] != 10001)  # the text is "NO IMAGE"
        & df['cohesion'].notna()
    )

    # Build the 'text_original' mask once and use it (or its complement)
    # for both X and y so the two can never get out of sync.
    is_original = df['text_set'] == 'text_original'
    keep &= is_original if text_set == 'original' else ~is_original

    selected = df.loc[keep]  # boolean .loc returns a new frame; input untouched
    X_train = selected[feature_cols]
    y_train = np.array(selected['domain1_score'])

    print('X_train:', X_train.shape)
    print('y_train:', y_train.shape)

    return X_train, y_train

In [9]:
def _to_jsonable(value):
    """``json`` fallback: convert NumPy arrays/scalars to plain Python objects."""
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


def output_to_file(results_filename, grid_search, overall_params, param_grid):
    """Write a grid-search run to ``results_filename`` as a single JSON object.

    The JSON object has three keys: 'overall_params', 'param_grid' and
    'cv_results' (``grid_search.cv_results_`` with every NumPy array turned
    into a list).

    Parameters
    ----------
    results_filename : str
        Destination path. Missing parent directories are created.
    grid_search : object
        Anything exposing a ``cv_results_`` mapping (e.g. a fitted GridSearchCV).
    overall_params : dict
        Run-level settings stored verbatim.
    param_grid : dict or list
        The searched grid, stored verbatim.

    Raises
    ------
    TypeError
        If a value cannot be converted to JSON. Nothing is written in that case.
    """
    cv_results_serializable = {
        key: (value.tolist() if isinstance(value, np.ndarray) else value)
        for key, value in grid_search.cv_results_.items()
    }

    # Serialise BEFORE opening the file: opening with 'w' truncates it, so a
    # failure during json.dump used to leave an empty/partial results file.
    # `default` handles NumPy scalars/arrays nested inside 'params' or
    # param_grid (e.g. a grid built with np.arange / np.logspace).
    payload = json.dumps(
        {
            'overall_params': overall_params,
            'param_grid': param_grid,
            'cv_results': cv_results_serializable,
        },
        default=_to_jsonable,
    )

    # Make sure the target directory exists so a long search is not lost to
    # a FileNotFoundError at the very last step.
    out_dir = os.path.dirname(results_filename)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(results_filename, 'w') as f:
        f.write(payload)

In [22]:
# Custom scorers for GridSearchCV.
#   - MSE: lower is better, hence greater_is_better=False.
#   - QWK: Cohen's kappa with quadratic weights, higher is better.
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
qwk_scorer = make_scorer(
    cohen_kappa_score,
    greater_is_better=True,
    weights="quadratic",
)

# Maps the 'scorer' name chosen in overall_params to the object handed to
# GridSearchCV(scoring=...).
scorers = dict(
    accuracy='accuracy',
    qwk=qwk_scorer,
    mse=mse_scorer,
)

# Run-level settings; they are also written to the results file.
overall_params = dict(
    model='SVM',
    text_set='original',        # 'subset' or 'original'
    essay_set=[1, 3, 4, 5, 6],  # essay sets to include
    scorer='qwk',               # a key of `scorers`: 'accuracy', 'qwk' or 'mse'
)

# Estimator under search: standardise the features, project with PCA, then
# classify with an SVM.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('svm', SVC()),
])

# Hyper-parameter grid: 2 * 4 * 4 * 2 = 64 candidates.
# NOTE(review): SVC only uses gamma for the 'rbf', 'poly' and 'sigmoid'
# kernels, so every linear candidate is fitted once per gamma value with the
# same result (hence the four-way tie at rank 1 in the results table).
# Passing a list of per-kernel grids would avoid the redundant fits.
param_grid = dict(
    # Floats in (0, 1) are read by PCA as the fraction of variance to retain,
    # not as a component count.
    pca__n_components=[0.9, 0.8],
    svm__C=[0.001, 0.1, 1, 10],               # regularisation strength
    svm__gamma=['scale', 'auto', 0.1, 0.01],  # kernel coefficient
    svm__kernel=['linear', 'rbf'],            # kernel type
)

# Build the training matrix and targets for the configured essay sets / text variant.
X_train, y_train = data_selection(
    basic_feature_train_df,
    feature_cols,
    overall_params['essay_set'],
    overall_params['text_set'],
)

# 5-fold cross-validated grid search, scored with the configured metric.
start_time = time.time()
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=5,
    scoring=scorers[overall_params['scorer']],
    verbose=4,
)
grid_search.fit(X_train, y_train)

# The interval measured here is the grid search itself (the previous message
# was copied from the feature-extraction notebook).
print(f"Time taken for grid search: {(time.time() - start_time):.5f} seconds")

# Persist the run; the timestamp keeps successive runs from overwriting each other.
ts = datetime.datetime.now().strftime("%Y%m%d_%H%M")
results_filename = f"output/grid_search{overall_params['model']}_{overall_params['text_set']}_{ts}_.txt"
output_to_file(results_filename, grid_search, overall_params, param_grid)
print('output_generated:', results_filename)

# Read the file back so the table below reflects exactly what was saved.
with open(results_filename, 'r') as f:
    data = json.load(f)

# Extract the stored settings, grid and cv_results_
overall_params, param_grid, cv_results = data['overall_params'], data['param_grid'], data['cv_results']

# NOTE(review): 'param_svm__gamma' is not among the displayed columns, so rows
# that differ only in gamma look identical in this table.
cv_results_df = pd.DataFrame(cv_results)
cv_results_df.sort_values('rank_test_score')[['param_pca__n_components', 'param_svm__C', 'param_svm__kernel', 'mean_test_score', 'rank_test_score']]

X_train: (1800, 15)
y_train: (1800,)
Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV 1/5] END pca__n_components=0.9, svm__C=0.001, svm__gamma=scale, svm__kernel=linear;, score=0.179 total time=   0.0s
[CV 2/5] END pca__n_components=0.9, svm__C=0.001, svm__gamma=scale, svm__kernel=linear;, score=0.087 total time=   0.0s
[CV 3/5] END pca__n_components=0.9, svm__C=0.001, svm__gamma=scale, svm__kernel=linear;, score=0.197 total time=   0.0s
[CV 4/5] END pca__n_components=0.9, svm__C=0.001, svm__gamma=scale, svm__kernel=linear;, score=0.139 total time=   0.0s
[CV 5/5] END pca__n_components=0.9, svm__C=0.001, svm__gamma=scale, svm__kernel=linear;, score=0.153 total time=   0.0s
[CV 1/5] END pca__n_components=0.9, svm__C=0.001, svm__gamma=scale, svm__kernel=rbf;, score=0.000 total time=   0.0s
[CV 2/5] END pca__n_components=0.9, svm__C=0.001, svm__gamma=scale, svm__kernel=rbf;, score=0.000 total time=   0.0s
[CV 3/5] END pca__n_components=0.9, svm__C=0.001, svm__gamma=scale,

Unnamed: 0,param_pca__n_components,param_svm__C,param_svm__kernel,mean_test_score,rank_test_score
60,0.8,10.000,linear,0.671855,1
58,0.8,10.000,linear,0.671855,1
62,0.8,10.000,linear,0.671855,1
56,0.8,10.000,linear,0.671855,1
54,0.8,1.000,linear,0.669903,5
...,...,...,...,...,...
35,0.8,0.001,rbf,0.000000,57
3,0.9,0.001,rbf,0.000000,57
39,0.8,0.001,rbf,0.000000,57
1,0.9,0.001,rbf,0.000000,57
