In [1]:
!pip install --no-index -U --find-links=/kaggle/input/lightautoml-038-dependecies lightautoml==0.3.8
!pip install --no-index -U --find-links=/kaggle/input/lightautoml-038-dependecies pandas==2.0.3

Looking in links: /kaggle/input/lightautoml-038-dependecies
Processing /kaggle/input/lightautoml-038-dependecies/lightautoml-0.3.8-py3-none-any.whl
Processing /kaggle/input/lightautoml-038-dependecies/AutoWoE-1.3.2-py3-none-any.whl (from lightautoml==0.3.8)
Processing /kaggle/input/lightautoml-038-dependecies/cmaes-0.10.0-py3-none-any.whl (from lightautoml==0.3.8)
Processing /kaggle/input/lightautoml-038-dependecies/joblib-1.2.0-py3-none-any.whl (from lightautoml==0.3.8)
Processing /kaggle/input/lightautoml-038-dependecies/json2html-1.3.0.tar.gz (from lightautoml==0.3.8)
  Preparing metadata (setup.py) ... [?25ldone
[?25hProcessing /kaggle/input/lightautoml-038-dependecies/lightgbm-3.2.1-py3-none-manylinux1_x86_64.whl (from lightautoml==0.3.8)
Processing /kaggle/input/lightautoml-038-dependecies/pandas-1.5.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (from lightautoml==0.3.8)
Processing /kaggle/input/lightautoml-038-dependecies/poetry_core-1.8.1-py3-none-any.whl (from

## Set Global Configuration Options

In [11]:
class CONFIG:
    """Notebook-wide settings, grouped by pipeline stage.

    The attributes are plain class-level constants and are read as
    ``CONFIG.<name>`` throughout the notebook.
    """

    # ------------------------------------------------------------------
    # General options
    # ------------------------------------------------------------------
    # global seed
    seed = 42
    # number of essays (ids) to subsample for quick test runs;
    # None means the full dataset is used
    samples_testing = None
    # max rows to display for pandas dataframes
    display_max_rows = 200
    # name of the response variate we are trying to predict
    response_variate = 'score'
    # minimum / maximum value of the response variate
    min_possible_response_value = 0.5
    max_possible_response_value = 6.0

    # ------------------------------------------------------------------
    # Feature engineering options
    # ------------------------------------------------------------------
    # load pre-engineered training features instead of recomputing them
    use_pre_fe_data = True
    # where the pre-engineered training features are stored
    pre_fe_data_filepath = '/kaggle/input/writing-quality-baseline-v2-train-data/feat_eng_train_feats.csv'

    # ------------------------------------------------------------------
    # Preprocessing options
    # ------------------------------------------------------------------
    # number of folds to split the data for CV
    num_folds = 10

    # ------------------------------------------------------------------
    # Modelling + training options
    # ------------------------------------------------------------------
    # either a list of model names, or 'all', in which case all models are used
    model_names = 'all'
    # number of trials to use for early stopping
    num_trials_early_stopping = 50
    # saved lightautoml model
    lightautoml_model_path = '/kaggle/input/writing-quality-baseline-v2-lightautoml/denselight.model'
    # saved lightautoml out-of-fold predictions
    lightautoml_oof_preds_path = '/kaggle/input/writing-quality-baseline-v2-lightautoml/denselight_oof_preds'

    # ------------------------------------------------------------------
    # Post-modelling options
    # ------------------------------------------------------------------
    # number of most important features to display in feature-importance plots
    num_features_to_display = 50

## Import Libraries

In [12]:
import warnings

import os
import gc
import re
import random
from collections import Counter, defaultdict
import pprint
import pickle
import time
import copy

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.autonotebook import tqdm

# from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder, PowerTransformer, RobustScaler, FunctionTransformer
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

import lightgbm as lgb
import xgboost as xgb
import catboost as cb
import optuna

## Set Some Options

In [13]:
# Register `progress_apply` on pandas objects and pick the plot style.
tqdm.pandas()
sns.set_style("whitegrid")

# Show every column; cap displayed rows at the configured limit.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", CONFIG.display_max_rows)
# NOTE(review): this silences *all* warnings, including NumPy's divide-by-zero
# warnings raised by the tf-idf log transforms in the Preprocessor.
warnings.simplefilter('ignore')

# Seed every global RNG the notebook imports: seeding only `random` leaves
# NumPy's global generator unseeded.
random.seed(CONFIG.seed)
np.random.seed(CONFIG.seed)

## Load Data

In [14]:
%%time
INPUT_DIR = '/kaggle/input/linking-writing-processes-to-writing-quality'
train_logs = pd.read_csv(f'{INPUT_DIR}/train_logs.csv')
train_scores = pd.read_csv(f'{INPUT_DIR}/train_scores.csv')
test_logs = pd.read_csv(f'{INPUT_DIR}/test_logs.csv')

CPU times: user 10.6 s, sys: 2.33 s, total: 13 s
Wall time: 13 s


## Subsample Data (If Specified)

In [15]:
# Optionally keep only a random subset of essays (ids) for quick test runs.
if CONFIG.samples_testing is not None:
    ids = list(train_logs["id"].unique())
    # random.sample raises ValueError when asked for more items than exist,
    # so never request more ids than are available.
    n_sample_ids = min(CONFIG.samples_testing, len(ids))
    sample_ids = random.sample(ids, n_sample_ids)
    train_logs = train_logs[train_logs["id"].isin(sample_ids)]

## Looking At Data

In [16]:
train_logs.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1


In [17]:
train_scores.head()

Unnamed: 0,id,score
0,001519c8,3.5
1,0022f953,3.5
2,0042269b,6.0
3,0059420b,2.0
4,0075873a,4.0


In [18]:
test_logs.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count
0,0000aaaa,1,338433,338518,85,Input,Space,Space,,0,0
1,0000aaaa,2,760073,760160,87,Input,Space,Space,,1,0
2,2222bbbb,1,711956,712023,67,Input,q,q,q,0,1
3,2222bbbb,2,290502,290548,46,Input,q,q,q,1,1
4,4444cccc,1,635547,635641,94,Input,Space,Space,,0,0


## Feature Engineering

### Essay Constructor

In [20]:
class EssayConstructor:
    
    def processingInputs(self,currTextInput):
        """Replay one essay's logged events and return the reconstructed text.

        Parameters
        ----------
        currTextInput : DataFrame
            Events of a single essay. Rows are read positionally through
            ``.values`` as (activity, cursor_position, text_change), which is
            the column selection `getEssays` passes in.

        Returns
        -------
        str
            The essay text after applying every event in row order.
        """
        # Where the essay content will be stored
        essayText = ""
        # Produces the essay by applying the events one at a time
        for Input in currTextInput.values:
            # Input[0] = activity
            # Input[1] = cursor_position
            # Input[2] = text_change
            # (the id is the group key in getEssays and is not part of the row)
            # If activity = Replace
            if Input[0] == 'Replace':
                # splits text_change at ' => ': [0] is the replaced text, [1] the new text
                replaceTxt = Input[2].split(' => ')
                # DONT TOUCH
                # The new text ends at the cursor position; it overwrites
                # len(replaceTxt[0]) characters starting where the new text begins.
                essayText = essayText[:Input[1] - len(replaceTxt[1])] + replaceTxt[1] + essayText[Input[1] - len(replaceTxt[1]) + len(replaceTxt[0]):]
                continue

            # If activity = Paste
            if Input[0] == 'Paste':
                # DONT TOUCH
                # Insert the pasted text so that it ends at the cursor position.
                essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
                continue

            # If activity = Remove/Cut
            if Input[0] == 'Remove/Cut':
                # DONT TOUCH
                # Drop len(text_change) characters starting at the cursor position.
                essayText = essayText[:Input[1]] + essayText[Input[1] + len(Input[2]):]
                continue

            # If activity = Move...
            # NOTE(review): this matches ANY activity containing a capital "M",
            # not only strings that start with "Move From".
            if "M" in Input[0]:
                # Drops the first 10 characters (the "Move From " prefix)
                croppedTxt = Input[0][10:]
                # Splits cropped text by ' To '
                splitTxt = croppedTxt.split(' To ')
                # Splits split text again by ', ' for each item
                valueArr = [item.split(', ') for item in splitTxt]
                # Move From [2, 4] To [5, 7] = (2, 4, 5, 7); the brackets are
                # stripped with [1:] and [:-1] before converting to int
                moveData = (int(valueArr[0][0][1:]), int(valueArr[0][1][:-1]), int(valueArr[1][0][1:]), int(valueArr[1][1][:-1]))
                # Skip if someone manages to activate this by moving to the same place
                if moveData[0] != moveData[2]:
                    # Check if they move text forward in essay (they are different)
                    if moveData[0] < moveData[2]:
                        # DONT TOUCH
                        essayText = essayText[:moveData[0]] + essayText[moveData[1]:moveData[3]] + essayText[moveData[0]:moveData[1]] + essayText[moveData[3]:]
                    else:
                        # DONT TOUCH
                        essayText = essayText[:moveData[2]] + essayText[moveData[0]:moveData[1]] + essayText[moveData[2]:moveData[0]] + essayText[moveData[1]:]
                continue

            # If activity = input (every activity not handled above falls through to here)
            # DONT TOUCH
            # Same insertion rule as Paste: the typed text ends at the cursor position.
            essayText = essayText[:Input[1] - len(Input[2])] + Input[2] + essayText[Input[1] - len(Input[2]):]
        return essayText
            
            
    def getEssays(self,df):
        # Copy required columns
        textInputDf = copy.deepcopy(df[['id', 'activity', 'cursor_position', 'text_change']])
        # Get rid of text inputs that make no change
        textInputDf = textInputDf[textInputDf.activity != 'Nonproduction']     
        # construct essay, fast 
        tqdm.pandas()
        essay=textInputDf.groupby('id')[['activity','cursor_position', 'text_change']].progress_apply(lambda x: self.processingInputs(x))      
        # to dataframe
        essayFrame=essay.to_frame().reset_index()
        essayFrame.columns=['id','essay']
        # Returns the essay series
        return essayFrame

## Preprocessor Class

In [21]:
# nth percentile function for agg
def percentile(n):
    """Return an aggregator computing the n-th percentile (n given on a 0-100 scale).

    The returned function is named ``pct_<n>`` (zero-padded to two digits),
    so it can be told apart from other aggregators by its ``__name__``.
    """
    fraction = n / 100

    def percentile_(x):
        return x.quantile(fraction)

    percentile_.__name__ = f'pct_{n:02.0f}'
    return percentile_

def q1(x):
    """First quartile (25th percentile) of ``x``."""
    lower_quartile = 0.25
    return x.quantile(lower_quartile)

def q3(x):
    """Third quartile (75th percentile) of ``x``."""
    upper_quartile = 0.75
    return x.quantile(upper_quartile)

class Preprocessor:
    def __init__(self, seed):
        self.seed = seed
        
        self.activities = ['Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste']
        self.events = ['q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick', 'ArrowLeft', '.', ',', 
              'ArrowDown', 'ArrowUp', 'Enter', 'CapsLock', "'", 'Delete', 'Unidentified']
        self.text_changes_dict = {
            'q': 'q', 
            ' ': 'space', 
            'NoChange': 'NoChange', 
            '.': 'full_stop', 
            ',': 'comma', 
            '\n': 'newline', 
            "'": 'single_quote', 
            '"': 'double_quote', 
            '-': 'dash', 
            '?': 'question_mark', 
            ';': 'semicolon', 
            '=': 'equals', 
            '/': 'slash', 
            '\\': 'double_backslash', 
            ':': 'colon'
        }
        self.punctuations = ['"', '.', ',', "'", '-', ';', ':', '?', '!', '<', '>', '/',
                        '@', '#', '$', '%', '^', '&', '*', '(', ')', '_', '+']
        self.gaps = [1, 2, 3, 5, 10, 20, 50, 70, 100]
        self.percentiles = [5, 10, 25, 50, 75, 90, 95]
        self.percentiles_cols = [percentile(n) for n in self.percentiles]
        self.aggregations = ['mean', 'std', 'min', 'max', 'first', 'last', 'sem', q1, 'median', q3, 'skew', pd.DataFrame.kurt, 'sum']
        self.idf = defaultdict(float)
        
        self.essay_constructor = EssayConstructor()
    
    def get_essay_aggregations(self, essay_df):
        cols_to_drop = ['essay']
        # Total essay length
        essay_df['essay_len'] = essay_df['essay'].apply(lambda x: len(x))
        essay_df = essay_df.drop(columns=cols_to_drop)
        return essay_df
    
    def split_essays_into_words(self, essay_df):
        essay_df['word'] = essay_df['essay'].apply(lambda x: re.split(' |\\n|\\.|\\?|\\!',x))
        essay_df = essay_df.explode('word')
        # Word length (number of characters in word)
        essay_df['word_len'] = essay_df['word'].apply(lambda x: len(x))
        essay_df = essay_df[essay_df['word_len'] != 0]
        return essay_df
    
    def compute_word_aggregations(self, word_df):
        word_agg_df = word_df[['id','word_len']].groupby(['id']).agg(self.aggregations)
        word_agg_df.columns = ['_'.join(x) for x in word_agg_df.columns]
        word_agg_df['id'] = word_agg_df.index
        # New features: computing the # of words whose length exceed word_l
        for word_l in [5, 6, 7, 8, 9, 10, 11, 12]:
            word_agg_df[f'word_len_ge_{word_l}_count'] = word_df[word_df['word_len'] >= word_l].groupby(['id']).count().iloc[:, 0]
            word_agg_df[f'word_len_ge_{word_l}_count'] = word_agg_df[f'word_len_ge_{word_l}_count'].fillna(0)
        word_agg_df = word_agg_df.reset_index(drop=True)
        return word_agg_df
    
    def split_essays_into_sentences(self, essay_df):
        essay_df['sent'] = essay_df['essay'].apply(lambda x: re.split('\\.|\\?|\\!',x))
        essay_df = essay_df.explode('sent')
        essay_df['sent'] = essay_df['sent'].apply(lambda x: x.replace('\n','').strip())
        # Number of characters in sentences
        essay_df['sent_len'] = essay_df['sent'].apply(lambda x: len(x))
        # Number of words in sentences
        essay_df['sent_word_count'] = essay_df['sent'].apply(lambda x: len(x.split(' ')))
        essay_df = essay_df[essay_df.sent_len!=0].reset_index(drop=True)
        return essay_df

    def compute_sentence_aggregations(self, sent_df):
        sent_agg_df = sent_df[['id','sent_len','sent_word_count']].groupby(['id']).agg(self.aggregations)
        sent_agg_df.columns = ['_'.join(x) for x in sent_agg_df.columns]
        sent_agg_df['id'] = sent_agg_df.index
        # New features: computing the # of sentences whose (character) length exceed sent_l
        for sent_l in [50, 60, 75, 100]:
            sent_agg_df[f'sent_len_ge_{sent_l}_count'] = sent_df[sent_df['sent_len'] >= sent_l].groupby(['id']).count().iloc[:, 0]
            sent_agg_df[f'sent_len_ge_{sent_l}_count'] = sent_agg_df[f'sent_len_ge_{sent_l}_count'].fillna(0)
        sent_agg_df = sent_agg_df.reset_index(drop=True)
        return sent_agg_df

    def split_essays_into_paragraphs(self, essay_df):
        essay_df['paragraph'] = essay_df['essay'].apply(lambda x: x.split('\n'))
        essay_df = essay_df.explode('paragraph')
        # Number of characters in paragraphs
        essay_df['paragraph_len'] = essay_df['paragraph'].apply(lambda x: len(x)) 
        # Number of sentences in paragraphs
        essay_df['paragraph_sent_count'] = essay_df['paragraph'].apply(lambda x: len(x.split('\\.|\\?|\\!')))
        # Number of words in paragraphs
        essay_df['paragraph_word_count'] = essay_df['paragraph'].apply(lambda x: len(x.split(' ')))
        essay_df = essay_df[essay_df.paragraph_len!=0].reset_index(drop=True)
        return essay_df

    def compute_paragraph_aggregations(self, paragraph_df):
        paragraph_agg_df = paragraph_df[['id','paragraph_len', 'paragraph_sent_count', 'paragraph_word_count']].groupby(['id']).agg(self.aggregations)
        paragraph_agg_df.columns = ['_'.join(x) for x in paragraph_agg_df.columns]
        paragraph_agg_df['id'] = paragraph_agg_df.index
        paragraph_agg_df = paragraph_agg_df.reset_index(drop=True)
        return paragraph_agg_df
        
    def activity_counts(self, df):
        tmp_df = df.groupby('id').agg({'activity': list}).reset_index()
        ret = list()
        for li in tqdm(tmp_df['activity'].values):
            items = list(Counter(li).items())
            di = dict()
            for k in self.activities:
                di[k] = 0
            # make dictionary entry for "move from X to Y"
            di["move_to"] = 0
            
            for item in items:
                k, v = item[0], item[1]
                if k in di:
                    di[k] = v
                else:
                    # we can do this because there are no missing values
                    di["move_to"] += v
            ret.append(di)
        
        ret = pd.DataFrame(ret)
        # using tfidf
        ret_tfidf = pd.DataFrame(ret)
        # returning counts as is
        ret_normal = pd.DataFrame(ret)
        
        tfidf_cols = [f'activity_{act}_tfidf_count' for act in ret.columns]
        normal_cols = [f'activity_{act}_normal_count' for act in ret.columns]
        
        ret_tfidf.columns = tfidf_cols
        ret_normal.columns = normal_cols
        
        '''
        Credit: https://www.kaggle.com/code/olyatsimboy/towards-tf-idf-in-logs-features
        '''
        cnts = ret_tfidf.sum(1)

        for col in tfidf_cols:
            if col in self.idf.keys():
                idf = self.idf[col]
            else:
                idf = df.shape[0] / (ret_tfidf[col].sum() + 1)
                idf = np.log(idf)
                self.idf[col] = idf

            ret_tfidf[col] = 1 + np.log(ret_tfidf[col] / cnts)
            ret_tfidf[col] *= idf
        
        ret_agg = pd.concat([ret_tfidf, ret_normal], axis=1)
        return ret_agg

    def event_counts(self, df, colname):
        tmp_df = df.groupby('id').agg({colname: list}).reset_index()
        ret = list()
        for li in tqdm(tmp_df[colname].values):
            items = list(Counter(li).items())
            di = dict()
            for k in self.events:
                di[k] = 0
            for item in items:
                k, v = item[0], item[1]
                if k in di:
                    di[k] = v
            ret.append(di)
            
        ret = pd.DataFrame(ret)
        # using tfidf
        ret_tfidf = pd.DataFrame(ret)
        # returning counts as is
        ret_normal = pd.DataFrame(ret)
        
        tfidf_cols = [f'{colname}_{event}_tfidf_count' for event in ret.columns]
        normal_cols = [f'{colname}_{event}_normal_count' for event in ret.columns]
        
        ret_tfidf.columns = tfidf_cols
        ret_normal.columns = normal_cols
        
        '''
        Credit: https://www.kaggle.com/code/olyatsimboy/towards-tf-idf-in-logs-features
        '''
        cnts = ret_tfidf.sum(1)

        for col in tfidf_cols:
            if col in self.idf.keys():
                idf = self.idf[col]
            else:
                idf = df.shape[0] / (ret_tfidf[col].sum() + 1)
                idf = np.log(idf)
                self.idf[col] = idf

            ret_tfidf[col] = 1 + np.log(ret_tfidf[col] / cnts)
            ret_tfidf[col] *= idf
        
        ret_agg = pd.concat([ret_tfidf, ret_normal], axis=1)
        return ret_agg

    def text_change_counts(self, df):
        tmp_df = df.groupby('id').agg({'text_change': list}).reset_index()
        ret = list()
        for li in tqdm(tmp_df['text_change'].values):
            items = list(Counter(li).items())
            di = dict()
            for k in self.text_changes_dict.keys():
                di[k] = 0
            for item in items:
                k, v = item[0], item[1]
                if k in di:
                    di[k] = v
            ret.append(di)
            
        ret = pd.DataFrame(ret)
        # using tfidf
        ret_tfidf = pd.DataFrame(ret)
        # returning counts as is
        ret_normal = pd.DataFrame(ret)
        
        tfidf_cols = [f'text_change_{self.text_changes_dict[txt_change]}_tfidf_count' for txt_change in ret.columns]
        normal_cols = [f'text_change_{self.text_changes_dict[txt_change]}_normal_count' for txt_change in ret.columns]
        
        ret_tfidf.columns = tfidf_cols
        ret_normal.columns = normal_cols
        
        '''
        Credit: https://www.kaggle.com/code/olyatsimboy/towards-tf-idf-in-logs-features
        '''
        cnts = ret_tfidf.sum(1)

        for col in tfidf_cols:
            if col in self.idf.keys():
                idf = self.idf[col]
            else:
                idf = df.shape[0] / (ret_tfidf[col].sum() + 1)
                idf = np.log(idf)
                self.idf[col] = idf

            ret_tfidf[col] = 1 + np.log(ret_tfidf[col] / cnts)
            ret_tfidf[col] *= idf
        
        ret_agg = pd.concat([ret_tfidf, ret_normal], axis=1)
        return ret_agg
    
    def match_punctuations(self, df):
        tmp_df = df.groupby('id').agg({'down_event': list}).reset_index()
        ret = list()
        for li in tqdm(tmp_df['down_event'].values):
            cnt = 0
            items = list(Counter(li).items())
            for item in items:
                k, v = item[0], item[1]
                if k in self.punctuations:
                    cnt += v
            ret.append(cnt)
        ret = pd.DataFrame({'punct_cnt': ret})
        return ret

    # Credit: https://www.kaggle.com/code/abdullahmeda/enter-ing-the-timeseries-space-sec-3-new-aggs/notebook
    def make_space_features(self, df):
        df['up_time_lagged'] = df.groupby('id')['up_time'].shift(1).fillna(df['down_time'])
        df['time_diff'] = abs(df['down_time'] - df['up_time_lagged']) / 1000

        group = df.groupby('id')['time_diff']
        largest_lantency = group.max()
        smallest_lantency = group.min()
        median_lantency = group.median()
        initial_pause = df.groupby('id')['down_time'].first() / 1000
        pauses_half_sec = group.apply(lambda x: ((x > 0.5) & (x < 1)).sum())
        pauses_1_sec = group.apply(lambda x: ((x > 1) & (x < 1.5)).sum())
        pauses_1_half_sec = group.apply(lambda x: ((x > 1.5) & (x < 2)).sum())
        pauses_2_sec = group.apply(lambda x: ((x > 2) & (x < 3)).sum())
        pauses_3_sec = group.apply(lambda x: (x > 3).sum())
        
        result = pd.DataFrame({
            'id': df['id'].unique(),
            'largest_lantency': largest_lantency,
            'smallest_lantency': smallest_lantency,
            'median_lantency': median_lantency,
            'initial_pause': initial_pause,
            'pauses_half_sec': pauses_half_sec,
            'pauses_1_sec': pauses_1_sec,
            'pauses_1_half_sec': pauses_1_half_sec,
            'pauses_2_sec': pauses_2_sec,
            'pauses_3_sec': pauses_3_sec,
        }).reset_index(drop=True)
        return result
    
    def get_input_words(self, df):
        tmp_df = df[(~df['text_change'].str.contains('=>'))&(df['text_change'] != 'NoChange')].reset_index(drop=True)
        tmp_df = tmp_df.groupby('id').agg({'text_change': list}).reset_index()
        tmp_df['text_change'] = tmp_df['text_change'].apply(lambda x: ''.join(x))
        tmp_df['text_change'] = tmp_df['text_change'].apply(lambda x: re.findall(r'q+', x))
        tmp_df['input_word_count'] = tmp_df['text_change'].apply(len)
        tmp_df['input_word_length_mean'] = tmp_df['text_change'].apply(lambda x: np.mean([len(i) for i in x] if len(x) > 0 else 0))
        for percentile in self.percentiles:
            tmp_df[f'input_word_length_pct_{percentile}'] = tmp_df['text_change'].apply(lambda x: np.percentile([len(i) for i in x] if len(x) > 0 else 0, 
                                                                                                               percentile))
        tmp_df['input_word_length_max'] = tmp_df['text_change'].apply(lambda x: np.max([len(i) for i in x] if len(x) > 0 else 0))
        tmp_df['input_word_length_std'] = tmp_df['text_change'].apply(lambda x: np.std([len(i) for i in x] if len(x) > 0 else 0))
        tmp_df.drop(['text_change'], axis=1, inplace=True)
        return tmp_df
    
    def make_feats(self, df: pd.DataFrame, save_essays_path: str):
        """Engineer the full per-essay feature table from a keystroke-log frame.

        Parameters
        ----------
        df : DataFrame
            Event log with (at least) the columns id, event_id, down_time,
            up_time, action_time, activity, down_event, up_event, text_change,
            cursor_position and word_count.
        save_essays_path : str
            CSV path the reconstructed essays are written to.

        Returns
        -------
        DataFrame
            One row per id, in order of first appearance in `df`.

        Side effects
        ------------
        * Writes the reconstructed essays to `save_essays_path`.
        * Adds columns to `df` itself: the gap / change columns created below
          and the two columns added by `make_space_features`.

        Notes
        -----
        The count helpers (`activity_counts`, `event_counts`,
        `text_change_counts`, `match_punctuations`) return their rows in
        ``groupby('id')`` order and carry no id column. They used to be glued
        onto `feats` positionally with ``pd.concat(axis=1)``, which attached the
        counts to the wrong essays whenever the ids in `df` were not already
        sorted. They are now keyed by id and merged like every other block.
        """
        print("Starting to engineer features")

        # initialize features dataframe
        feats = pd.DataFrame({'id': df['id'].unique().tolist()})

        # ids in exactly the order a groupby('id') on this frame yields them
        group_ids = df.groupby('id').size().index.to_numpy()

        def merge_group_ordered(feats_df, counts_df):
            # attach the id to positionally ordered group rows, then join on it
            keyed = counts_df.copy()
            keyed.insert(0, 'id', group_ids)
            return feats_df.merge(keyed, on='id', how='left')

        # get essay feats
        print("Getting essays")
        essay_df = self.essay_constructor.getEssays(df)
        essay_df.to_csv(save_essays_path, index=False)

        print("Getting essay aggregations data")
        essay_agg_df = self.get_essay_aggregations(essay_df)
        feats = feats.merge(essay_agg_df, on='id', how='left')

        print("Getting essay word aggregations data")
        word_df = self.split_essays_into_words(essay_df)
        word_agg_df = self.compute_word_aggregations(word_df)
        feats = feats.merge(word_agg_df, on='id', how='left')

        print("Getting essay sentence aggregations data")
        sent_df = self.split_essays_into_sentences(essay_df)
        sent_agg_df = self.compute_sentence_aggregations(sent_df)
        feats = feats.merge(sent_agg_df, on='id', how='left')

        print("Getting essay paragraph aggregations data")
        paragraph_df = self.split_essays_into_paragraphs(essay_df)
        paragraph_agg_df = self.compute_paragraph_aggregations(paragraph_df)
        feats = feats.merge(paragraph_agg_df, on='id', how='left')

        # engineer counts data
        print("Engineering activity counts data")
        feats = merge_group_ordered(feats, self.activity_counts(df))

        print("Engineering event counts data")
        feats = merge_group_ordered(feats, self.event_counts(df, 'down_event'))
        feats = merge_group_ordered(feats, self.event_counts(df, 'up_event'))

        print("Engineering text change counts data")
        feats = merge_group_ordered(feats, self.text_change_counts(df))

        print("Engineering punctuation counts data")
        feats = merge_group_ordered(feats, self.match_punctuations(df))

        # space features
        print("Engineering space-related data")
        tmp_df = self.make_space_features(df)
        feats = feats.merge(tmp_df, on='id', how='left')

        # get shifted features
        # time shift: gap between this event's down_time and the up_time `gap` events earlier
        print("Engineering time data")
        for gap in self.gaps:
            print(f"> for gap {gap}")
            df[f'up_time_shift{gap}'] = df.groupby('id')['up_time'].shift(gap)
            df[f'action_time_gap{gap}'] = df['down_time'] - df[f'up_time_shift{gap}']
        df.drop(columns=[f'up_time_shift{gap}' for gap in self.gaps], inplace=True)

        # cursor position shift
        print("Engineering cursor position data - gaps")
        for gap in self.gaps:
            print(f"> for gap {gap}")
            df[f'cursor_position_shift{gap}'] = df.groupby('id')['cursor_position'].shift(gap)
            df[f'cursor_position_change{gap}'] = df['cursor_position'] - df[f'cursor_position_shift{gap}']
            df[f'cursor_position_abs_change{gap}'] = np.abs(df[f'cursor_position_change{gap}'])
        df.drop(columns=[f'cursor_position_shift{gap}' for gap in self.gaps], inplace=True)

        # word count shift
        print("Engineering word count data - gaps")
        for gap in self.gaps:
            print(f"> for gap {gap}")
            df[f'word_count_shift{gap}'] = df.groupby('id')['word_count'].shift(gap)
            df[f'word_count_change{gap}'] = df['word_count'] - df[f'word_count_shift{gap}']
            df[f'word_count_abs_change{gap}'] = np.abs(df[f'word_count_change{gap}'])
        df.drop(columns=[f'word_count_shift{gap}' for gap in self.gaps], inplace=True)

        # get aggregate statistical features
        print("Engineering statistical summaries for features")
        # [(feature name, [ stat summaries to add ])]
        # the percentile aggregators built in __init__ are used throughout
        percentiles_cols = self.percentiles_cols
        feats_stat = [
            ('event_id', ['max']),
            ('up_time', ['first', 'last', 'max']),
            ('down_time', ['first', 'last', 'max']),
            ('action_time', ['max', 'mean', 'std', 'sem', 'skew', pd.DataFrame.kurt] + percentiles_cols),
            ('activity', ['nunique']),
            ('down_event', ['nunique']),
            ('up_event', ['nunique']),
            ('text_change', ['nunique']),
            ('cursor_position', ['max']),
            ('word_count', ['max'])]

        for gap in self.gaps:
            feats_stat.extend([
                (f'action_time_gap{gap}', ['first', 'last', 'max', 'min', 'mean', 'std', 'sem', 'skew', pd.DataFrame.kurt] + percentiles_cols),
                (f'cursor_position_change{gap}', ['first', 'last', 'max', 'mean', 'std', 'sem', 'skew', pd.DataFrame.kurt]),
                (f'word_count_change{gap}', ['max', 'mean', 'std', 'sum', 'sem', 'skew', pd.DataFrame.kurt] + percentiles_cols),
            ])

        pbar = tqdm(feats_stat)
        for colname, methods in pbar:
            for method in methods:
                # string aggregations are named after themselves, callables after __name__
                method_name = method if isinstance(method, str) else method.__name__
                pbar.set_postfix(column=colname, method=method_name)
                tmp_df = df.groupby(['id']).agg({colname: method}).reset_index().rename(columns={colname: f'{colname}_{method_name}'})
                feats = feats.merge(tmp_df, on='id', how='left')

        print("Engineering input words data")
        tmp_df = self.get_input_words(df)
        feats = pd.merge(feats, tmp_df, on='id', how='left')

        # compare feats
        print("Engineering ratios data")
        feats['word_time_ratio'] = feats['word_count_max'] / feats['up_time_max']
        feats['word_event_ratio'] = feats['word_count_max'] / feats['event_id_max']
        feats['event_time_ratio'] = feats['event_id_max'] / feats['up_time_max']
        feats['idle_time_ratio'] = feats['action_time_gap1_mean'] / feats['up_time_max']

        print("Done!")
        return feats

## Run Preprocessor

In [22]:
# Training features: either load the cached CSV or engineer them from the logs.
if CONFIG.use_pre_fe_data:
    print("-"*25, "Loading pre-engineered features for training data", "-"*25, sep="\n")
    train_feats = pd.read_csv(CONFIG.pre_fe_data_filepath)
else:
    preprocessor_train = Preprocessor(seed=CONFIG.seed)
    print("-"*25, "Engineering features for training data", "-"*25, sep="\n")
    train_feats = preprocessor_train.make_feats(train_logs, save_essays_path='train_essays.csv')
    del preprocessor_train
    gc.collect()

# Test features are always engineered from the logs.
# NOTE(review): a brand-new Preprocessor starts with an empty idf cache, so the
# tf-idf weights of the test features are derived from the test logs alone.
print("", "-"*25, "Engineering features for test data", "-"*25, sep="\n")
preprocessor_test = Preprocessor(seed=CONFIG.seed)
test_feats = preprocessor_test.make_feats(test_logs, save_essays_path='test_essays.csv')
del preprocessor_test
gc.collect()

-------------------------
Loading pre-engineered features for training data
-------------------------

-------------------------
Engineering features for test data
-------------------------
Starting to engineer features
Getting essays


  0%|          | 0/3 [00:00<?, ?it/s]

Getting essay aggregations data
Getting essay word aggregations data
Getting essay sentence aggregations data
Getting essay paragraph aggregations data
Engineering activity counts data


  0%|          | 0/3 [00:00<?, ?it/s]

Engineering event counts data


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Engineering text change counts data


  0%|          | 0/3 [00:00<?, ?it/s]

Engineering punctuation counts data


  0%|          | 0/3 [00:00<?, ?it/s]

Engineering space-related data
Engineering time data
> for gap 1
> for gap 2
> for gap 3
> for gap 5
> for gap 10
> for gap 20
> for gap 50
> for gap 70
> for gap 100
Engineering cursor position data - gaps
> for gap 1
> for gap 2
> for gap 3
> for gap 5
> for gap 10
> for gap 20
> for gap 50
> for gap 70
> for gap 100
Engineering word count data - gaps
> for gap 1
> for gap 2
> for gap 3
> for gap 5
> for gap 10
> for gap 20
> for gap 50
> for gap 70
> for gap 100
Engineering statistical summaries for features


  0%|          | 0/37 [00:00<?, ?it/s]

Engineering input words data
Engineering ratios data
Done!


152

In [23]:
train_feats.to_csv("feat_eng_train_feats.csv", index=False)

In [24]:
print(f"Shape of training data : {train_feats.shape}" )
print(f"Shape of test data : {test_feats.shape}" )

Shape of training data : (2471, 591)
Shape of test data : (3, 591)


In [25]:
assert train_feats.shape[1] == test_feats.shape[1], "Train and test data must have same number of features."

In [26]:
train_feats.head()

Unnamed: 0,id,essay_len,word_len_mean,word_len_std,word_len_min,word_len_max,word_len_first,word_len_last,word_len_sem,word_len_q1,word_len_median,word_len_q3,word_len_skew,word_len_kurt,word_len_sum,word_len_ge_5_count,word_len_ge_6_count,word_len_ge_7_count,word_len_ge_8_count,word_len_ge_9_count,word_len_ge_10_count,word_len_ge_11_count,word_len_ge_12_count,sent_len_mean,sent_len_std,sent_len_min,sent_len_max,sent_len_first,sent_len_last,sent_len_sem,sent_len_q1,sent_len_median,sent_len_q3,sent_len_skew,sent_len_kurt,sent_len_sum,sent_word_count_mean,sent_word_count_std,sent_word_count_min,sent_word_count_max,sent_word_count_first,sent_word_count_last,sent_word_count_sem,sent_word_count_q1,sent_word_count_median,sent_word_count_q3,sent_word_count_skew,sent_word_count_kurt,sent_word_count_sum,sent_len_ge_50_count,sent_len_ge_60_count,sent_len_ge_75_count,sent_len_ge_100_count,paragraph_len_mean,paragraph_len_std,paragraph_len_min,paragraph_len_max,paragraph_len_first,paragraph_len_last,paragraph_len_sem,paragraph_len_q1,paragraph_len_median,paragraph_len_q3,paragraph_len_skew,paragraph_len_kurt,paragraph_len_sum,paragraph_sent_count_mean,paragraph_sent_count_std,paragraph_sent_count_min,paragraph_sent_count_max,paragraph_sent_count_first,paragraph_sent_count_last,paragraph_sent_count_sem,paragraph_sent_count_q1,paragraph_sent_count_median,paragraph_sent_count_q3,paragraph_sent_count_skew,paragraph_sent_count_kurt,paragraph_sent_count_sum,paragraph_word_count_mean,paragraph_word_count_std,paragraph_word_count_min,paragraph_word_count_max,paragraph_word_count_first,paragraph_word_count_last,paragraph_word_count_sem,paragraph_word_count_q1,paragraph_word_count_median,paragraph_word_count_q3,paragraph_word_count_skew,paragraph_word_count_kurt,paragraph_word_count_sum,activity_Input_tfidf_count,activity_Remove/Cut_tfidf_count,activity_Nonproduction_tfidf_count,activity_Replace_tfidf_count,activity_Paste_tfidf_count,activity_move_to_tfidf_count,activity_Input_normal_cou
nt,activity_Remove/Cut_normal_count,activity_Nonproduction_normal_count,activity_Replace_normal_count,activity_Paste_normal_count,activity_move_to_normal_count,down_event_q_tfidf_count,down_event_Space_tfidf_count,down_event_Backspace_tfidf_count,down_event_Shift_tfidf_count,down_event_ArrowRight_tfidf_count,down_event_Leftclick_tfidf_count,down_event_ArrowLeft_tfidf_count,down_event_._tfidf_count,"down_event_,_tfidf_count",down_event_ArrowDown_tfidf_count,down_event_ArrowUp_tfidf_count,down_event_Enter_tfidf_count,down_event_CapsLock_tfidf_count,down_event_'_tfidf_count,down_event_Delete_tfidf_count,down_event_Unidentified_tfidf_count,down_event_q_normal_count,down_event_Space_normal_count,down_event_Backspace_normal_count,down_event_Shift_normal_count,down_event_ArrowRight_normal_count,down_event_Leftclick_normal_count,down_event_ArrowLeft_normal_count,down_event_._normal_count,"down_event_,_normal_count",down_event_ArrowDown_normal_count,down_event_ArrowUp_normal_count,down_event_Enter_normal_count,down_event_CapsLock_normal_count,down_event_'_normal_count,down_event_Delete_normal_count,down_event_Unidentified_normal_count,up_event_q_tfidf_count,up_event_Space_tfidf_count,up_event_Backspace_tfidf_count,up_event_Shift_tfidf_count,up_event_ArrowRight_tfidf_count,up_event_Leftclick_tfidf_count,up_event_ArrowLeft_tfidf_count,up_event_._tfidf_count,"up_event_,_tfidf_count",up_event_ArrowDown_tfidf_count,up_event_ArrowUp_tfidf_count,up_event_Enter_tfidf_count,up_event_CapsLock_tfidf_count,up_event_'_tfidf_count,up_event_Delete_tfidf_count,up_event_Unidentified_tfidf_count,up_event_q_normal_count,up_event_Space_normal_count,up_event_Backspace_normal_count,up_event_Shift_normal_count,up_event_ArrowRight_normal_count,up_event_Leftclick_normal_count,up_event_ArrowLeft_normal_count,up_event_._normal_count,"up_event_,_normal_count",up_event_ArrowDown_normal_count,up_event_ArrowUp_normal_count,up_event_Enter_normal_count,up_event_CapsLock_normal_count,up_event_'_normal_count,
up_event_Delete_normal_count,up_event_Unidentified_normal_count,text_change_q_tfidf_count,text_change_space_tfidf_count,text_change_NoChange_tfidf_count,text_change_full_stop_tfidf_count,text_change_comma_tfidf_count,text_change_newline_tfidf_count,text_change_single_quote_tfidf_count,text_change_double_quote_tfidf_count,text_change_dash_tfidf_count,text_change_question_mark_tfidf_count,text_change_semicolon_tfidf_count,text_change_equals_tfidf_count,text_change_slash_tfidf_count,text_change_double_backslash_tfidf_count,text_change_colon_tfidf_count,text_change_q_normal_count,text_change_space_normal_count,text_change_NoChange_normal_count,text_change_full_stop_normal_count,text_change_comma_normal_count,text_change_newline_normal_count,text_change_single_quote_normal_count,text_change_double_quote_normal_count,text_change_dash_normal_count,text_change_question_mark_normal_count,text_change_semicolon_normal_count,text_change_equals_normal_count,text_change_slash_normal_count,text_change_double_backslash_normal_count,text_change_colon_normal_count,punct_cnt,largest_lantency,smallest_lantency,median_lantency,initial_pause,pauses_half_sec,pauses_1_sec,pauses_1_half_sec,pauses_2_sec,pauses_3_sec,event_id_max,up_time_first,up_time_last,up_time_max,down_time_first,down_time_last,down_time_max,action_time_max,action_time_mean,action_time_std,action_time_sem,action_time_skew,action_time_kurt,action_time_pct_05,action_time_pct_10,action_time_pct_25,action_time_pct_50,action_time_pct_75,action_time_pct_90,action_time_pct_95,activity_nunique,down_event_nunique,up_event_nunique,text_change_nunique,cursor_position_max,word_count_max,action_time_gap1_first,action_time_gap1_last,action_time_gap1_max,action_time_gap1_min,action_time_gap1_mean,action_time_gap1_std,action_time_gap1_sem,action_time_gap1_skew,action_time_gap1_kurt,action_time_gap1_pct_05,action_time_gap1_pct_10,action_time_gap1_pct_25,action_time_gap1_pct_50,action_time_gap1_pct_75,action_time_gap1_pct_90,action_time_g
ap1_pct_95,cursor_position_change1_first,cursor_position_change1_last,cursor_position_change1_max,cursor_position_change1_mean,cursor_position_change1_std,cursor_position_change1_sem,cursor_position_change1_skew,cursor_position_change1_kurt,word_count_change1_max,word_count_change1_mean,word_count_change1_std,word_count_change1_sum,word_count_change1_sem,word_count_change1_skew,word_count_change1_kurt,word_count_change1_pct_05,word_count_change1_pct_10,word_count_change1_pct_25,word_count_change1_pct_50,word_count_change1_pct_75,word_count_change1_pct_90,word_count_change1_pct_95,action_time_gap2_first,action_time_gap2_last,action_time_gap2_max,action_time_gap2_min,action_time_gap2_mean,action_time_gap2_std,action_time_gap2_sem,action_time_gap2_skew,action_time_gap2_kurt,action_time_gap2_pct_05,action_time_gap2_pct_10,action_time_gap2_pct_25,action_time_gap2_pct_50,action_time_gap2_pct_75,action_time_gap2_pct_90,action_time_gap2_pct_95,cursor_position_change2_first,cursor_position_change2_last,cursor_position_change2_max,cursor_position_change2_mean,cursor_position_change2_std,cursor_position_change2_sem,cursor_position_change2_skew,cursor_position_change2_kurt,word_count_change2_max,word_count_change2_mean,word_count_change2_std,word_count_change2_sum,word_count_change2_sem,word_count_change2_skew,word_count_change2_kurt,word_count_change2_pct_05,word_count_change2_pct_10,word_count_change2_pct_25,word_count_change2_pct_50,word_count_change2_pct_75,word_count_change2_pct_90,word_count_change2_pct_95,action_time_gap3_first,action_time_gap3_last,action_time_gap3_max,action_time_gap3_min,action_time_gap3_mean,action_time_gap3_std,action_time_gap3_sem,action_time_gap3_skew,action_time_gap3_kurt,action_time_gap3_pct_05,action_time_gap3_pct_10,action_time_gap3_pct_25,action_time_gap3_pct_50,action_time_gap3_pct_75,action_time_gap3_pct_90,action_time_gap3_pct_95,cursor_position_change3_first,cursor_position_change3_last,cursor_position_change3_max,cursor_position_change3_
mean,cursor_position_change3_std,cursor_position_change3_sem,cursor_position_change3_skew,cursor_position_change3_kurt,word_count_change3_max,word_count_change3_mean,word_count_change3_std,word_count_change3_sum,word_count_change3_sem,word_count_change3_skew,word_count_change3_kurt,word_count_change3_pct_05,word_count_change3_pct_10,word_count_change3_pct_25,word_count_change3_pct_50,word_count_change3_pct_75,word_count_change3_pct_90,word_count_change3_pct_95,action_time_gap5_first,action_time_gap5_last,action_time_gap5_max,action_time_gap5_min,action_time_gap5_mean,action_time_gap5_std,action_time_gap5_sem,action_time_gap5_skew,action_time_gap5_kurt,action_time_gap5_pct_05,action_time_gap5_pct_10,action_time_gap5_pct_25,action_time_gap5_pct_50,action_time_gap5_pct_75,action_time_gap5_pct_90,action_time_gap5_pct_95,cursor_position_change5_first,cursor_position_change5_last,cursor_position_change5_max,cursor_position_change5_mean,cursor_position_change5_std,cursor_position_change5_sem,cursor_position_change5_skew,cursor_position_change5_kurt,word_count_change5_max,word_count_change5_mean,word_count_change5_std,word_count_change5_sum,word_count_change5_sem,word_count_change5_skew,word_count_change5_kurt,word_count_change5_pct_05,word_count_change5_pct_10,word_count_change5_pct_25,word_count_change5_pct_50,word_count_change5_pct_75,word_count_change5_pct_90,word_count_change5_pct_95,action_time_gap10_first,action_time_gap10_last,action_time_gap10_max,action_time_gap10_min,action_time_gap10_mean,action_time_gap10_std,action_time_gap10_sem,action_time_gap10_skew,action_time_gap10_kurt,action_time_gap10_pct_05,action_time_gap10_pct_10,action_time_gap10_pct_25,action_time_gap10_pct_50,action_time_gap10_pct_75,action_time_gap10_pct_90,action_time_gap10_pct_95,cursor_position_change10_first,cursor_position_change10_last,cursor_position_change10_max,cursor_position_change10_mean,cursor_position_change10_std,cursor_position_change10_sem,cursor_position_change10_skew,cursor_po
sition_change10_kurt,word_count_change10_max,word_count_change10_mean,word_count_change10_std,word_count_change10_sum,word_count_change10_sem,word_count_change10_skew,word_count_change10_kurt,word_count_change10_pct_05,word_count_change10_pct_10,word_count_change10_pct_25,word_count_change10_pct_50,word_count_change10_pct_75,word_count_change10_pct_90,word_count_change10_pct_95,action_time_gap20_first,action_time_gap20_last,action_time_gap20_max,action_time_gap20_min,action_time_gap20_mean,action_time_gap20_std,action_time_gap20_sem,action_time_gap20_skew,action_time_gap20_kurt,action_time_gap20_pct_05,action_time_gap20_pct_10,action_time_gap20_pct_25,action_time_gap20_pct_50,action_time_gap20_pct_75,action_time_gap20_pct_90,action_time_gap20_pct_95,cursor_position_change20_first,cursor_position_change20_last,cursor_position_change20_max,cursor_position_change20_mean,cursor_position_change20_std,cursor_position_change20_sem,cursor_position_change20_skew,cursor_position_change20_kurt,word_count_change20_max,word_count_change20_mean,word_count_change20_std,word_count_change20_sum,word_count_change20_sem,word_count_change20_skew,word_count_change20_kurt,word_count_change20_pct_05,word_count_change20_pct_10,word_count_change20_pct_25,word_count_change20_pct_50,word_count_change20_pct_75,word_count_change20_pct_90,word_count_change20_pct_95,action_time_gap50_first,action_time_gap50_last,action_time_gap50_max,action_time_gap50_min,action_time_gap50_mean,action_time_gap50_std,action_time_gap50_sem,action_time_gap50_skew,action_time_gap50_kurt,action_time_gap50_pct_05,action_time_gap50_pct_10,action_time_gap50_pct_25,action_time_gap50_pct_50,action_time_gap50_pct_75,action_time_gap50_pct_90,action_time_gap50_pct_95,cursor_position_change50_first,cursor_position_change50_last,cursor_position_change50_max,cursor_position_change50_mean,cursor_position_change50_std,cursor_position_change50_sem,cursor_position_change50_skew,cursor_position_change50_kurt,word_count_change50_max,w
ord_count_change50_mean,word_count_change50_std,word_count_change50_sum,word_count_change50_sem,word_count_change50_skew,word_count_change50_kurt,word_count_change50_pct_05,word_count_change50_pct_10,word_count_change50_pct_25,word_count_change50_pct_50,word_count_change50_pct_75,word_count_change50_pct_90,word_count_change50_pct_95,action_time_gap70_first,action_time_gap70_last,action_time_gap70_max,action_time_gap70_min,action_time_gap70_mean,action_time_gap70_std,action_time_gap70_sem,action_time_gap70_skew,action_time_gap70_kurt,action_time_gap70_pct_05,action_time_gap70_pct_10,action_time_gap70_pct_25,action_time_gap70_pct_50,action_time_gap70_pct_75,action_time_gap70_pct_90,action_time_gap70_pct_95,cursor_position_change70_first,cursor_position_change70_last,cursor_position_change70_max,cursor_position_change70_mean,cursor_position_change70_std,cursor_position_change70_sem,cursor_position_change70_skew,cursor_position_change70_kurt,word_count_change70_max,word_count_change70_mean,word_count_change70_std,word_count_change70_sum,word_count_change70_sem,word_count_change70_skew,word_count_change70_kurt,word_count_change70_pct_05,word_count_change70_pct_10,word_count_change70_pct_25,word_count_change70_pct_50,word_count_change70_pct_75,word_count_change70_pct_90,word_count_change70_pct_95,action_time_gap100_first,action_time_gap100_last,action_time_gap100_max,action_time_gap100_min,action_time_gap100_mean,action_time_gap100_std,action_time_gap100_sem,action_time_gap100_skew,action_time_gap100_kurt,action_time_gap100_pct_05,action_time_gap100_pct_10,action_time_gap100_pct_25,action_time_gap100_pct_50,action_time_gap100_pct_75,action_time_gap100_pct_90,action_time_gap100_pct_95,cursor_position_change100_first,cursor_position_change100_last,cursor_position_change100_max,cursor_position_change100_mean,cursor_position_change100_std,cursor_position_change100_sem,cursor_position_change100_skew,cursor_position_change100_kurt,word_count_change100_max,word_count_change100_m
ean,word_count_change100_std,word_count_change100_sum,word_count_change100_sem,word_count_change100_skew,word_count_change100_kurt,word_count_change100_pct_05,word_count_change100_pct_10,word_count_change100_pct_25,word_count_change100_pct_50,word_count_change100_pct_75,word_count_change100_pct_90,word_count_change100_pct_95,input_word_count,input_word_length_mean,input_word_length_pct_5,input_word_length_pct_10,input_word_length_pct_25,input_word_length_pct_50,input_word_length_pct_75,input_word_length_pct_90,input_word_length_pct_95,input_word_length_max,input_word_length_std,word_time_ratio,word_event_ratio,event_time_ratio,idle_time_ratio
0,001519c8,1528,4.859375,2.515004,1,11,9,9,0.157188,3.0,4.0,7.0,0.670129,-0.558174,1244,122,89,66,46,32,12.0,6.0,0.0,106.142857,41.12805,31,196,31,89,10.991934,75.5,119.5,126.0,0.131863,0.843849,1486,18.285714,6.497675,6,29,6,16,1.736577,12.25,21.0,22.0,-0.506007,-0.526754,256,13,12,10.0,9.0,508.0,134.208793,390,654,390,480,77.485483,435.0,480.0,567.0,0.897971,,1524,1.0,0.0,1,1,1,1,0.0,1.0,1.0,1.0,0.0,,3,89.666667,20.744477,71,112,71,86,11.976829,78.5,86.0,99.0,0.770543,,269,0.169198,-1.756541,-5.106812,-36.970775,-inf,-69.517747,2010,417,120,7,0,3,0.243959,-1.879485,-1.760799,-11.44944,-26.077252,-10.516642,-30.089037,-18.815452,-22.637968,-inf,-inf,-32.294049,-inf,-38.068379,-inf,-inf,1619,357,417,27,2,92,2,21,12,0,0,4,0,3,0,0,0.244012,-1.879485,-1.760799,-11.44944,-26.077252,-10.516642,-30.089037,-18.815452,-22.637968,-inf,-inf,-32.294049,-inf,-38.068379,-inf,-inf,1619,357,417,27,2,92,2,21,12,0,0,4,0,3,0,0,0.229874,-1.384763,-5.098068,-16.922891,-21.158914,-31.132346,-33.793537,-inf,-inf,-inf,-54.538613,-inf,-inf,-inf,-inf,1940,436,120,28,14,4,5,0,0,0,1,0,0,0,0,37,154.136,0.0,0.062,4.526,114,51,30,21,103,2557,4557,1801969,1801969,4526,1801877,1801877,2259,116.246774,91.797374,1.815369,12.546928,218.650155,34.0,64.6,90.0,112.0,135.0,155.0,169.0,7,12,12,17,1539,256,1.0,18892.0,154136.0,-142.0,586.932707,4294.022274,84.934448,24.491833,768.760112,-70.25,-52.0,-8.0,51.0,142.5,707.0,1946.5,0.0,492.0,591.0,0.409233,43.378153,0.858007,-18.285143,608.174895,1.0,0.099765,0.406195,255.0,0.008034,0.632905,2.96775,0.0,0.0,0.0,0.0,0.0,1.0,1.0,102014.0,19736.0,162192.0,-79.0,1283.219961,6197.624733,122.61111,17.479471,392.240563,33.0,60.0,131.0,234.0,568.5,2121.0,5088.7,0.0,493.0,597.0,0.626223,53.91093,1.06655,-15.131542,375.742784,1.0,0.199609,0.51531,510.0,0.010195,0.136183,0.602218,0.0,0.0,0.0,0.0,0.5,1.0,1.0,102129.0,19886.0,165317.0,46.0,1939.779561,7370.623958,145.84576,14.865277,287.859583,135.65,190.3,303.0,481.0,1015.75,3815.4,7287.35,1.0,492.0,591.0,0.842991,63.5031
14,1.256564,-13.329804,271.593931,2.0,0.29953,0.594352,765.0,0.011761,-0.291329,-0.033739,-1.0,0.0,0.0,0.0,1.0,1.0,1.0,102739.0,20179.0,165599.0,110.0,3253.976097,9299.583212,184.087013,11.771157,183.305715,402.2,494.0,684.75,1035.0,2401.5,7177.9,10793.35,3.0,490.0,596.0,1.277038,79.452964,1.572786,-10.976369,173.259123,2.0,0.498824,0.757038,1273.0,0.014986,-0.402917,0.366909,-1.0,0.0,0.0,1.0,1.0,1.0,2.0,103451.0,27377.0,168452.0,276.0,6538.979976,13017.275984,257.932246,8.286051,92.182467,1240.8,1412.0,1809.0,2725.0,6341.0,12871.0,27178.6,8.0,496.0,652.0,2.354927,108.44187,2.148733,-7.901244,90.572081,3.0,0.99843,1.084779,2543.0,0.021494,-0.639853,0.335147,-1.0,0.0,0.0,1.0,2.0,2.0,2.0,106188.0,40876.0,178695.0,1979.0,13109.624359,18607.603986,369.428351,5.771598,44.79899,3124.6,3513.6,4361.0,7554.0,13513.0,30417.4,38731.0,16.0,512.0,694.0,4.466693,154.938867,3.076098,-5.288089,42.407429,5.0,1.997635,1.596771,5068.0,0.031702,-0.555623,-0.076156,-1.0,0.0,1.0,2.0,3.0,4.0,4.0,110354.0,73131.0,221383.0,8315.0,32731.639809,31717.064031,633.455063,3.848053,18.609379,10369.6,11521.4,14629.5,22988.0,43677.0,55275.4,78287.5,44.0,576.0,966.0,10.034304,242.436498,4.841956,-2.569592,14.013855,11.0,5.012764,2.70641,12567.0,0.054053,-0.212476,-0.789717,1.0,1.0,3.0,5.0,7.0,8.0,9.0,147471.0,251223.0,260749.0,12238.0,44830.490149,37046.847871,742.870939,3.52098,15.717376,16121.5,17156.2,24192.5,34251.0,53257.5,71069.2,101571.9,54.0,-401.0,1150.0,19.911942,258.591505,5.18533,-1.785836,11.677902,14.0,7.014877,3.257052,17446.0,0.065311,-0.190569,-0.83662,2.0,3.0,5.0,7.0,10.0,11.0,12.0,156792.0,286636.0,321201.0,21001.0,62090.676435,43489.01744,877.358357,3.236761,13.668688,24191.4,26825.6,36312.0,55594.0,69363.0,101994.4,131117.6,66.0,-254.0,1243.0,35.923891,254.052084,5.125311,-1.138021,13.291906,19.0,10.060643,3.980918,24719.0,0.080312,-0.228903,-0.788893,3.0,5.0,7.0,10.0,14.0,15.0,16.0,366,5.325137,2.0,2.0,3.0,4.0,7.0,10.0,12.0,20,3.487804,0.000142,0.100117,0.001419,0.000326
1,0022f953,1675,4.040248,2.243332,1,15,4,3,0.124822,3.0,4.0,5.0,1.761347,4.523647,1305,102,60,33,20,14,13.0,8.0,5.0,107.666667,64.713287,19,226,19,143,16.708899,56.5,92.0,149.5,0.508048,-0.726829,1615,21.666667,12.66416,3,45,3,30,3.269872,12.0,20.0,31.0,0.391857,-0.935036,325,11,11,9.0,7.0,278.166667,98.554384,176,462,240,284,40.234659,228.75,261.0,283.5,1.548347,3.107505,1669,1.0,0.0,1,1,1,1,0.0,1.0,1.0,1.0,0.0,0.0,6,59.166667,20.370731,37,96,53,60,8.316316,47.75,56.5,62.25,1.299614,2.342703,355,0.170232,-2.687793,-3.145141,-51.340565,-64.975368,-inf,1938,260,254,1,1,0,0.227716,-1.612006,-2.682327,-7.17414,-12.590134,-12.550744,-14.218078,-20.248839,-19.489984,-31.962529,-34.791406,-29.61868,-inf,-37.758039,-inf,-inf,1490,391,260,97,46,56,49,15,21,3,2,6,0,3,0,0,0.227766,-1.612006,-2.682327,-7.17414,-12.590134,-12.550744,-14.218078,-20.248839,-19.489984,-31.962529,-34.791406,-29.61868,-inf,-37.758039,-inf,-inf,1490,391,260,97,46,56,49,15,21,3,2,6,0,3,0,0,0.199905,-1.331957,-3.143119,-18.867477,-18.252833,-27.720478,-34.986396,-37.585086,-37.768031,-44.40242,-inf,-inf,-inf,-inf,-inf,1698,432,254,18,24,7,4,6,6,3,0,0,0,0,0,53,145.899,0.0,0.061,30.623,141,37,13,19,61,2454,30853,1788969,1788969,30623,1788842,1788842,1758,112.221271,55.431189,1.118966,10.902216,318.756986,31.0,62.3,92.0,115.0,136.0,160.0,179.0,5,17,17,12,1676,323,1696.0,449.0,145899.0,-166.0,604.547493,4897.303641,98.879954,19.60882,479.056954,-53.0,-34.8,0.0,52.0,132.0,547.8,1083.8,0.0,87.0,1336.0,0.619242,85.35033,1.723282,-4.433082,175.538653,1.0,0.130453,0.391725,320.0,0.007909,1.136049,2.058785,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1853.0,115060.0,155308.0,-77.0,1320.880506,7175.684563,144.911585,13.687505,232.234765,49.0,78.0,133.75,240.0,498.25,1174.4,3157.6,1.0,131.0,1337.0,1.203507,110.059461,2.22263,-3.953338,106.91466,1.0,0.261011,0.500898,640.0,0.010116,0.373224,-0.409225,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1973.0,115062.0,158094.0,53.0,1990.973072,8855.33502,178.868284,11.241776,155.053206,109.0,184.0,282.0,44
3.0,884.5,1977.0,5775.0,2.0,132.0,1336.0,1.76989,122.537045,2.475117,-4.18148,83.325999,2.0,0.391269,0.567764,959.0,0.011468,-0.109043,-0.678268,0.0,0.0,0.0,0.0,1.0,1.0,1.0,2157.0,115128.0,159773.0,124.0,3332.646795,11835.513938,239.162304,8.844123,93.621343,309.0,443.8,609.0,938.0,1608.0,4097.2,13640.6,4.0,134.0,1222.0,2.901592,133.338415,2.694393,-4.251228,61.9311,2.0,0.652103,0.726931,1597.0,0.014689,-0.317981,0.38534,0.0,0.0,0.0,1.0,1.0,1.0,2.0,3333.0,115301.0,219032.0,283.0,6695.138707,18659.101143,377.433216,7.076348,59.560797,958.15,1194.0,1589.0,2199.5,3627.75,11570.2,27413.8,8.0,139.0,1587.0,5.724632,174.783718,3.535496,-2.430616,35.201048,4.0,1.304828,1.120257,3189.0,0.02266,-0.700494,0.568242,0.0,0.0,1.0,2.0,2.0,3.0,3.0,6052.0,115634.0,253124.0,616.0,13451.898521,30098.689949,610.080728,5.640373,36.229759,2581.6,2908.1,3628.25,4967.0,8347.25,31326.6,51164.75,18.0,149.0,1596.0,11.347987,236.771582,4.799205,-1.094288,17.886358,6.0,2.612983,1.746046,6360.0,0.035391,-0.78458,0.046971,0.0,0.0,2.0,3.0,4.0,4.0,5.0,19037.0,121513.0,502717.0,1643.0,33936.604825,62571.107753,1276.164361,5.022745,28.436454,8171.05,8811.0,10753.0,14960.0,26505.0,64118.4,94111.95,21.0,161.0,1305.0,28.244592,318.025206,6.486259,-0.912221,6.677677,13.0,6.601913,3.060088,15871.0,0.062412,-0.604034,-0.380291,0.0,2.0,5.0,7.0,9.0,10.0,11.0,22917.0,123705.0,554442.0,4493.0,47759.088087,79768.849484,1633.729694,4.473691,21.759525,12227.6,13453.7,16215.0,22368.5,47204.0,84710.0,133379.45,38.0,143.0,1285.0,39.704698,346.132663,7.089073,-0.677646,4.493329,17.0,9.298658,3.811293,22168.0,0.078058,-0.762684,0.168421,1.15,4.0,7.0,10.0,12.0,13.0,14.0,35189.0,177594.0,574521.0,11720.0,68495.383602,101911.770978,2100.492424,3.720808,13.973044,18688.8,20903.9,25557.0,31816.5,66943.75,137407.0,281858.4,59.0,45.0,1283.0,57.782923,357.528957,7.36899,-0.366026,3.043631,23.0,13.378505,4.799762,31493.0,0.098927,-0.897582,0.516612,4.0,6.0,11.0,14.0,17.0,19.0,20.0,385,4.41039,2.0,2.0,3.0,4.0,5.0,7.0,10.0,33,3.1
99496,0.000181,0.131622,0.001372,0.000338
2,0042269b,2587,5.279412,2.798801,1,13,11,3,0.138561,3.0,5.0,8.0,0.621751,-0.561844,2154,210,160,133,103,66,31.0,25.0,6.0,133.842105,33.480115,73,189,139,161,7.680865,108.0,139.0,161.0,-0.220844,-0.756307,2543,21.473684,5.263801,12,29,21,26,1.207599,17.5,21.0,26.5,-0.24256,-1.171619,408,19,19,18.0,17.0,429.5,101.087586,296,568,491,296,41.268834,356.75,444.5,483.5,-0.103767,-0.970266,2577,1.0,0.0,1,1,1,1,0.0,1.0,1.0,1.0,0.0,0.0,6,68.333333,16.966634,45,88,79,45,6.926599,55.5,73.5,78.75,-0.502908,-1.536764,410,0.186582,-2.683889,-5.363753,-40.598648,-inf,-inf,3515,439,175,7,0,0,0.290853,-1.964918,-2.688546,-11.810976,-23.454807,-11.158253,-inf,-21.190271,-21.751644,-inf,-inf,-26.573969,-inf,-inf,-inf,-inf,2904,552,439,39,6,129,0,21,23,0,0,17,0,0,0,0,0.290687,-1.962567,-2.685922,-11.80707,-23.449673,-11.152772,-inf,-21.184275,-21.745356,-inf,-inf,-26.566804,-inf,-inf,-inf,-inf,2899,552,439,39,6,129,0,21,23,0,0,17,0,0,0,0,0.241289,-1.634461,-5.35775,-20.194379,-20.469189,-23.902627,-inf,-49.723944,-55.188262,-inf,-inf,-48.748827,-inf,-inf,-inf,3257,615,175,23,26,23,0,2,1,0,0,4,0,0,0,47,153.886,0.0,0.04,4.441,83,46,25,25,52,4136,4540,1771669,1771669,4441,1771219,1771219,3005,101.837766,82.383766,1.281007,16.611183,447.265001,49.0,57.0,74.0,94.0,119.0,140.0,155.0,4,13,18,19,2291,404,16736.0,1433.0,153886.0,-250.0,325.520435,3937.359025,61.230423,24.683659,754.926923,-70.0,-57.0,-32.0,0.0,65.0,196.0,595.7,0.0,0.0,1826.0,0.118742,72.082931,1.120972,-4.452635,411.033698,1.0,0.097703,0.658855,404.0,0.010246,-21.811366,845.370087,0.0,0.0,0.0,0.0,0.0,1.0,1.0,16795.0,1502.0,154810.0,-70.0,748.557571,5814.750602,90.436939,17.550969,372.769141,6.0,25.0,61.0,104.0,208.0,580.5,1608.7,1.0,1.0,1827.0,0.237542,97.683928,1.51928,-2.802402,231.489185,1.0,0.195452,0.916562,808.0,0.014255,-16.502835,456.84377,0.0,0.0,0.0,0.0,1.0,1.0,1.0,16993.0,1679.0,165472.0,-17.0,1171.768449,7394.192925,115.015951,14.558343,252.963359,95.0,113.0,151.0,205.0,371.0,1004.4,2783.0,2.0,2.0,1828.0,0.355916,11
3.139395,1.759872,-2.730022,176.130928,2.0,0.293008,1.102467,1211.0,0.017149,-14.067628,317.544464,0.0,0.0,0.0,0.0,1.0,1.0,1.0,17152.0,1789.0,166558.0,148.0,2018.603002,9902.209557,154.065187,11.484149,156.746404,259.0,283.0,335.0,436.0,753.0,2028.0,5457.5,4.0,4.0,1830.0,0.591382,139.868077,2.176161,-2.55657,120.773877,2.0,0.48826,1.463534,2017.0,0.022771,-12.238208,231.1494,-1.0,0.0,0.0,1.0,1.0,1.0,2.0,17555.0,2210.0,176275.0,521.0,4138.635967,14481.116966,225.443361,8.214385,79.381736,683.0,736.0,838.0,1125.0,1820.75,5844.0,11934.5,9.0,9.0,1835.0,1.172564,182.830849,2.846327,-2.241786,74.908522,4.0,0.976491,2.137726,4029.0,0.03328,-8.690061,113.788627,-1.0,0.0,1.0,1.0,2.0,2.0,2.0,18515.0,3100.0,186522.0,1316.0,8391.069728,20753.2604,323.480907,5.78194,38.758422,1645.75,1750.0,2069.0,2676.0,4888.75,13232.5,31705.25,19.0,19.0,1845.0,2.302721,255.122049,3.976585,-1.601429,37.473548,6.0,1.950194,3.134754,8027.0,0.048861,-5.795481,50.084341,-3.0,0.0,2.0,3.0,3.0,4.0,4.0,27937.0,6007.0,192918.0,3948.0,21238.530103,32479.904942,508.119156,3.444244,13.082416,4934.5,5258.5,6085.5,8988.0,17887.25,60834.5,82182.5,48.0,49.0,1874.0,5.437102,378.628099,5.923299,-1.121043,14.94592,12.0,4.848262,5.022347,19810.0,0.07857,-3.139221,14.682307,-3.0,-1.0,4.0,6.0,8.0,9.0,9.0,34212.0,8806.0,196496.0,5631.0,29839.568864,37854.310535,593.651615,2.769756,8.096371,7369.25,7922.0,9265.25,13632.0,30779.0,76396.0,101446.75,65.0,61.0,1894.0,7.326365,416.10938,6.525651,-1.125418,11.037857,15.0,6.773242,5.832279,27540.0,0.091465,-2.397288,8.522583,-3.0,0.0,5.0,8.0,10.0,12.0,13.0,93473.0,105251.0,207540.0,9783.0,42290.111992,44099.565913,694.158644,2.216486,4.748659,11444.75,12027.5,15167.25,23375.0,48520.0,90628.0,125291.0,57.0,63.0,1924.0,10.047324,461.767997,7.268558,-1.081837,7.724098,20.0,9.679386,6.931371,39066.0,0.109105,-2.03424,6.40513,-2.0,2.0,7.0,11.0,15.0,16.0,17.0,627,5.446571,2.0,2.0,3.0,4.0,7.0,10.0,12.0,25,3.474895,0.000228,0.097679,0.002335,0.000184
3,0059420b,1154,4.490385,2.653938,1,13,2,8,0.184017,2.0,4.0,6.0,1.334632,2.006274,934,81,61,41,17,15,12.0,9.0,8.0,86.846154,33.195999,39,144,99,80,9.206914,62.0,80.0,99.0,0.348766,-0.952509,1129,16.0,6.493587,7,27,17,14,1.800997,11.0,15.0,18.0,0.656055,-0.538051,208,12,10,8.0,3.0,384.0,56.471232,347,449,347,356,32.603681,351.5,356.0,402.5,1.682689,,1152,1.0,0.0,1,1,1,1,0.0,1.0,1.0,1.0,0.0,,3,69.333333,10.214369,62,81,62,65,5.897269,63.5,65.0,73.0,1.565482,,208,0.183464,-2.877375,-4.352001,-47.903508,-60.625511,-inf,1304,151,99,1,1,0,0.271279,-1.645266,-2.851872,-6.840997,-inf,-15.611398,-inf,-18.68781,-27.210578,-inf,-inf,-31.006517,-36.740785,-37.406717,-inf,-inf,1038,243,152,68,0,18,0,13,3,0,0,3,2,2,0,0,0.271339,-1.645266,-2.851872,-6.840997,-inf,-15.611398,-inf,-18.68781,-27.210578,-inf,-inf,-31.006517,-36.740785,-37.406717,-inf,-inf,1038,243,152,68,0,18,0,13,3,0,0,3,2,2,0,0,0.219782,-1.284918,-4.348812,-18.237727,-26.42331,-28.311224,-33.899103,-inf,-inf,-inf,-inf,-38.90166,-inf,-inf,-inf,1146,281,99,13,3,4,3,0,0,0,0,5,0,0,0,18,101.69,0.0,0.131,41.395,178,81,34,32,55,1556,41513,1404469,1404469,41395,1404394,1404394,806,121.848329,113.768226,2.884139,2.426886,7.24904,0.0,1.0,77.0,110.0,142.25,175.0,449.25,5,15,15,10,1047,206,101690.0,453.0,101690.0,-516.0,754.648232,4242.152639,107.577413,18.161553,396.896635,-45.0,-8.8,19.0,131.0,487.5,1236.6,2257.7,0.0,1.0,100.0,0.51254,10.014035,0.253947,-29.384865,1059.896795,1.0,0.132476,0.404853,206.0,0.010267,0.982216,1.829712,0.0,0.0,0.0,0.0,0.0,1.0,1.0,105909.0,675.0,143449.0,1.0,1566.419562,5982.696008,151.764931,15.876355,313.97074,60.0,134.6,304.0,583.5,1231.5,2525.1,4270.5,0.0,2.0,100.0,1.025097,14.134029,0.358542,-20.828134,530.061271,1.0,0.265122,0.504181,412.0,0.01279,0.34739,-0.438832,0.0,0.0,0.0,0.0,1.0,1.0,1.0,106114.0,706.0,147541.0,2.0,2376.37669,7513.130715,190.649301,14.189712,243.137315,271.0,406.4,644.0,1107.0,1956.0,3803.8,6238.4,0.0,3.0,101.0,1.537669,16.880608,0.428354,-18.257973,386.495793,2.0,0.39793
9,0.578877,618.0,0.014689,-0.155443,-0.47527,0.0,0.0,0.0,0.0,1.0,1.0,1.0,106174.0,1219.0,173922.0,135.0,3999.096712,10015.872168,254.321382,12.206622,178.347829,850.5,1026.0,1456.0,2084.0,3515.0,6297.0,12240.0,0.0,5.0,103.0,2.562863,21.351673,0.542158,-14.918584,247.381901,2.0,0.664088,0.716618,1030.0,0.018196,-0.372247,0.607715,0.0,0.0,0.0,1.0,1.0,1.0,2.0,106330.0,4021.0,176195.0,324.0,8068.27749,14426.535321,366.908099,9.081201,96.473412,2518.5,2881.0,3637.5,4962.0,7171.75,14995.5,22186.0,0.0,8.0,104.0,5.131307,29.650045,0.754086,-11.096493,130.491932,4.0,1.331177,1.013321,2058.0,0.025772,-0.731447,1.434412,0.0,0.0,1.0,1.0,2.0,2.0,3.0,118745.0,8069.0,181450.0,911.0,16189.547526,20459.662791,522.038897,6.40129,46.762671,6266.5,6969.0,8730.5,10770.0,16147.25,28304.5,36005.0,0.0,16.0,112.0,10.276042,41.70224,1.064054,-7.852762,63.57903,6.0,2.669922,1.510366,4101.0,0.038538,-1.246582,3.933125,0.0,1.0,2.0,3.0,4.0,4.0,5.0,122941.0,22866.0,201615.0,14516.0,40953.044489,32311.962989,832.627703,3.623259,14.240499,18218.0,20978.0,24792.0,33249.0,42720.0,62773.5,84054.0,2.0,36.0,142.0,25.857902,64.639535,1.665658,-4.836158,22.642175,12.0,6.735724,2.436273,10144.0,0.062779,-1.637996,5.234524,3.0,4.0,6.0,7.0,8.0,9.0,10.0,139644.0,29757.0,214418.0,23617.0,57743.335801,38224.562008,991.592226,2.834301,8.164326,28659.75,31961.5,36728.0,47406.0,58848.75,94432.5,130740.5,18.0,54.0,160.0,36.168237,76.007557,1.97173,-3.980769,14.690427,15.0,9.453567,2.784311,14048.0,0.072228,-1.357905,3.092786,4.0,7.0,8.0,10.0,11.0,12.0,13.0,160876.0,65122.0,241697.0,38303.0,83031.009615,45828.322027,1201.028377,2.137229,3.987171,43527.75,47534.0,56266.5,68752.5,88167.0,141336.5,221794.25,40.0,59.0,186.0,51.553571,91.42003,2.395856,-3.194528,8.753236,21.0,13.580357,3.264365,19773.0,0.08555,-1.107058,2.150235,6.0,10.0,12.0,14.0,16.0,17.0,18.0,251,4.609562,1.5,2.0,2.0,4.0,6.0,7.0,10.5,19,2.949601,0.000147,0.132391,0.001108,0.000537
4,0075873a,1425,4.509804,2.495449,1,12,11,11,0.156271,3.0,4.0,5.0,1.214738,0.965899,1150,95,61,48,33,21,20.0,12.0,3.0,86.8125,44.09417,22,182,75,22,11.023543,60.0,74.0,106.25,1.031203,0.688993,1389,15.9375,8.667708,3,35,11,3,2.166927,11.0,12.5,18.25,1.148513,0.888421,255,14,13,8.0,5.0,283.4,232.336609,23,627,351,23,103.90409,124.0,292.0,351.0,0.636813,0.237666,1417,1.0,0.0,1,1,1,1,0.0,1.0,1.0,1.0,0.0,0.0,5,51.2,41.829415,3,114,61,3,18.706683,26.0,52.0,61.0,0.68676,0.722916,256,0.163807,-1.270332,-6.348374,-inf,-inf,-inf,1942,517,72,0,0,0,0.226735,-2.04634,-1.271456,-10.228055,-inf,-15.105623,-inf,-18.310694,-18.982904,-inf,-inf,-26.809254,-inf,-26.506522,-inf,-inf,1541,324,517,39,0,33,0,23,24,0,0,10,0,17,0,0,0.226785,-2.04634,-1.271456,-10.228055,-inf,-15.105623,-inf,-18.310694,-18.982904,-inf,-inf,-26.809254,-inf,-26.506522,-inf,-inf,1541,324,517,39,0,33,0,23,24,0,0,10,0,17,0,0,0.235876,-1.54218,-6.348374,-16.246982,-18.206974,-24.826274,-23.358167,-inf,-inf,-47.804364,-inf,-50.427096,-inf,-inf,-inf,1964,397,72,32,25,12,25,0,0,2,0,2,0,0,0,66,110.688,0.0,0.059,78.47,65,24,11,17,71,2531,78693,1662472,1662472,78470,1662390,1662390,701,123.943896,62.082013,1.234013,1.89064,12.709323,28.0,30.0,100.0,129.0,151.5,175.0,193.0,3,11,11,9,1402,252,15539.0,476.0,110688.0,-158.0,502.094862,3896.209237,77.460806,18.182211,416.949167,-62.0,-46.0,-5.0,36.0,128.0,362.1,929.8,0.0,1.0,468.0,0.55415,24.406657,0.48523,-6.667423,351.611662,1.0,0.099605,0.398147,252.0,0.007916,0.813247,2.520214,0.0,0.0,0.0,0.0,0.0,1.0,1.0,15761.0,8227.0,111232.0,-77.0,1122.175563,5619.567427,111.745096,12.684901,199.501664,29.0,31.0,121.0,225.0,386.0,1131.4,3947.2,1.0,2.0,469.0,1.108343,32.152994,0.639362,-6.005992,201.102794,1.0,0.199288,0.527548,504.0,0.01049,0.170817,-0.024888,-1.0,0.0,0.0,0.0,1.0,1.0,1.0,16102.0,9120.0,112046.0,43.0,1739.592959,6949.770563,138.223515,10.209815,128.380038,60.0,66.0,277.75,415.0,713.5,2359.7,6475.0,2.0,3.0,470.0,1.662184,38.385933,0.763455,-5.351486,140.118814,2.0,0.29
8655,0.618126,755.0,0.012294,-0.22707,-0.558274,-1.0,0.0,0.0,0.0,1.0,1.0,1.0,16539.0,10249.0,112576.0,104.0,2974.63658,9011.742043,179.304867,7.750575,73.463272,120.0,388.0,610.0,819.5,1549.75,5577.5,11665.75,4.0,1.0,497.0,2.769596,48.882417,0.972604,-4.115128,86.428131,2.0,0.496833,0.811666,1255.0,0.01615,-0.483424,-0.089898,-1.0,-1.0,0.0,1.0,1.0,1.0,2.0,17693.0,52426.0,113681.0,260.0,6061.144784,12766.098911,254.256335,5.228734,32.701189,278.0,1128.0,1502.0,2067.0,4636.0,12600.0,25307.0,9.0,498.0,500.0,5.341928,67.734605,1.349038,-2.328305,42.162462,4.0,0.99405,1.33198,2506.0,0.026528,-0.824005,0.148759,-2.0,-1.0,0.0,1.0,2.0,2.0,3.0,25881.0,56924.0,122176.0,559.0,12098.256073,17761.080474,354.442691,3.487648,13.797721,2183.0,2855.0,3452.5,5833.0,11844.0,27009.0,47504.5,18.0,502.0,502.0,8.900438,85.327303,1.702804,-2.077593,19.715236,6.0,1.994026,2.262716,5007.0,0.045155,-1.003232,0.466466,-3.0,-1.0,1.0,3.0,4.0,4.0,5.0,38499.0,70441.0,182071.0,1450.0,30195.106006,29821.932561,598.718119,2.453739,6.943829,8058.0,8893.0,12329.0,18727.0,34418.0,67862.0,91458.0,47.0,449.0,451.0,20.208384,117.154601,2.352047,-1.324722,6.285687,12.0,4.992745,4.600748,12387.0,0.092367,-1.122135,1.070104,-3.0,-1.0,2.0,6.0,9.0,10.0,10.0,55208.0,75315.0,218854.0,2049.0,42354.323852,36955.120674,744.935752,2.305364,6.001206,12464.0,13745.0,18203.0,32253.0,51286.0,78033.0,125497.0,63.0,467.0,471.0,27.604632,122.452268,2.468374,-1.238416,4.798844,16.0,6.975213,6.028236,17166.0,0.121516,-1.176047,1.400926,-3.0,0.0,3.0,9.0,12.0,13.0,14.0,69030.0,82198.0,281981.0,2947.0,60674.779103,45517.876019,923.186636,2.203217,5.75264,18617.0,21733.0,30511.0,49609.0,69145.0,109887.0,157012.0,63.0,495.0,499.0,38.447964,132.4657,2.686649,-1.21137,3.217934,22.0,9.937063,8.019713,24157.0,0.162655,-1.155432,1.529682,-4.0,0.0,5.0,12.0,16.0,18.0,19.0,412,4.76699,1.0,2.0,3.0,4.0,6.0,9.0,11.0,18,2.986064,0.000152,0.099565,0.001522,0.000302


In [27]:
# Preview the first rows of the engineered test feature frame.
test_feats.head()

Unnamed: 0,id,essay_len,word_len_mean,word_len_std,word_len_min,word_len_max,word_len_first,word_len_last,word_len_sem,word_len_q1,word_len_median,word_len_q3,word_len_skew,word_len_kurt,word_len_sum,word_len_ge_5_count,word_len_ge_6_count,word_len_ge_7_count,word_len_ge_8_count,word_len_ge_9_count,word_len_ge_10_count,word_len_ge_11_count,word_len_ge_12_count,sent_len_mean,sent_len_std,sent_len_min,sent_len_max,sent_len_first,sent_len_last,sent_len_sem,sent_len_q1,sent_len_median,sent_len_q3,sent_len_skew,sent_len_kurt,sent_len_sum,sent_word_count_mean,sent_word_count_std,sent_word_count_min,sent_word_count_max,sent_word_count_first,sent_word_count_last,sent_word_count_sem,sent_word_count_q1,sent_word_count_median,sent_word_count_q3,sent_word_count_skew,sent_word_count_kurt,sent_word_count_sum,sent_len_ge_50_count,sent_len_ge_60_count,sent_len_ge_75_count,sent_len_ge_100_count,paragraph_len_mean,paragraph_len_std,paragraph_len_min,paragraph_len_max,paragraph_len_first,paragraph_len_last,paragraph_len_sem,paragraph_len_q1,paragraph_len_median,paragraph_len_q3,paragraph_len_skew,paragraph_len_kurt,paragraph_len_sum,paragraph_sent_count_mean,paragraph_sent_count_std,paragraph_sent_count_min,paragraph_sent_count_max,paragraph_sent_count_first,paragraph_sent_count_last,paragraph_sent_count_sem,paragraph_sent_count_q1,paragraph_sent_count_median,paragraph_sent_count_q3,paragraph_sent_count_skew,paragraph_sent_count_kurt,paragraph_sent_count_sum,paragraph_word_count_mean,paragraph_word_count_std,paragraph_word_count_min,paragraph_word_count_max,paragraph_word_count_first,paragraph_word_count_last,paragraph_word_count_sem,paragraph_word_count_q1,paragraph_word_count_median,paragraph_word_count_q3,paragraph_word_count_skew,paragraph_word_count_kurt,paragraph_word_count_sum,activity_Input_tfidf_count,activity_Remove/Cut_tfidf_count,activity_Nonproduction_tfidf_count,activity_Replace_tfidf_count,activity_Paste_tfidf_count,activity_move_to_tfidf_count,activity_Input_normal_cou
nt,activity_Remove/Cut_normal_count,activity_Nonproduction_normal_count,activity_Replace_normal_count,activity_Paste_normal_count,activity_move_to_normal_count,down_event_q_tfidf_count,down_event_Space_tfidf_count,down_event_Backspace_tfidf_count,down_event_Shift_tfidf_count,down_event_ArrowRight_tfidf_count,down_event_Leftclick_tfidf_count,down_event_ArrowLeft_tfidf_count,down_event_._tfidf_count,"down_event_,_tfidf_count",down_event_ArrowDown_tfidf_count,down_event_ArrowUp_tfidf_count,down_event_Enter_tfidf_count,down_event_CapsLock_tfidf_count,down_event_'_tfidf_count,down_event_Delete_tfidf_count,down_event_Unidentified_tfidf_count,down_event_q_normal_count,down_event_Space_normal_count,down_event_Backspace_normal_count,down_event_Shift_normal_count,down_event_ArrowRight_normal_count,down_event_Leftclick_normal_count,down_event_ArrowLeft_normal_count,down_event_._normal_count,"down_event_,_normal_count",down_event_ArrowDown_normal_count,down_event_ArrowUp_normal_count,down_event_Enter_normal_count,down_event_CapsLock_normal_count,down_event_'_normal_count,down_event_Delete_normal_count,down_event_Unidentified_normal_count,up_event_q_tfidf_count,up_event_Space_tfidf_count,up_event_Backspace_tfidf_count,up_event_Shift_tfidf_count,up_event_ArrowRight_tfidf_count,up_event_Leftclick_tfidf_count,up_event_ArrowLeft_tfidf_count,up_event_._tfidf_count,"up_event_,_tfidf_count",up_event_ArrowDown_tfidf_count,up_event_ArrowUp_tfidf_count,up_event_Enter_tfidf_count,up_event_CapsLock_tfidf_count,up_event_'_tfidf_count,up_event_Delete_tfidf_count,up_event_Unidentified_tfidf_count,up_event_q_normal_count,up_event_Space_normal_count,up_event_Backspace_normal_count,up_event_Shift_normal_count,up_event_ArrowRight_normal_count,up_event_Leftclick_normal_count,up_event_ArrowLeft_normal_count,up_event_._normal_count,"up_event_,_normal_count",up_event_ArrowDown_normal_count,up_event_ArrowUp_normal_count,up_event_Enter_normal_count,up_event_CapsLock_normal_count,up_event_'_normal_count,
up_event_Delete_normal_count,up_event_Unidentified_normal_count,text_change_q_tfidf_count,text_change_space_tfidf_count,text_change_NoChange_tfidf_count,text_change_full_stop_tfidf_count,text_change_comma_tfidf_count,text_change_newline_tfidf_count,text_change_single_quote_tfidf_count,text_change_double_quote_tfidf_count,text_change_dash_tfidf_count,text_change_question_mark_tfidf_count,text_change_semicolon_tfidf_count,text_change_equals_tfidf_count,text_change_slash_tfidf_count,text_change_double_backslash_tfidf_count,text_change_colon_tfidf_count,text_change_q_normal_count,text_change_space_normal_count,text_change_NoChange_normal_count,text_change_full_stop_normal_count,text_change_comma_normal_count,text_change_newline_normal_count,text_change_single_quote_normal_count,text_change_double_quote_normal_count,text_change_dash_normal_count,text_change_question_mark_normal_count,text_change_semicolon_normal_count,text_change_equals_normal_count,text_change_slash_normal_count,text_change_double_backslash_normal_count,text_change_colon_normal_count,punct_cnt,largest_lantency,smallest_lantency,median_lantency,initial_pause,pauses_half_sec,pauses_1_sec,pauses_1_half_sec,pauses_2_sec,pauses_3_sec,event_id_max,up_time_first,up_time_last,up_time_max,down_time_first,down_time_last,down_time_max,action_time_max,action_time_mean,action_time_std,action_time_sem,action_time_skew,action_time_kurt,action_time_pct_05,action_time_pct_10,action_time_pct_25,action_time_pct_50,action_time_pct_75,action_time_pct_90,action_time_pct_95,activity_nunique,down_event_nunique,up_event_nunique,text_change_nunique,cursor_position_max,word_count_max,action_time_gap1_first,action_time_gap1_last,action_time_gap1_max,action_time_gap1_min,action_time_gap1_mean,action_time_gap1_std,action_time_gap1_sem,action_time_gap1_skew,action_time_gap1_kurt,action_time_gap1_pct_05,action_time_gap1_pct_10,action_time_gap1_pct_25,action_time_gap1_pct_50,action_time_gap1_pct_75,action_time_gap1_pct_90,action_time_g
ap1_pct_95,cursor_position_change1_first,cursor_position_change1_last,cursor_position_change1_max,cursor_position_change1_mean,cursor_position_change1_std,cursor_position_change1_sem,cursor_position_change1_skew,cursor_position_change1_kurt,word_count_change1_max,word_count_change1_mean,word_count_change1_std,word_count_change1_sum,word_count_change1_sem,word_count_change1_skew,word_count_change1_kurt,word_count_change1_pct_05,word_count_change1_pct_10,word_count_change1_pct_25,word_count_change1_pct_50,word_count_change1_pct_75,word_count_change1_pct_90,word_count_change1_pct_95,action_time_gap2_first,action_time_gap2_last,action_time_gap2_max,action_time_gap2_min,action_time_gap2_mean,action_time_gap2_std,action_time_gap2_sem,action_time_gap2_skew,action_time_gap2_kurt,action_time_gap2_pct_05,action_time_gap2_pct_10,action_time_gap2_pct_25,action_time_gap2_pct_50,action_time_gap2_pct_75,action_time_gap2_pct_90,action_time_gap2_pct_95,cursor_position_change2_first,cursor_position_change2_last,cursor_position_change2_max,cursor_position_change2_mean,cursor_position_change2_std,cursor_position_change2_sem,cursor_position_change2_skew,cursor_position_change2_kurt,word_count_change2_max,word_count_change2_mean,word_count_change2_std,word_count_change2_sum,word_count_change2_sem,word_count_change2_skew,word_count_change2_kurt,word_count_change2_pct_05,word_count_change2_pct_10,word_count_change2_pct_25,word_count_change2_pct_50,word_count_change2_pct_75,word_count_change2_pct_90,word_count_change2_pct_95,action_time_gap3_first,action_time_gap3_last,action_time_gap3_max,action_time_gap3_min,action_time_gap3_mean,action_time_gap3_std,action_time_gap3_sem,action_time_gap3_skew,action_time_gap3_kurt,action_time_gap3_pct_05,action_time_gap3_pct_10,action_time_gap3_pct_25,action_time_gap3_pct_50,action_time_gap3_pct_75,action_time_gap3_pct_90,action_time_gap3_pct_95,cursor_position_change3_first,cursor_position_change3_last,cursor_position_change3_max,cursor_position_change3_
mean,cursor_position_change3_std,cursor_position_change3_sem,cursor_position_change3_skew,cursor_position_change3_kurt,word_count_change3_max,word_count_change3_mean,word_count_change3_std,word_count_change3_sum,word_count_change3_sem,word_count_change3_skew,word_count_change3_kurt,word_count_change3_pct_05,word_count_change3_pct_10,word_count_change3_pct_25,word_count_change3_pct_50,word_count_change3_pct_75,word_count_change3_pct_90,word_count_change3_pct_95,action_time_gap5_first,action_time_gap5_last,action_time_gap5_max,action_time_gap5_min,action_time_gap5_mean,action_time_gap5_std,action_time_gap5_sem,action_time_gap5_skew,action_time_gap5_kurt,action_time_gap5_pct_05,action_time_gap5_pct_10,action_time_gap5_pct_25,action_time_gap5_pct_50,action_time_gap5_pct_75,action_time_gap5_pct_90,action_time_gap5_pct_95,cursor_position_change5_first,cursor_position_change5_last,cursor_position_change5_max,cursor_position_change5_mean,cursor_position_change5_std,cursor_position_change5_sem,cursor_position_change5_skew,cursor_position_change5_kurt,word_count_change5_max,word_count_change5_mean,word_count_change5_std,word_count_change5_sum,word_count_change5_sem,word_count_change5_skew,word_count_change5_kurt,word_count_change5_pct_05,word_count_change5_pct_10,word_count_change5_pct_25,word_count_change5_pct_50,word_count_change5_pct_75,word_count_change5_pct_90,word_count_change5_pct_95,action_time_gap10_first,action_time_gap10_last,action_time_gap10_max,action_time_gap10_min,action_time_gap10_mean,action_time_gap10_std,action_time_gap10_sem,action_time_gap10_skew,action_time_gap10_kurt,action_time_gap10_pct_05,action_time_gap10_pct_10,action_time_gap10_pct_25,action_time_gap10_pct_50,action_time_gap10_pct_75,action_time_gap10_pct_90,action_time_gap10_pct_95,cursor_position_change10_first,cursor_position_change10_last,cursor_position_change10_max,cursor_position_change10_mean,cursor_position_change10_std,cursor_position_change10_sem,cursor_position_change10_skew,cursor_po
sition_change10_kurt,word_count_change10_max,word_count_change10_mean,word_count_change10_std,word_count_change10_sum,word_count_change10_sem,word_count_change10_skew,word_count_change10_kurt,word_count_change10_pct_05,word_count_change10_pct_10,word_count_change10_pct_25,word_count_change10_pct_50,word_count_change10_pct_75,word_count_change10_pct_90,word_count_change10_pct_95,action_time_gap20_first,action_time_gap20_last,action_time_gap20_max,action_time_gap20_min,action_time_gap20_mean,action_time_gap20_std,action_time_gap20_sem,action_time_gap20_skew,action_time_gap20_kurt,action_time_gap20_pct_05,action_time_gap20_pct_10,action_time_gap20_pct_25,action_time_gap20_pct_50,action_time_gap20_pct_75,action_time_gap20_pct_90,action_time_gap20_pct_95,cursor_position_change20_first,cursor_position_change20_last,cursor_position_change20_max,cursor_position_change20_mean,cursor_position_change20_std,cursor_position_change20_sem,cursor_position_change20_skew,cursor_position_change20_kurt,word_count_change20_max,word_count_change20_mean,word_count_change20_std,word_count_change20_sum,word_count_change20_sem,word_count_change20_skew,word_count_change20_kurt,word_count_change20_pct_05,word_count_change20_pct_10,word_count_change20_pct_25,word_count_change20_pct_50,word_count_change20_pct_75,word_count_change20_pct_90,word_count_change20_pct_95,action_time_gap50_first,action_time_gap50_last,action_time_gap50_max,action_time_gap50_min,action_time_gap50_mean,action_time_gap50_std,action_time_gap50_sem,action_time_gap50_skew,action_time_gap50_kurt,action_time_gap50_pct_05,action_time_gap50_pct_10,action_time_gap50_pct_25,action_time_gap50_pct_50,action_time_gap50_pct_75,action_time_gap50_pct_90,action_time_gap50_pct_95,cursor_position_change50_first,cursor_position_change50_last,cursor_position_change50_max,cursor_position_change50_mean,cursor_position_change50_std,cursor_position_change50_sem,cursor_position_change50_skew,cursor_position_change50_kurt,word_count_change50_max,w
ord_count_change50_mean,word_count_change50_std,word_count_change50_sum,word_count_change50_sem,word_count_change50_skew,word_count_change50_kurt,word_count_change50_pct_05,word_count_change50_pct_10,word_count_change50_pct_25,word_count_change50_pct_50,word_count_change50_pct_75,word_count_change50_pct_90,word_count_change50_pct_95,action_time_gap70_first,action_time_gap70_last,action_time_gap70_max,action_time_gap70_min,action_time_gap70_mean,action_time_gap70_std,action_time_gap70_sem,action_time_gap70_skew,action_time_gap70_kurt,action_time_gap70_pct_05,action_time_gap70_pct_10,action_time_gap70_pct_25,action_time_gap70_pct_50,action_time_gap70_pct_75,action_time_gap70_pct_90,action_time_gap70_pct_95,cursor_position_change70_first,cursor_position_change70_last,cursor_position_change70_max,cursor_position_change70_mean,cursor_position_change70_std,cursor_position_change70_sem,cursor_position_change70_skew,cursor_position_change70_kurt,word_count_change70_max,word_count_change70_mean,word_count_change70_std,word_count_change70_sum,word_count_change70_sem,word_count_change70_skew,word_count_change70_kurt,word_count_change70_pct_05,word_count_change70_pct_10,word_count_change70_pct_25,word_count_change70_pct_50,word_count_change70_pct_75,word_count_change70_pct_90,word_count_change70_pct_95,action_time_gap100_first,action_time_gap100_last,action_time_gap100_max,action_time_gap100_min,action_time_gap100_mean,action_time_gap100_std,action_time_gap100_sem,action_time_gap100_skew,action_time_gap100_kurt,action_time_gap100_pct_05,action_time_gap100_pct_10,action_time_gap100_pct_25,action_time_gap100_pct_50,action_time_gap100_pct_75,action_time_gap100_pct_90,action_time_gap100_pct_95,cursor_position_change100_first,cursor_position_change100_last,cursor_position_change100_max,cursor_position_change100_mean,cursor_position_change100_std,cursor_position_change100_sem,cursor_position_change100_skew,cursor_position_change100_kurt,word_count_change100_max,word_count_change100_m
ean,word_count_change100_std,word_count_change100_sum,word_count_change100_sem,word_count_change100_skew,word_count_change100_kurt,word_count_change100_pct_05,word_count_change100_pct_10,word_count_change100_pct_25,word_count_change100_pct_50,word_count_change100_pct_75,word_count_change100_pct_90,word_count_change100_pct_95,input_word_count,input_word_length_mean,input_word_length_pct_5,input_word_length_pct_10,input_word_length_pct_25,input_word_length_pct_50,input_word_length_pct_75,input_word_length_pct_90,input_word_length_pct_95,input_word_length_max,input_word_length_std,word_time_ratio,word_event_ratio,event_time_ratio,idle_time_ratio
0,0000aaaa,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2.0,,2,2,2,2,,2.0,2.0,2.0,,,2,1.0,,1,1,1,1,,1.0,1.0,1.0,,,1,3.0,,3,3,3,3,,3.0,3.0,3.0,,,3,-0.154151,-inf,-inf,-inf,-inf,-inf,2,0,0,0,0,0,-inf,0.405465,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-inf,0.405465,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-inf,0.405465,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,421.555,0.0,210.7775,338.433,0,0,0,0,1,2,338518,760160,760160,338433,760073,760073,87,86.0,1.414214,1.0,,,85.1,85.2,85.5,86.0,86.5,86.8,86.9,1,1,1,1,1,0,421555.0,421555.0,421555.0,421555.0,421555.0,,,,,421555.0,421555.0,421555.0,421555.0,421555.0,421555.0,421555.0,1.0,1.0,1.0,1.0,,,,,0.0,0.0,,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,3e-06,0.554561
1,2222bbbb,2,2.0,,2.0,2.0,2.0,2.0,,2.0,2.0,2.0,,,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,,2.0,2.0,2.0,2.0,,2.0,2.0,2.0,,,2.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,,,1.0,0.0,0.0,0.0,0.0,2.0,,2,2,2,2,,2.0,2.0,2.0,,,2,1.0,,1,1,1,1,,1.0,1.0,1.0,,,1,1.0,,1,1,1,1,,1.0,1.0,1.0,,,1,-0.154151,-inf,-inf,-inf,-inf,-inf,2,0,0,0,0,0,0.405465,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.405465,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.405465,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,421.521,0.0,210.7605,711.956,0,0,0,0,1,2,712023,290548,712023,711956,290502,711956,67,56.5,14.849242,10.5,,,47.05,48.1,51.25,56.5,61.75,64.9,65.95,1,1,1,1,1,1,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,,,,,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,-421521.0,1.0,1.0,1.0,1.0,,,,,0.0,0.0,,0.0,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2,0.0,1e-06,0.5,3e-06,-0.592005
2,4444cccc,2,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,,,1.0,1.0,,1.0,1.0,1.0,1.0,,1.0,1.0,1.0,,,1.0,0.0,0.0,0.0,0.0,2.0,,2,2,2,2,,2.0,2.0,2.0,,,2,1.0,,1,1,1,1,,1.0,1.0,1.0,,,1,2.0,,2,2,2,2,,2.0,2.0,2.0,,,2,-0.154151,-inf,-inf,-inf,-inf,-inf,2,0,0,0,0,0,0.124418,0.124418,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.124418,0.124418,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.124418,0.124418,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,-inf,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,450.645,0.0,225.3225,635.547,0,0,0,0,1,2,635641,185052,635641,635547,184996,635547,94,75.0,26.870058,19.0,,,57.9,59.8,65.5,75.0,84.5,90.2,92.1,1,2,2,2,1,1,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,,,,,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,-450645.0,1.0,1.0,1.0,1.0,,,,,1.0,1.0,,1.0,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1,0.0,2e-06,0.5,3e-06,-0.708962


In [28]:
# Left-join train_scores onto the training features by 'id'; every train_feats row is kept.
# NOTE(review): train_feats is reassigned from itself, so re-running this cell without
# rebuilding train_feats merges train_scores a second time.
train_feats = train_feats.merge(train_scores, on='id', how='left')

## Split Data Into Folds

In [29]:
# Assign every training row to exactly one validation fold.
kfold = KFold(n_splits=CONFIG.num_folds, shuffle=True, random_state=CONFIG.seed)

# KFold.split yields *positional* row indices. Writing them through `.loc` (label-based)
# is only correct while train_feats carries a default RangeIndex, so the fold ids are
# collected positionally and attached as a whole column instead.
# The array starts as NaN, which keeps the column float-typed exactly as before.
fold_ids = np.full(len(train_feats), np.nan)
for fold, (_, val_idx) in enumerate(kfold.split(train_feats)):
    fold_ids[val_idx] = fold
train_feats["fold"] = fold_ids

## Modeling

In [30]:
def _zero_out_infs(x):
    """Replace +inf/-inf with 0 and leave NaN in place for the downstream imputer."""
    return np.nan_to_num(x, nan=np.nan, posinf=0, neginf=0)


def _log1p_abs(x):
    """Return log(1 + |x|) element-wise."""
    return np.log1p(np.abs(x))


class WritingQualityModel:
    """Common train / validate / predict interface over several regressors.

    The concrete estimator is selected by substring match on ``model_name``:
    'lgbm' -> LightGBM, 'cat' -> CatBoost, 'rfr' -> random forest,
    'lasso' -> Lasso wrapped in the preprocessing pipeline of ``make_pipeline``.
    """

    # Name fragments whose inputs go through ``_sanitize`` in validate/predict.
    # 'ridge' is listed here although ``create_model`` has no ridge branch.
    _SANITIZED_MODEL_KEYS = ('rfr', 'ridge', 'lasso')

    def __init__(self, model_name: str, params: dict):
        """Build the estimator for ``model_name`` using ``params`` as its keyword arguments.

        Raises:
            ValueError: if ``model_name`` matches none of the supported model keys.
        """
        self.model_name = model_name
        self.model = self.create_model(model_name, params)

    @staticmethod
    def _sanitize(X):
        """Return ``X`` as an array with NaN -> 0 (numpy default) and +/-inf -> -1."""
        return np.nan_to_num(X, posinf=-1, neginf=-1)

    def _needs_sanitizing(self) -> bool:
        """Tell whether validate/predict must sanitize inputs for this model name."""
        return any(key in self.model_name for key in self._SANITIZED_MODEL_KEYS)

    def make_pipeline(self, model: str):
        """Wrap ``model`` in inf-removal, mean-imputation, log1p(|x|) and robust-scaling steps.

        Named functions are used instead of lambdas so the resulting pipeline can be
        pickled (e.g. with joblib).

        NOTE(review): train/validate/predict already pass 'lasso' inputs through
        ``_sanitize`` (NaN -> 0, inf -> -1), so the 'remove_infs' and 'imputer' steps
        receive no NaN/inf from those code paths.
        """
        return Pipeline([
            ('remove_infs', FunctionTransformer(_zero_out_infs)),
            ('imputer', SimpleImputer(strategy='mean')),
            ('nomalizer', FunctionTransformer(_log1p_abs)),
            ('scaler', RobustScaler()),
            ('model', model)
        ])

    def create_model(self, model_name: str, params: dict):
        """Instantiate the estimator matching ``model_name``.

        Raises:
            ValueError: if ``model_name`` contains none of 'lgbm', 'cat', 'rfr', 'lasso'.
                Previously ``None`` was returned, which only failed later inside
                ``train`` with an AttributeError.
        """
        if 'lgbm' in model_name:
            return lgb.LGBMRegressor(**params)
        if 'cat' in model_name:
            return cb.CatBoostRegressor(**params)
        if 'rfr' in model_name:
            return RandomForestRegressor(**params)
        if 'lasso' in model_name:
            return self.make_pipeline(Lasso(**params))
        raise ValueError(
            f"Unsupported model_name {model_name!r}: expected it to contain one of "
            "'lgbm', 'cat', 'rfr', 'lasso'"
        )

    def train(self, X_train, Y_train, X_val, Y_val):
        """Fit the wrapped model and return it.

        LightGBM and CatBoost use ``(X_val, Y_val)`` as the early-stopping evaluation
        set; every other model ignores the validation data and is fit on sanitized
        training features.
        """
        if 'lgbm' in self.model_name:
            early_stopping_callback = lgb.early_stopping(
                CONFIG.num_trials_early_stopping, first_metric_only=True, verbose=False
            )
            # NOTE(review): `verbose` as a fit() argument is only accepted by older
            # LightGBM releases (removed in 4.0) - confirm against the installed version.
            self.model.fit(X_train, Y_train,
                           eval_set=[(X_val, Y_val)],
                           eval_metric='rmse', verbose=0,
                           callbacks=[early_stopping_callback])
        elif 'cat' in self.model_name:
            self.model.fit(X_train, Y_train,
                           eval_set=[(X_val, Y_val)],
                           verbose=0,
                           early_stopping_rounds=CONFIG.num_trials_early_stopping)
        else:
            self.model.fit(self._sanitize(X_train), Y_train)

        return self.model

    def validate(self, X_val, Y_val):
        """Predict on ``X_val`` and return ``(predictions, rmse)`` against ``Y_val``."""
        if self._needs_sanitizing():
            X_val = self._sanitize(X_val)

        pred = self.model.predict(X_val)
        # Arguments are passed in the documented (y_true, y_pred) order.
        score = mean_squared_error(Y_val, pred, squared=False)
        return pred, score

    def predict(self, X_test):
        """Return the wrapped model's predictions for ``X_test``."""
        if self._needs_sanitizing():
            X_test = self._sanitize(X_test)

        return self.model.predict(X_test)

## (3) LightGBM train and predict

In [None]:
# Prediction buffers with one row per sample and one column per blended model.
# Column 0 is filled by the LightGBM cell, column 1 by the LightAutoML NN cell.
OOF_PREDS = np.zeros(shape=(train_feats.shape[0], 2))
TEST_PREDS = np.zeros(shape=(test_feats.shape[0], 2))

In [None]:
# Code comes from here: https://www.kaggle.com/code/abdullahmeda/enter-ing-the-timeseries-space-sec-3-new-aggs

# Repeated K-fold training of LightGBM.
# Each repetition reshuffles the folds with a different seed; OOF predictions are
# averaged over repetitions and test predictions over every fitted model.
n_repeats = 5   # number of CV repetitions
n_splits = 10   # folds per repetition

models_dict = {}
scores = []

test_predict_list = []
best_params = {'reg_alpha': 0.007678095440286993,
               'reg_lambda': 0.34230534302168353,
               'colsample_bytree': 0.627061253588415,
               'subsample': 0.854942238828458,
               'learning_rate': 0.038697981947473245,
               'num_leaves': 22,
               'max_depth': 37,
               'min_child_samples': 18,
               'n_jobs':4
              }

# The hyper-parameters and the test matrix do not change between folds, so they are
# built once instead of inside the loops.
params = {
    "objective": "regression",
    "metric": "rmse",
    'random_state': 42,
    "n_estimators" : 12001,
    "verbosity": -1,
    **best_params
}
X_test = test_feats[train_cols]

# The loops below accumulate with `+=`. Clearing the LightGBM column first makes the
# cell idempotent: re-running it no longer stacks a second set of predictions on top.
OOF_PREDS[:, 0] = 0
TEST_PREDS[:, 0] = 0

for i in range(n_repeats):
    kf = model_selection.KFold(n_splits=n_splits, random_state=42 + i, shuffle=True)
    oof_valid_preds = np.zeros(train_feats.shape[0])
    for fold, (train_idx, valid_idx) in enumerate(kf.split(train_feats)):

        X_train, y_train = train_feats.iloc[train_idx][train_cols], train_feats.iloc[train_idx][target_col]
        X_valid, y_valid = train_feats.iloc[valid_idx][train_cols], train_feats.iloc[valid_idx][target_col]

        model = lgb.LGBMRegressor(**params)
        early_stopping_callback = lgb.early_stopping(200, first_metric_only=True, verbose=False)

        model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
                  callbacks=[early_stopping_callback],
        )
        valid_predict = model.predict(X_valid)
        oof_valid_preds[valid_idx] = valid_predict
        # Every row is validated once per repetition -> average over repetitions.
        OOF_PREDS[valid_idx, 0] += valid_predict / n_repeats
        test_predict = model.predict(X_test)
        # Every fitted model scores the whole test set -> average over all models.
        TEST_PREDS[:, 0] += test_predict / n_repeats / n_splits
        test_predict_list.append(test_predict)
        # Per-fold RMSE; it is not aggregated anywhere in this cell.
        score = metrics.mean_squared_error(y_valid, valid_predict, squared=False)
        models_dict[f'{fold}_{i}'] = model

    # RMSE of this repetition's complete out-of-fold prediction vector.
    oof_score = metrics.mean_squared_error(train_feats[target_col], oof_valid_preds, squared=False)
    scores.append(oof_score)

In [None]:
# RMSE of the averaged LightGBM out-of-fold predictions (column 0 of OOF_PREDS) against the target.
print('OOF metric LGBM = {:.5f}'.format(metrics.mean_squared_error(train_feats[target_col], OOF_PREDS[:, 0], squared=False)))

## (4) LightAutoML NN (DenseLight) prediction

In [None]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
import joblib

In [None]:
# Average the stored OOF predictions and the fresh test predictions of the
# pre-trained LightAutoML DenseLight models into column 1 of the buffers.
n_nn_models = 3

# Clear the NN column first so that re-running this cell does not accumulate the
# same predictions twice.
OOF_PREDS[:, 1] = 0
TEST_PREDS[:, 1] = 0

for i in range(n_nn_models):
    # NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
    # only load artifacts from a trusted source.
    # NOTE(review): assumes oof_pred is ordered like the rows of train_feats - confirm.
    oof_pred, automl = joblib.load('/kaggle/input/linkinglamamodels/oof_and_lama_denselight_{}.pkl'.format(i))
    OOF_PREDS[:, 1] += oof_pred / n_nn_models
    TEST_PREDS[:, 1] += automl.predict(test_feats[train_cols]).data[:, 0] / n_nn_models

In [None]:
# RMSE of the averaged LightAutoML NN out-of-fold predictions (column 1 of OOF_PREDS) against the target.
print('OOF metric LightAutoML_NN = {:.5f}'.format(metrics.mean_squared_error(train_feats[target_col], OOF_PREDS[:, 1], squared=False)))

## (5) Blending

In [None]:
# Grid-search the blend weight w for `w * LGBM + (1 - w) * NN` that minimises OOF RMSE.
# Starting from +inf guarantees best_w is always set; the previous start value of 1
# left best_w undefined whenever no blend scored below 1.
best_sc = np.inf
best_w = None
# 1001 evenly spaced weights covering exactly [0, 1] in steps of 0.001.
# The former np.arange(0, 1.01, 0.001) ran on to w = 1.009, i.e. a negative NN weight.
for w in np.linspace(0, 1, 1001):
    sc = metrics.mean_squared_error(train_feats[target_col],
                                    w * OOF_PREDS[:, 0] + (1-w) * OOF_PREDS[:, 1],
                                    squared=False)
    if sc < best_sc:
        best_sc = sc
        best_w = w

print('Composition OOF score = {:.5f}'.format(best_sc))
print('Composition best W = {:.3f}'.format(best_w))

## (6) Submission creation

In [None]:
# Blend the two models' test predictions with the weight found on the OOF predictions:
# column 0 (LightGBM) gets best_w, column 1 (LightAutoML NN) gets the remainder.
W = [best_w, 1 - best_w]
lgbm_weight, nn_weight = W
test_preds = lgbm_weight * TEST_PREDS[:, 0] + nn_weight * TEST_PREDS[:, 1]
test_preds

In [None]:
# Store the blended predictions next to the ids, then write only the two
# submission columns (in this order) to disk without the index.
test_feats['score'] = test_preds
test_feats.to_csv("submission.csv", columns=['id', 'score'], index=False)

In [None]:
# Display the id and score columns of test_feats.
test_feats[['id', 'score']]