In [15]:
import numpy as np
import pandas as pd

import os
import warnings
import string

from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.linear_model import LinearRegression

from lightgbm import LGBMRegressor

# Suppress every warning for the rest of the notebook.
# NOTE(review): this also hides library deprecation notices.
warnings.filterwarnings('ignore')

# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/linking-writing-processes-to-writing-quality/sample_submission.csv
/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv
/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv
/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv


In [16]:
# Cleaning options consumed by create_master_data / get_clean_data:
#   'redundant_features' - columns dropped from the log after cleaning
#   'feature_rename'     - column rename mapping applied after the drop
config = {
    'redundant_features': ['up_event'],
    'feature_rename': {
        'down_event': 'event_type'
    }
}

# Raw keystroke logs for the training essays.
print('Reading X Train Data!')
input_train_dataset = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv')

# Raw keystroke logs for the test essays.
print('Reading X Test Data!')
input_test_dataset = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv')

# Training targets; later merged onto the feature table on 'id'.
print('Reading Y Train Data!')
y_train = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv')

Reading X Train Data!
Reading X Test Data!
Reading Y Train Data!


In [17]:
def get_essay_paragh(dataframe: pd.DataFrame) -> pd.Series:
    """Rebuild each writer's final essay text by replaying the keystroke log.

    'Nonproduction' rows are ignored. The remaining rows are applied in log
    order, per ``id``, to an initially empty string:

    * ``Replace``     - ``text_change`` is ``"<old> => <new>"``; the old text
      ending at the cursor is swapped for the new text.
    * ``Paste``       - ``text_change`` is inserted so that it ends at the cursor.
    * ``Remove/Cut``  - ``len(text_change)`` characters are removed at the cursor.
    * ``Move From [x1, y1] To [x2, y2]`` - the span ``[x1, y1)`` is relocated
      so that it occupies ``[x2, y2)``.
    * anything else (e.g. ``Input``) - handled like ``Paste``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Event log with columns ``id``, ``activity``, ``cursor_position`` and
        ``text_change``. It is not modified.

    Returns
    -------
    pd.Series
        Essay text indexed by ``id`` and named ``'essay'``. Always a Series,
        including when the log holds a single essay or no text events.

    Raises
    ------
    ValueError
        If a move activity does not carry the two ``[start, end]`` ranges.
    """
    import re  # local import: only the Move parser needs it

    event_columns = ['activity', 'cursor_position', 'text_change']
    # Both ranges of "Move From [x1, y1] To [x2, y2]" are needed; the first
    # bracket is the source span, not a (from, to) pair.
    move_pattern = re.compile(r'\[(\d+),\s*(\d+)\]\s*To\s*\[(\d+),\s*(\d+)\]')

    text_events = dataframe.loc[dataframe['activity'] != 'Nonproduction', ['id'] + event_columns]
    if text_events.empty:
        return pd.Series([], index=pd.Index([], name='id'), dtype=object, name='essay')

    def apply_actions(group):
        essayText = ""
        for activity, cursor_position, text_change in zip(group['activity'], group['cursor_position'], group['text_change']):
            if activity == 'Replace':
                replaceTxt = text_change.split(' => ')
                start = cursor_position - len(replaceTxt[1])
                essayText = essayText[:start] + replaceTxt[1] + essayText[start + len(replaceTxt[0]):]
            elif activity == 'Paste':
                start = cursor_position - len(text_change)
                essayText = essayText[:start] + text_change + essayText[start:]
            elif activity == 'Remove/Cut':
                essayText = essayText[:cursor_position] + essayText[cursor_position + len(text_change):]
            elif "M" in activity:
                match = move_pattern.search(activity)
                if match is None:
                    raise ValueError(f'Cannot parse move ranges from activity: {activity!r}')
                src_start, src_end, dst_start, dst_end = (int(val) for val in match.groups())
                if src_start != dst_start:
                    if src_start < dst_start:
                        # Moving forward: the text between the span and its
                        # destination slides left to fill the gap.
                        essayText = (essayText[:src_start] + essayText[src_end:dst_end]
                                     + essayText[src_start:src_end] + essayText[dst_end:])
                    else:
                        # Moving backward: the text between the destination
                        # and the span slides right.
                        essayText = (essayText[:dst_start] + essayText[src_start:src_end]
                                     + essayText[dst_start:src_start] + essayText[src_end:])
            else:
                start = cursor_position - len(text_change)
                essayText = essayText[:start] + text_change + essayText[start:]
        return essayText

    # Selecting the event columns keeps the grouping key out of the applied
    # frames; renaming the Series (instead of to_frame().squeeze()) keeps a
    # Series even when there is only one essay.
    essaySeries = text_events.groupby('id')[event_columns].apply(apply_actions).rename('essay')

    return essaySeries

def q1(x):
    """Return the first quartile (25th percentile) of ``x``.

    The function name is used as the column suffix when this is passed to a
    pandas ``agg`` call.
    """
    lower_quartile = x.quantile(q=0.25)
    return lower_quartile
def q3(x):
    """Return the third quartile (75th percentile) of ``x``.

    The function name is used as the column suffix when this is passed to a
    pandas ``agg`` call.
    """
    upper_quartile = x.quantile(q=0.75)
    return upper_quartile

def split_and_aggregate_sentences(df):
    """Summarise sentence lengths for every essay.

    Each essay is split on '.', '?' and '!'; newlines are removed from the
    pieces and surrounding whitespace is stripped. Character length and
    whitespace-separated word count of each piece are then aggregated per
    ``id``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``id`` and ``essay`` columns. A ``sent`` column is added
        to it as a side effect, so pass a copy if that matters.

    Returns
    -------
    pd.DataFrame
        Indexed by ``id``. Columns are ``sent_count`` plus
        ``sent_len_<stat>`` and ``sent_word_count_<stat>`` for the statistics
        mean, min, max, first, last, q1, median, q3 and sum.
    """
    stats = ['count', 'mean', 'min', 'max', 'first', 'last', q1, 'median', q3, 'sum']

    # One row per sentence-like fragment.
    df['sent'] = df['essay'].str.split('\\.|\\?|\\!')
    df = df.explode('sent')

    fragments = df['sent'].str.replace('\n', '').str.strip()
    df['sent'] = fragments
    df['sent_len'] = fragments.str.len()
    df['sent_word_count'] = fragments.str.split().str.len()

    per_essay = df.groupby('id')
    agg_df = pd.concat(
        [per_essay[[measure]].agg(stats) for measure in ('sent_len', 'sent_word_count')],
        axis=1,
    )

    # Flatten the (measure, statistic) column pairs into "measure_statistic".
    agg_df.columns = ['_'.join(pair).strip() for pair in agg_df.columns]
    agg_df.index.name = 'id'

    # Both measures share the same count, so keep a single 'sent_count'.
    return (
        agg_df
        .drop(columns=['sent_word_count_count'])
        .rename(columns={'sent_len_count': 'sent_count'})
    )

def split_and_aggregate_paragraphs(df):
    """Summarise paragraph lengths for every essay.

    Each essay is split on newline characters; the character length and
    whitespace-separated word count of each piece are aggregated per ``id``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with ``id`` and ``essay`` columns. A ``paragraph`` column is
        added to it as a side effect, so pass a copy if that matters.

    Returns
    -------
    pd.DataFrame
        Indexed by ``id``. Columns are ``paragraph_count`` plus
        ``paragraph_len_<stat>`` and ``paragraph_word_count_<stat>`` for the
        statistics mean, min, max, first, last, q1, median, q3 and sum.
    """
    stats = ['count', 'mean', 'min', 'max', 'first', 'last', q1, 'median', q3, 'sum']

    # One row per newline-delimited piece.
    df['paragraph'] = df['essay'].str.split('\n')
    df = df.explode('paragraph')

    pieces = df['paragraph']
    df['paragraph_len'] = pieces.str.len()
    df['paragraph_word_count'] = pieces.str.split().str.len()

    per_essay = df.groupby('id')
    agg_df = pd.concat(
        [per_essay[[measure]].agg(stats) for measure in ('paragraph_len', 'paragraph_word_count')],
        axis=1,
    )

    # Flatten the (measure, statistic) column pairs into "measure_statistic".
    agg_df.columns = ['_'.join(pair).strip() for pair in agg_df.columns]
    agg_df.index.name = 'id'

    # Both measures share the same count, so keep a single 'paragraph_count'.
    return (
        agg_df
        .drop(columns=['paragraph_word_count_count'])
        .rename(columns={'paragraph_len_count': 'paragraph_count'})
    )

def get_sentance_level_data(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Build sentence- and paragraph-level length features for every essay.

    The essays are reconstructed from a copy of the event log, then
    summarised by ``split_and_aggregate_sentences`` and
    ``split_and_aggregate_paragraphs``; the two summaries are merged on ``id``.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Raw event log; it is copied before use.

    Returns
    -------
    pd.DataFrame
        The merged sentence and paragraph aggregates.
    """
    essays = get_essay_paragh(dataframe.copy())
    essay_frame = pd.DataFrame({'id': essays.index, 'essay': essays.values})

    # Each aggregator receives its own copy because both add columns to their input.
    per_sentence, per_paragraph = (
        aggregate(essay_frame.copy())
        for aggregate in (split_and_aggregate_sentences, split_and_aggregate_paragraphs)
    )

    return pd.merge(per_sentence, per_paragraph, on='id')

def get_activity_counts(dataframe: pd.DataFrame) -> pd.DataFrame:
    """Count each essay's ``down_event`` values, grouped into coarse buckets.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Event log with ``id`` and ``down_event`` columns. It is not modified.

    Returns
    -------
    pd.DataFrame
        One row per ``id`` with the columns ``id``, ``punctuation``,
        ``inputs``, ``unidentified``, ``functions``, ``mouse_clicks``,
        ``keyboard_clicks`` and ``redundant``. A bucket with no matching
        events in the log sums to zero.
    """
    # Bucket name -> the down_event values that belong to it. Insertion order
    # is also the column order of the result.
    event_buckets = {
        'punctuation': list(string.punctuation),
        'inputs': list(string.ascii_lowercase) + list(string.ascii_uppercase),
        'unidentified': [
            '\x80', '\x96', '\x97', '\x9b', '¡', '¿', 'Â´', 'Ä±', 'Å\x9f', 'Ë\x86', 'â\x80\x93', 'ä',
            'Unidentified', 'Dead', '0', '1', '2', '5', 'AltGraph', 'Cancel', 'Clear', 'Meta',
            'ContextMenu', 'ModeChange', 'OS', 'Pause', 'Process',
        ],
        'functions': ['F1', 'F10', 'F11', 'F12', 'F15', 'F2', 'F3', 'F6'],
        'mouse_clicks': ['Leftclick', 'Unknownclick', 'Rightclick', 'Middleclick'],
        'keyboard_clicks': [
            'Alt', 'ArrowDown', 'ArrowLeft', 'ArrowRight', 'ArrowUp', 'Backspace', 'CapsLock',
            'Control', 'Delete', 'End', 'Enter', 'Escape', 'Home', 'Insert', 'NumLock', 'PageDown',
            'PageUp', 'ScrollLock', 'Shift', 'Space', 'Tab',
        ],
        'redundant': [
            'AudioVolumeDown', 'AudioVolumeMute', 'AudioVolumeUp', 'MediaPlayPause',
            'MediaTrackNext', 'MediaTrackPrevious',
        ],
    }

    # Wide table: one row per essay, one column per distinct down_event.
    counts = dataframe.groupby(['id', 'down_event']).size().reset_index(name='count')
    counts = counts.pivot_table(index='id', columns='down_event', values='count', fill_value=0).reset_index()

    # Resolve every bucket's columns before adding any summary column, so the
    # new columns can never take part in the matching.
    bucket_columns = {
        bucket: counts.columns[counts.columns.isin(events)]
        for bucket, events in event_buckets.items()
    }
    for bucket, columns in bucket_columns.items():
        counts[bucket] = counts[columns].sum(axis=1)

    # Only the id and the bucket totals are kept; per-event columns are discarded.
    summary = counts[['id'] + list(event_buckets)]

    return summary.reset_index(drop=True)

In [18]:
def get_clean_data(X: pd.DataFrame, feature_list: list, rename_dict: dict) -> pd.DataFrame:
    """Normalise the event log and return a cleaned copy.

    Steps, applied to rows whose ``up_event`` differs from ``down_event``:

    * ``Nonproduction`` rows get both events set to ``'NoEvent'``;
    * ``Input`` and ``Replace`` rows get ``up_event`` set to ``'q'``
      (NOTE(review): presumably the dataset's placeholder for an ordinary
      character key - confirm).

    Every activity containing ``'Move From'`` is collapsed to
    ``'MoveSection'``. Finally ``feature_list`` columns are dropped and
    ``rename_dict`` is applied to the column names.

    Parameters
    ----------
    X : pd.DataFrame
        Event log with ``activity``, ``up_event`` and ``down_event`` columns.
        It is left untouched: the work happens on a copy, so the raw log can
        be reused (and the notebook cell re-run) afterwards.
    feature_list : list
        Columns to drop after cleaning.
    rename_dict : dict
        ``{old_name: new_name}`` mapping applied after the drop.

    Returns
    -------
    pd.DataFrame
        The cleaned log.
    """
    X = X.copy()

    # Evaluate the mismatch once, before any value is overwritten.
    mismatched = X['up_event'] != X['down_event']

    X.loc[mismatched & (X['activity'] == 'Nonproduction'), ['down_event', 'up_event']] = 'NoEvent'
    X.loc[mismatched & X['activity'].isin(['Input', 'Replace']), 'up_event'] = 'q'

    X.loc[X['activity'].str.contains('Move From'), 'activity'] = 'MoveSection'

    X = X.drop(columns=feature_list)
    X = X.rename(columns=rename_dict)

    return X

def rounded_rmse(y, y_pred, **kwargs):
    """Root-mean-squared error after snapping predictions to the nearest 0.5.

    Parameters
    ----------
    y : array-like
        True target values.
    y_pred : array-like
        Raw predictions; each is rounded to the nearest multiple of 0.5
        before the error is computed.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    float
        RMSE between ``y`` and the rounded predictions.
    """
    snapped = np.round(np.asarray(y_pred) * 2) / 2
    # Take the square root explicitly: the `squared=False` argument of
    # mean_squared_error is deprecated and absent from recent scikit-learn.
    return np.sqrt(mean_squared_error(y, snapped))

In [19]:
class FeatureEngineering:
    """Per-essay event-level feature builders.

    Every method takes the event rows of ONE essay (they rely on ``shift``,
    ``cumsum`` and the first row of the frame) and returns a new frame with
    the extra column(s) appended. The input frame is never modified, which
    keeps the methods safe to use with ``groupby(...).apply``.
    """

    # Gap between consecutive events at or above which 'warning_issued' is set.
    # NOTE(review): presumably milliseconds (i.e. two minutes) - confirm
    # against the units of down_time / up_time.
    PAUSE_WARNING_THRESHOLD = 120000

    @staticmethod
    def get_capitalized_letters(X: pd.DataFrame) -> pd.DataFrame:
        """Add boolean ``capitalize_letters``.

        True for an ``Input`` row whose ``event_type`` is ``'q'`` and whose
        preceding row has ``event_type`` ``'Shift'``.
        """
        previous_event_type = X['event_type'].shift()
        is_capital = (
            (X['activity'] == 'Input')
            & (previous_event_type == 'Shift')
            & (X['event_type'] == 'q')
        )

        return X.assign(capitalize_letters=is_capital)

    @staticmethod
    def get_temporal_features(X: pd.DataFrame) -> pd.DataFrame:
        """Add ``time_between_events``, ``cumulative_writing_time`` and ``warning_issued``.

        ``time_between_events`` is ``down_time`` minus the previous row's
        ``up_time``; for the first row the previous ``up_time`` is taken to be
        that row's own ``down_time``, giving a gap of zero.
        """
        previous_up_time = X['up_time'].shift().fillna(X['down_time'].iloc[0])
        time_between_events = X['down_time'] - previous_up_time

        return X.assign(
            time_between_events=time_between_events,
            cumulative_writing_time=(X['action_time'] + time_between_events).cumsum(),
            warning_issued=time_between_events >= FeatureEngineering.PAUSE_WARNING_THRESHOLD,
        )

    @staticmethod
    def get_cursor_features(X: pd.DataFrame) -> pd.DataFrame:
        """Add ``cursor_move_distance``: absolute change in ``cursor_position``.

        The position before the first row is taken to be 0.
        """
        previous_cursor_position = X['cursor_position'].shift().fillna(0)

        return X.assign(
            cursor_move_distance=(X['cursor_position'] - previous_cursor_position).abs()
        )

    @staticmethod
    def get_word_change_features(X: pd.DataFrame) -> pd.DataFrame:
        """Add ``word_count_change``: absolute change in ``word_count``.

        The word count before the first row is taken to be 0.
        """
        previous_word_count = X['word_count'].shift().fillna(0)

        return X.assign(
            word_count_change=(X['word_count'] - previous_word_count).abs()
        )

In [20]:
def calculate_features(unique_dataset):
    """Collapse the engineered event rows of one essay into a single feature row.

    Parameters
    ----------
    unique_dataset : pd.DataFrame
        Rows of a single essay, in log order, carrying the columns ``id``,
        ``event_id``, ``word_count``, ``activity``, ``event_type``,
        ``action_time``, ``warning_issued``, ``cumulative_writing_time``,
        ``time_between_events``, ``cursor_move_distance`` and
        ``word_count_change``.

    Returns
    -------
    pd.Series
        29 entries, starting with ``id``; the index holds the feature names.
    """
    action_time = unique_dataset['action_time']
    pause = unique_dataset['time_between_events']
    cursor_distance = unique_dataset['cursor_move_distance']
    activity = unique_dataset['activity']
    elapsed = unique_dataset['cumulative_writing_time'].iloc[-1]
    total_pause = pause.sum()

    def count_activity(label):
        # Number of rows whose activity equals `label`.
        return unique_dataset[activity == label].shape[0]

    # Insertion order defines the order of the returned Series.
    features = {
        'id': unique_dataset['id'].iloc[0],
        'total_number_of_events': unique_dataset['event_id'].iloc[-1],
        'final_number_of_words': unique_dataset['word_count'].iloc[-1],
        'number_of_warnings_issued': unique_dataset['warning_issued'].sum(),
        'total_time_taken': elapsed,
        'total_pause_time': total_pause,
        'average_pause_length': pause.mean(),
        'proportion_pause_time': total_pause / elapsed,
        'non_productive_events': count_activity('Nonproduction'),
        'input_events': count_activity('Input'),
        'deletion_events': count_activity('Remove/Cut'),
        'addition_events': count_activity('Paste'),
        'replacement_events': count_activity('Replace'),
        'string_move_events': count_activity('MoveSection'),
        # Sentences are approximated by the number of '.' key events.
        'number_of_sentences': unique_dataset[unique_dataset['event_type'] == '.'].shape[0],
        'average_action_time': action_time.mean(),
        'median_action_time': action_time.median(),
        'min_action_time': action_time.min(),
        'max_action_time': action_time.max(),
        'std_action_time': action_time.std(),
        'sum_action_time': action_time.sum(),
        'average_cursor_distance': cursor_distance.mean(),
        'max_cursor_distance': cursor_distance.max(),
        'total_cursor_distance': cursor_distance.sum(),
        'std_cursor_distance': cursor_distance.std(),
        'avg_word_count_btw_events': unique_dataset['word_count_change'].mean(),
        'min_time_between_events': pause.min(),
        'max_time_between_events': pause.max(),
        'std_time_between_events': pause.std(),
    }

    return pd.Series(list(features.values()), index=list(features.keys()))

In [21]:
def create_master_data(input_data: pd.DataFrame, config: dict) -> pd.DataFrame:
    """Turn a raw keystroke log into one feature row per essay.

    The function is used for both the training and the test log, so its
    progress messages are dataset-neutral.

    Pipeline:

    1. sentence/paragraph length features and key-press bucket counts are
       computed from the raw log (before any cleaning);
    2. the log is cleaned with ``get_clean_data``;
    3. the ``FeatureEngineering`` steps add event-level columns per essay;
    4. ``calculate_features`` collapses each essay to a single row;
    5. the three feature tables are merged on ``id`` and ratio features are
       derived.

    Parameters
    ----------
    input_data : pd.DataFrame
        Raw event log.
    config : dict
        Must provide ``'redundant_features'`` (columns to drop) and
        ``'feature_rename'`` (column rename mapping).

    Returns
    -------
    pd.DataFrame
        One row per essay, with an ``id`` column.
    """
    print('Cleaning Dataset!')

    # No defensive copies here: get_sentance_level_data copies the log itself
    # and get_activity_counts only derives new frames from it, so duplicating
    # the full log twice was wasted memory.
    sent_paragh_data = get_sentance_level_data(input_data)
    activity_count_data = get_activity_counts(input_data)

    cleaned_data = get_clean_data(input_data, config['redundant_features'], config['feature_rename'])

    print('Preprocessing Data!')
    feature_steps = (
        FeatureEngineering.get_capitalized_letters,
        FeatureEngineering.get_temporal_features,
        FeatureEngineering.get_cursor_features,
        FeatureEngineering.get_word_change_features,
    )
    # Each step works within a single essay, hence the per-id apply.
    for add_features in feature_steps:
        cleaned_data = cleaned_data.groupby('id', group_keys=False, sort=False).apply(add_features)

    master_data = cleaned_data.groupby('id').apply(calculate_features).reset_index(drop=True)

    master_data = pd.merge(master_data, sent_paragh_data, on='id')
    master_data = pd.merge(master_data, activity_count_data, on='id')

    master_data['total_writing_time'] = master_data['total_time_taken'] - master_data['total_pause_time']

    # Share of each activity type among all events of the essay.
    proportion_sources = {
        'proportion_np_events': 'non_productive_events',
        'proportion_input_events': 'input_events',
        'proportion_delete_events': 'deletion_events',
        'proportion_addition_events': 'addition_events',
        'proportion_replace_events': 'replacement_events',
        'proportion_moving_events': 'string_move_events',
    }
    for proportion_name, count_name in proportion_sources.items():
        master_data[proportion_name] = master_data[count_name] / master_data['total_number_of_events']

    print('Preprocessing Complete!')

    return master_data

In [22]:
def remove_corr_features(dataframe: pd.DataFrame, threshold: float = 0.95) -> list:
    """List the columns that are highly correlated with an earlier column.

    Despite its name the function removes nothing; the caller drops the
    returned columns.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Feature table; pairwise correlations are taken with ``DataFrame.corr``.
    threshold : float, default 0.95
        A column is reported when its absolute correlation with any column
        positioned before it is strictly greater than this value.

    Returns
    -------
    list
        Column names to drop, in column order.
    """
    corr_matrix = dataframe.corr().abs()

    # Keep only the strict upper triangle so each pair is examined once and a
    # column is only ever compared with the columns that precede it.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(upper_mask)

    to_drop = [column for column in upper.columns if (upper[column] > threshold).any()]

    return to_drop

In [23]:
# Build the per-essay feature table for the training log and attach the
# target scores (pd.merge defaults to an inner join on 'id').
print('Train Data!')

master_data = create_master_data(input_data=input_train_dataset, config=config)
master_data = pd.merge(master_data, y_train, on='id')

# Same feature pipeline for the test log; there are no scores to attach.
print('Test Data!')

master_data_test = create_master_data(input_data=input_test_dataset, config=config)

Train Data!
Cleaning Train Dataset!
Preprocessing Train Data!
Preprocessing Complete!
Test Data!
Cleaning Train Dataset!
Preprocessing Train Data!
Preprocessing Complete!


In [24]:
print('Creating X and y Dataframes!')

# Move 'id' into the index so it is not used as a feature.
# NOTE(review): master_data / master_data_test are rebound here, so running
# this cell a second time fails because 'id' is no longer a column.
master_data = master_data.set_index('id')
master_data_test = master_data_test.set_index('id')

# Split the target from the features.
y = master_data['score']
X = master_data.drop(columns=['score'])

# Columns are selected on the training features only and the same columns
# are then dropped from the test features.
drop_columns = remove_corr_features(X)
X = X.drop(columns=drop_columns)
master_data_test = master_data_test.drop(columns=drop_columns)

print(f'No. of Features: {X.shape[1]}')
print(f'Feature List: {X.columns}')

scalar = StandardScaler()
transformer = PowerTransformer()

# Both transformers are fitted on the training features and merely applied
# to the test features.
X_train = transformer.fit_transform(scalar.fit_transform(X))
X_test = transformer.transform(scalar.transform(master_data_test))

Creating X and y Dataframes!
No. of Features: 56
Feature List: Index(['total_number_of_events', 'final_number_of_words',
       'average_pause_length', 'proportion_pause_time',
       'non_productive_events', 'input_events', 'deletion_events',
       'addition_events', 'replacement_events', 'string_move_events',
       'number_of_sentences', 'average_action_time', 'median_action_time',
       'min_action_time', 'max_action_time', 'sum_action_time',
       'average_cursor_distance', 'max_cursor_distance',
       'total_cursor_distance', 'std_cursor_distance',
       'avg_word_count_btw_events', 'min_time_between_events',
       'max_time_between_events', 'std_time_between_events', 'sent_count',
       'sent_len_mean', 'sent_len_min', 'sent_len_max', 'sent_len_first',
       'sent_len_last', 'sent_len_q1', 'sent_len_median', 'paragraph_count',
       'paragraph_len_mean', 'paragraph_len_min', 'paragraph_len_max',
       'paragraph_len_first', 'paragraph_len_last', 'paragraph_len_q1',
   

In [25]:
class RegressorEnsemble(BaseEstimator, RegressorMixin):
    """Blending ensemble: base regressors combined by a linear regression.

    ``fit`` splits the data 67/33, trains every base model on the larger
    part and fits a ``LinearRegression`` on the base models' predictions for
    the held-out part. ``predict`` feeds the base models' predictions through
    that blender.

    Parameters
    ----------
    model_params : dict
        Keyword arguments for the default base models, keyed by ``'gbr'``,
        ``'rfr'`` and ``'lgbm'``. Only read when ``models_list`` is None.
    models_list : list of (name, estimator), optional
        Base models to use instead of the defaults. They are fitted in place.

    Attributes
    ----------
    models_list : list of (name, estimator)
        The base models actually used.
    blending_model : LinearRegression or None
        The fitted blender; None until ``fit`` has been called.
    """

    def __init__(self, model_params: dict, models_list: list = None):
        # BaseEstimator.get_params() (and therefore repr() and clone()) reads
        # every __init__ argument back from an attribute of the same name, so
        # model_params has to be stored on the instance.
        self.model_params = model_params
        self.models_list = [
            ('gbr', GradientBoostingRegressor(random_state=0, **model_params['gbr'])),
            ('rfr', RandomForestRegressor(random_state=0, **model_params['rfr'])),
            ('lgbm', LGBMRegressor(random_state=0, **model_params['lgbm'])),
        ] if models_list is None else models_list

        self.blending_model = None

    def _stack_predictions(self, X):
        """Return the base models' predictions as one column per model."""
        columns = []
        for _, model_object in self.models_list:
            yhat = model_object.predict(X)
            columns.append(yhat.reshape(len(yhat), 1))

        return np.hstack(columns)

    def fit(self, X, y=None):
        """Fit the base models and the blender; returns ``self``."""
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=0)

        for _, model_object in self.models_list:
            model_object.fit(X_train, y_train)

        # The blender only ever sees predictions for rows the base models
        # were not trained on.
        self.blending_model = LinearRegression().fit(self._stack_predictions(X_val), y_val)

        return self

    def predict(self, X, y=None):
        """Predict with the blended ensemble.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called. This is a subclass of both
            ``ValueError`` and ``AttributeError``.
        """
        if self.blending_model is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError('RegressorEnsemble is not fitted yet; call fit() before predict().')

        return self.blending_model.predict(self._stack_predictions(X))

In [26]:
print('Building Model Object!')

# Hyper-parameters for the three base models of RegressorEnsemble.
model_params = {'gbr': {'n_estimators': 409,
  'learning_rate': 0.07361180235738161,
  'max_depth': 10,
  'min_samples_split': 0.6482497101518737,
  'min_samples_leaf': 0.31972085330435496,
  'subsample': 0.9055662628017166},
 'rfr': {'n_estimators': 425,
  'max_depth': 9,
  'min_samples_split': 0.4596197102513105,
  'min_samples_leaf': 0.22695246564074448},
 'lgbm': {'boosting_type': 'dart',
  'n_estimators': 115,
  'learning_rate': 0.0221079754217087,
  'num_leaves': 96,
  'max_depth': 12,
  'min_child_samples': 7,
  'subsample': 0.9063815116324397,
  'colsample_bytree': 0.5191190249507341,
  'reg_alpha': 0.7733601728487565,
  'reg_lambda': 0.745926710711842}}

reg_model = RegressorEnsemble(model_params=model_params).fit(X_train, y)

y_hat_train = np.round(reg_model.predict(X_train), 3)
y_hat_test = np.round(reg_model.predict(X_test), 3)

submission_data = pd.DataFrame({'id': master_data_test.index, 'score': y_hat_test})

print('Previous Best:: R^2 Score: 0.861, RMSE Score: 0.382, Rounded RMSE Score: 0.39')

# NOTE: every score below is computed on the data passed to fit(), so it is
# a training-set figure, not a hold-out estimate.
# The RMSE takes the square root explicitly because the `squared=False`
# argument of mean_squared_error is deprecated and absent from recent
# scikit-learn.
print(f'R^2 Score: {round(reg_model.score(X_train, y), 3)},', 
      f'RMSE Score: {round(np.sqrt(mean_squared_error(y, y_hat_train)), 3)},',
      f'Rounded RMSE Score: {round(rounded_rmse(y, y_hat_train), 3)}')

Building Model Object!
Previous Best:: R^2 Score: 0.861, RMSE Score: 0.382, Rounded RMSE Score: 0.39
R^2 Score: 0.667, RMSE Score: 0.592, Rounded RMSE Score: 0.616


In [27]:
# Display the submission table (one predicted score per test essay id).
submission_data

Unnamed: 0,id,score
0,0000aaaa,1.565
1,2222bbbb,1.528
2,4444cccc,1.528


In [28]:
# Write the submission file without the positional index column.
submission_data.to_csv('submission.csv', index=False)