In [136]:
import numpy as np
import pandas as pd

import os
import warnings

from tqdm import tqdm
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.base import TransformerMixin, BaseEstimator, RegressorMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVR
from sklearn.linear_model import LinearRegression

from lightgbm import LGBMRegressor

warnings.filterwarnings('ignore')

# Print the full path of every file found under the Kaggle input directory,
# so the dataset paths used by the loading cell can be checked at a glance.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

/kaggle/input/linking-writing-processes-to-writing-quality/sample_submission.csv
/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv
/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv
/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv


In [137]:
print('Read Train and Test Data Files!')

# Cleaning options passed to get_clean_data: the columns it drops and the
# column renames it applies.
config = {
    'redundant_features': ['up_event'],
    'feature_rename': {
        'down_event': 'event_type'
    }
}

# Event logs for the train and test sets.
input_train_dataset = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv')
input_test_dataset = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv')

# Scores for the train set; merged onto the aggregated features by 'id' later on.
y_train = pd.read_csv('/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv')

Read Train and Test Data Files!


In [138]:
def get_clean_data(X: pd.DataFrame, feature_list: list, rename_dict: dict) -> pd.DataFrame:
    """Normalise the event/activity columns of a log frame, then drop and rename columns.

    The corrections are applied to a copy, so the frame passed in is left untouched
    (the previous implementation wrote them into the caller's frame).

    Parameters
    ----------
    X : pd.DataFrame
        Log rows; must contain the columns 'up_event', 'down_event' and 'activity'.
    feature_list : list
        Columns to drop after the corrections have been applied.
    rename_dict : dict
        Mapping of old column name -> new column name, applied after the drop.

    Returns
    -------
    pd.DataFrame
        A new frame with the corrected, dropped and renamed columns.
    """
    X = X.copy()

    activity = X['activity']
    events_differ = X['up_event'] != X['down_event']

    # 'Nonproduction' rows whose up/down events disagree get both set to 'NoEvent'.
    X.loc[events_differ & (activity == 'Nonproduction'), ['down_event', 'up_event']] = 'NoEvent'

    # 'Input' / 'Replace' rows whose events disagree only have up_event overwritten;
    # down_event is deliberately left as it was, matching the original behaviour.
    X.loc[events_differ & activity.isin(['Input', 'Replace']), 'up_event'] = 'q'

    # Collapse every "Move From ..." activity into one label. na=False keeps rows
    # with a missing activity out of the mask instead of raising on boolean indexing.
    is_move = activity.str.contains('Move From', regex=False, na=False)
    X.loc[is_move, 'activity'] = 'MoveSection'

    X = X.drop(columns=feature_list)
    X = X.rename(columns=rename_dict)

    return X

def rounded_rmse(y, y_pred, **kwargs):
    """Root mean squared error after snapping predictions to the nearest 0.5.

    Parameters
    ----------
    y : array-like
        True target values.
    y_pred : array-like
        Raw predictions; converted to an array so plain lists are doubled
        element-wise rather than repeated.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    float
        sqrt(MSE) between ``y`` and the snapped predictions. For single-output
        targets this equals the value previously obtained with ``squared=False``.
    """
    snapped = np.round(np.asarray(y_pred) * 2) / 2
    # The `squared` keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
    # so the square root is taken explicitly; this works on every version.
    return np.sqrt(mean_squared_error(y, snapped))

In [139]:
class FeatureEngineering:
    """Row-level feature builders, each meant to receive the event rows of one id.

    Every method returns a new frame with the extra column(s) appended and leaves
    its input unmodified. The methods are used as ``groupby(...).apply`` callbacks,
    and pandas does not support callbacks that mutate the group they are given.
    """

    @staticmethod
    def get_capitalized_letters(X: pd.DataFrame) -> pd.DataFrame:
        """Add boolean ``capitalize_letters``.

        A row is flagged when its activity is 'Input', its event_type is 'q' and the
        event_type of the row directly before it is 'Shift'.
        """
        previous_event_type = X['event_type'].shift()
        is_capitalized = (
            (X['activity'] == 'Input')
            & (previous_event_type == 'Shift')
            & (X['event_type'] == 'q')
        )

        return X.assign(capitalize_letters=is_capitalized)

    @staticmethod
    def get_temporal_features(X: pd.DataFrame) -> pd.DataFrame:
        """Add ``time_between_events``, ``cumulative_writing_time`` and ``warning_issued``.

        ``time_between_events`` is this row's down_time minus the previous row's up_time,
        ``cumulative_writing_time`` is the running sum of action_time plus that gap, and
        ``warning_issued`` marks gaps of at least 120000.
        """
        # The first row has no predecessor: its own down_time stands in for the previous
        # up_time so its gap is 0. An empty frame has no first row at all.
        first_down_time = X['down_time'].iloc[0] if len(X) else 0
        previous_up_time = X['up_time'].shift().fillna(first_down_time)

        time_between_events = X['down_time'] - previous_up_time
        cumulative_writing_time = (X['action_time'] + time_between_events).cumsum()

        return X.assign(
            time_between_events=time_between_events,
            cumulative_writing_time=cumulative_writing_time,
            # Threshold is in the units of down_time/up_time
            # (presumably milliseconds, i.e. two minutes - TODO confirm).
            warning_issued=time_between_events >= 120000,
        )

    @staticmethod
    def get_cursor_features(X: pd.DataFrame) -> pd.DataFrame:
        """Add ``cursor_move_distance``: absolute change in cursor_position from the previous row.

        The first row is compared against position 0.
        """
        previous_cursor_position = X['cursor_position'].shift().fillna(0)
        distance = (X['cursor_position'] - previous_cursor_position).abs()

        return X.assign(cursor_move_distance=distance)

    @staticmethod
    def get_word_change_features(X: pd.DataFrame) -> pd.DataFrame:
        """Add ``word_count_change``: absolute change in word_count from the previous row.

        The first row is compared against a word count of 0.
        """
        previous_word_count = X['word_count'].shift().fillna(0)
        change = (X['word_count'] - previous_word_count).abs()

        return X.assign(word_count_change=change)

In [140]:
def calculate_features(unique_dataset):
    """Collapse the event rows of one id into a single Series of summary features.

    Expects the columns produced by the cleaning and feature-engineering steps:
    'id', 'event_id', 'word_count', 'warning_issued', 'cumulative_writing_time',
    'time_between_events', 'activity', 'event_type', 'action_time',
    'cursor_move_distance' and 'word_count_change'.

    Returns a pd.Series indexed by feature name, in a fixed order.
    """
    activity = unique_dataset['activity']
    event_type = unique_dataset['event_type']
    action_time = unique_dataset['action_time']
    pauses = unique_dataset['time_between_events']

    def row_count(mask):
        # Number of rows selected by a boolean mask.
        return unique_dataset[mask].shape[0]

    features = {
        'id': unique_dataset['id'].iloc[0],
        # "Last row" values: the final event id, word count and running time.
        'total_number_of_events': unique_dataset['event_id'].iloc[-1],
        'final_number_of_words': unique_dataset['word_count'].iloc[-1],
        'number_of_warnings_issued': unique_dataset['warning_issued'].sum(),
        'total_time_taken': unique_dataset['cumulative_writing_time'].iloc[-1],
        'total_pause_time': pauses.sum(),
        'average_pause_length': pauses.mean(),
        'proportion_pause_time': pauses.sum() / unique_dataset['cumulative_writing_time'].iloc[-1],
        # Row counts per activity label.
        'non_productive_events': row_count(activity == 'Nonproduction'),
        'input_events': row_count(activity == 'Input'),
        'deletion_events': row_count(activity == 'Remove/Cut'),
        'addition_events': row_count(activity == 'Paste'),
        'replacement_events': row_count(activity == 'Replace'),
        'string_move_events': row_count(activity == 'MoveSection'),
        # Sentences are approximated by the number of '.' events.
        'number_of_sentences': row_count(event_type == '.'),
        'average_action_time': action_time.mean(),
        'median_action_time': action_time.median(),
        'min_action_time': action_time.min(),
        'max_action_time': action_time.max(),
        'std_action_time': action_time.std(),
        'average_cursor_distance': unique_dataset['cursor_move_distance'].mean(),
        'avg_word_count_btw_events': unique_dataset['word_count_change'].mean(),
        'total_mouse_clicks': row_count((activity == 'Nonproduction') & (event_type.str.contains('click'))),
        'total_arrow_btn_clicks': row_count((activity == 'Nonproduction') & (event_type.str.contains('Arrow'))),
        # Same quantity as 'average_pause_length'; both names are kept in the output.
        'average_time_between_events': pauses.mean(),
    }

    return pd.Series(list(features.values()), index=list(features.keys()))

In [141]:
print('Cleaning Train Dataset!')
cleaned_data = get_clean_data(input_train_dataset, config['redundant_features'], config['feature_rename'])

print('Preprocessing Train Data!')
# Row-level features are derived separately for each id; every pass regroups the
# output of the previous one, keeping the original row order (sort=False).
cleaned_data = (
    cleaned_data
    .groupby('id', group_keys=False, sort=False).apply(FeatureEngineering.get_capitalized_letters)
    .groupby('id', group_keys=False, sort=False).apply(FeatureEngineering.get_temporal_features)
    .groupby('id', group_keys=False, sort=False).apply(FeatureEngineering.get_cursor_features)
    .groupby('id', group_keys=False, sort=False).apply(FeatureEngineering.get_word_change_features)
)

# One summary row per id, extended with derived totals and per-event proportions,
# then joined to the scores on 'id'.
master_data = (
    cleaned_data.groupby('id').apply(calculate_features).reset_index(drop=True)
    .assign(
        total_writing_time=lambda d: d['total_time_taken'] - d['total_pause_time'],
        proportion_np_events=lambda d: d['non_productive_events'] / d['total_number_of_events'],
        proportion_input_events=lambda d: d['input_events'] / d['total_number_of_events'],
        proportion_delete_events=lambda d: d['deletion_events'] / d['total_number_of_events'],
        proportion_addition_events=lambda d: d['addition_events'] / d['total_number_of_events'],
        proportion_replace_events=lambda d: d['replacement_events'] / d['total_number_of_events'],
        proportion_moving_events=lambda d: d['string_move_events'] / d['total_number_of_events'],
    )
    .merge(y_train, on='id')
)

print('Preprocessing Complete!')

Cleaning Train Dataset!
Preprocessing Train Data!
Preprocessing Complete!


In [142]:
print('Cleaning Test Dataset!')
cleaned_data_test = get_clean_data(input_test_dataset, config['redundant_features'], config['feature_rename'])

print('Preprocessing Test Data!')
# Same per-id feature passes as for the train logs, applied in the same order.
cleaned_data_test = (
    cleaned_data_test
    .groupby('id', group_keys=False, sort=False).apply(FeatureEngineering.get_capitalized_letters)
    .groupby('id', group_keys=False, sort=False).apply(FeatureEngineering.get_temporal_features)
    .groupby('id', group_keys=False, sort=False).apply(FeatureEngineering.get_cursor_features)
    .groupby('id', group_keys=False, sort=False).apply(FeatureEngineering.get_word_change_features)
)

# One summary row per id plus the derived columns, in the same column order as the
# train features (there is no score to merge for the test set).
master_data_test = (
    cleaned_data_test.groupby('id').apply(calculate_features).reset_index(drop=True)
    .assign(
        total_writing_time=lambda d: d['total_time_taken'] - d['total_pause_time'],
        proportion_np_events=lambda d: d['non_productive_events'] / d['total_number_of_events'],
        proportion_input_events=lambda d: d['input_events'] / d['total_number_of_events'],
        proportion_delete_events=lambda d: d['deletion_events'] / d['total_number_of_events'],
        proportion_addition_events=lambda d: d['addition_events'] / d['total_number_of_events'],
        proportion_replace_events=lambda d: d['replacement_events'] / d['total_number_of_events'],
        proportion_moving_events=lambda d: d['string_move_events'] / d['total_number_of_events'],
    )
)

print('Preprocessing Complete!')

Cleaning Test Dataset!
Preprocessing Test Data!
Preprocessing Complete!


In [143]:
print('Creating X and y Dataframes!')

# Move 'id' into the index so that only feature (and score) columns remain as data.
# The guard makes the cell safe to re-run: once 'id' is the index it is no longer a
# column, and an unconditional set_index('id') would raise KeyError.
if 'id' in master_data.columns:
    master_data = master_data.set_index('id')
if 'id' in master_data_test.columns:
    master_data_test = master_data_test.set_index('id')

# Split the target from the training features.
y = master_data['score']
X = master_data.drop(columns=['score'])

scalar = StandardScaler()
transformer = PowerTransformer()

# Both steps are fitted on the training features only; the test features are
# transformed with the parameters learned there.
X_train = transformer.fit_transform(scalar.fit_transform(X))
X_test = transformer.transform(scalar.transform(master_data_test))

Creating X and y Dataframes!


In [144]:
class RegressorEnsemble(BaseEstimator, RegressorMixin):
    """Blending ensemble: base regressors combined by a linear meta-model.

    ``fit`` splits the data once (67% / 33%, fixed seed), trains every base model on
    the larger part and fits a ``LinearRegression`` on their predictions for the
    held-out part. ``predict`` feeds the base-model predictions through that
    meta-model.

    Parameters
    ----------
    model_params : dict
        Per-model keyword arguments. Only the 'gbr' entry is read, and only when the
        default model list is built.
    models_list : list, optional
        ``(name, estimator)`` pairs to use instead of the default
        GradientBoosting / RandomForest / LightGBM trio.
    """

    def __init__(self, model_params: dict, models_list: list = None):
        # Stored because BaseEstimator.get_params() - used by clone() and repr() - reads
        # every __init__ argument back from the instance; without it those calls raise
        # AttributeError.
        self.model_params = model_params
        self.models_list = [
            ('gbr', GradientBoostingRegressor(random_state=0, **model_params['gbr'])),
            ('rfr', RandomForestRegressor(random_state=0)),
            ('lgbm', LGBMRegressor(random_state=0)),
        ] if models_list is None else models_list

        self.blending_model = None

    def _stack_predictions(self, X):
        """Return the base-model predictions for ``X`` as columns of one 2-D array."""
        columns = []

        for _, model_object in self.models_list:
            yhat = model_object.predict(X)
            columns.append(yhat.reshape(len(yhat), 1))

        return np.hstack(columns)

    def fit(self, X, y=None):
        """Train the base models and the blending model; returns ``self``."""
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=0)

        for _, model_object in self.models_list:
            model_object.fit(X_train, y_train)

        # The meta-model only sees predictions on rows the base models were not trained on.
        self.blending_model = LinearRegression().fit(self._stack_predictions(X_val), y_val)

        return self

    def predict(self, X, y=None):
        """Predict by blending the base-model predictions; ``y`` is ignored.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``. It subclasses both ValueError and AttributeError,
            so code that caught the earlier AttributeError keeps working.
        """
        if self.blending_model is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError('RegressorEnsemble is not fitted yet; call fit() before predict().')

        return self.blending_model.predict(self._stack_predictions(X))

In [145]:
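# NOTE: earlier single-model (GradientBoostingRegressor) baseline, kept commented out for reference.
# The output shown under this cell is from a run made before the code was commented out.
# print('Building Model Object!')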

# model_params = {
#     'n_estimators': 938, 'learning_rate': 0.0878442675959586, 
#     'max_depth': 6, 'min_samples_split': 0.13437890663689403, 
#     'min_samples_leaf': 0.10043950591718874, 'subsample': 0.6094719649466178
# }

# reg_model = GradientBoostingRegressor(random_state=0, **model_params).fit(X_train, y)

# y_hat_train = reg_model.predict(X_train)
# y_hat_test = reg_model.predict(X_test)

# submission_data = pd.DataFrame({'id': master_data_test.index, 'score': y_hat_test})

# print(f'R^2 Score: {round(reg_model.score(X_train, y), 3)},', 
#       f'RMSE Score: {round(mean_squared_error(y, y_hat_train, squared=False), 3)},',
#       f'Rounded RMSE Score: {round(rounded_rmse(y, y_hat_train), 3)}')

Building Model Object!
R^2 Score: 0.729, RMSE Score: 0.533, Rounded RMSE Score: 0.549


In [146]:
print('Building Model Object!')

# Hyperparameters for the gradient-boosting member of the ensemble.
model_params = {
    'n_estimators': 938, 'learning_rate': 0.0878442675959586, 
    'max_depth': 6, 'min_samples_split': 0.13437890663689403, 
    'min_samples_leaf': 0.10043950591718874, 'subsample': 0.6094719649466178
}

reg_model = RegressorEnsemble(model_params={'gbr': model_params}).fit(X_train, y)

y_hat_train = reg_model.predict(X_train)
y_hat_test = reg_model.predict(X_test)

submission_data = pd.DataFrame({'id': master_data_test.index, 'score': y_hat_test})

# NOTE: these scores are computed on the same rows the model was fitted on, so they
# are in-sample figures rather than an estimate of generalisation error.
# The `squared` keyword of mean_squared_error was removed in scikit-learn 1.6, so the
# square root is taken explicitly.
print(f'R^2 Score: {round(reg_model.score(X_train, y), 3)},', 
      f'RMSE Score: {round(float(np.sqrt(mean_squared_error(y, y_hat_train))), 3)},',
      f'Rounded RMSE Score: {round(rounded_rmse(y, y_hat_train), 3)}')

Building Model Object!
R^2 Score: 0.781, RMSE Score: 0.479, Rounded RMSE Score: 0.496


In [147]:
# Preview the submission frame: one predicted score per test id.
submission_data

Unnamed: 0,id,score
0,0000aaaa,1.892196
1,2222bbbb,1.860647
2,4444cccc,1.919023


In [148]:
# Inspect the ensemble's predictions on the training features.
y_hat_train

array([3.34127029, 3.53916431, 5.23036519, ..., 2.0642793 , 4.51863171,
       3.72763386])

In [149]:
# Write the predictions to submission.csv, leaving out the DataFrame index column.
submission_data.to_csv('submission.csv', index=False)