In [2]:
import numpy as np
import pandas as pd

import os
import warnings
import optuna
import re
import string

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.svm import LinearSVR
from sklearn.linear_model import LinearRegression

from lightgbm import LGBMRegressor
from lazypredict.Supervised import LazyRegressor
from typing import Union

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [3]:
config = {
    'redundant_features': ['up_event'],
    'feature_rename': {
        'down_event': 'event_type'
    }
}

input_dataset = pd.read_csv('./data/train_logs.csv')
y_train = pd.read_csv('./data/train_scores.csv')

In [4]:
# input_dataset = input_dataset[input_dataset['id'].isin(['001519c8', '0022f953', '0042269b', 'ffccd6fd', 'ffec5b38', 'fff05981'])]
# input_dataset

In [5]:
def get_essay_paragh(dataframe: pd.DataFrame) -> pd.Series:
    textInputDf = dataframe[['id', 'activity', 'cursor_position', 'text_change']].copy()
    textInputDf = textInputDf[textInputDf.activity != 'Nonproduction']

    # Vectorize actions based on activity type
    def apply_actions(group):
        essayText = ""
        for activity, cursor_position, text_change in zip(group['activity'], group['cursor_position'], group['text_change']):
            if activity == 'Replace':
                replaceTxt = text_change.split(' => ')
                essayText = essayText[:cursor_position - len(replaceTxt[1])] + replaceTxt[1] + essayText[cursor_position - len(replaceTxt[1]) + len(replaceTxt[0]):]
            elif activity == 'Paste':
                essayText = essayText[:cursor_position - len(text_change)] + text_change + essayText[cursor_position - len(text_change):]
            elif activity == 'Remove/Cut':
                essayText = essayText[:cursor_position] + essayText[cursor_position + len(text_change):]
            elif "M" in activity:
                move_info = activity[activity.index('[') + 1:activity.index(']')]
                move_from, move_to = [int(val) for val in move_info.split(',')]
                if move_from != move_to:
                    if move_from < move_to:
                        essayText = essayText[:move_from] + essayText[move_to:move_to + len(text_change)] + essayText[move_from:move_to] + essayText[move_to + len(text_change):]
                    else:
                        essayText = essayText[:move_to] + essayText[move_from:move_from + len(text_change)] + essayText[move_to:move_from] + essayText[move_from + len(text_change):]
            else:
                essayText = essayText[:cursor_position - len(text_change)] + text_change + essayText[cursor_position - len(text_change):]
        return essayText

    # Apply actions to each group (id) and convert to Series
    essaySeries = textInputDf.groupby('id').apply(apply_actions).to_frame().rename(columns={0: 'essay'}).squeeze()

    return essaySeries

def q1(x):
    return x.quantile(0.25)
def q3(x):
    return x.quantile(0.75)

def split_and_aggregate_sentences(df):
    AGGREGATIONS = ['count', 'mean', 'min', 'max', 'first', 'last', q1, 'median', q3, 'sum']

    df['sent'] = df['essay'].str.split('\\.|\\?|\\!')
    df = df.explode('sent')
    df['sent'] = df['sent'].str.replace('\n', '').str.strip()
    df['sent_len'] = df['sent'].str.len()
    df['sent_word_count'] = df['sent'].str.split().str.len()

    grouped = df.groupby('id')
    agg_df = pd.concat([
        grouped[['sent_len']].agg(AGGREGATIONS),
        grouped[['sent_word_count']].agg(AGGREGATIONS)
    ], axis=1)

    agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns]
    agg_df.index.name = 'id'
    agg_df = agg_df.drop(columns=['sent_word_count_count'])
    agg_df = agg_df.rename(columns={'sent_len_count': 'sent_count'})

    return agg_df

def split_and_aggregate_paragraphs(df):
    AGGREGATIONS = ['count', 'mean', 'min', 'max', 'first', 'last', q1, 'median', q3, 'sum']

    df['paragraph'] = df['essay'].str.split('\n')
    df = df.explode('paragraph')
    # df = df[df['paragraph'].str.len() > 0] 

    df['paragraph_len'] = df['paragraph'].str.len() 
    df['paragraph_word_count'] = df['paragraph'].str.split().str.len()

    grouped = df.groupby('id')
    agg_df = pd.concat([
        grouped[['paragraph_len']].agg(AGGREGATIONS),
        grouped[['paragraph_word_count']].agg(AGGREGATIONS)
    ], axis=1)

    agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns] 
    agg_df.index.name = 'id' 
    agg_df = agg_df.drop(columns=['paragraph_word_count_count']) 
    agg_df = agg_df.rename(columns={'paragraph_len_count': 'paragraph_count'})

    return agg_df

def get_sentance_level_data(dataframe: pd.DataFrame) -> pd.DataFrame:
    paragh_data = get_essay_paragh(dataframe.copy())
    paragh_data = pd.DataFrame({'id': paragh_data.index, 'essay': paragh_data.values})
    
    sentance_agg = split_and_aggregate_sentences(paragh_data.copy())
    paragh_agg = split_and_aggregate_paragraphs(paragh_data.copy())

    master_data = pd.merge(sentance_agg, paragh_agg, on='id')

    return master_data

def get_activity_counts(dataframe: pd.DataFrame) -> pd.DataFrame:
    unidentified_columns = [
        '\x80', '\x96', '\x97', '\x9b', '¡', '¿', 'Â´', 'Ä±', 'Å\x9f', 'Ë\x86', 'â\x80\x93', 'ä', 'Unidentified', 'Dead', '0', 
        '1', '2', '5', 'AltGraph', 'Cancel', 'Clear', 'Meta', 'ContextMenu', 'ModeChange', 'OS', 'Pause', 'Process']
    function_clicks = ['F1', 'F10', 'F11', 'F12', 'F15', 'F2', 'F3', 'F6']
    mouse_clicks = ['Leftclick', 'Unknownclick', 'Rightclick', 'Middleclick']
    keyboard_clicks = [
    'Alt', 'ArrowDown', 'ArrowLeft', 'ArrowRight', 'ArrowUp', 'Backspace', 'CapsLock', 
    'Control','Delete', 'End', 'Enter', 'Escape', 'Home', 'Insert', 'NumLock', 'PageDown', 'PageUp', 
    'ScrollLock', 'Shift', 'Space', 'Tab']
    redundent_activity = [
    'AudioVolumeDown', 'AudioVolumeMute', 'AudioVolumeUp','MediaPlayPause', 'MediaTrackNext', 'MediaTrackPrevious']
    
    dataframe = dataframe.groupby(['id', 'down_event']).size().reset_index(name='count')
    dataframe = dataframe.pivot_table(index='id', columns='down_event', values='count', fill_value=0).reset_index()
    
    punct_columns = dataframe.columns[dataframe.columns.isin(list(string.punctuation))]
    input_columns = dataframe.columns[dataframe.columns.isin(list(string.ascii_lowercase) + list(string.ascii_uppercase))]
    unidnty_columns = dataframe.columns[dataframe.columns.isin(unidentified_columns)]
    func_columns = dataframe.columns[dataframe.columns.isin(function_clicks)]
    mouse_columns = dataframe.columns[dataframe.columns.isin(mouse_clicks)]
    keyboard_columns = dataframe.columns[dataframe.columns.isin(keyboard_clicks)]
    redundant_columns = dataframe.columns[dataframe.columns.isin(redundent_activity)]
    
    dataframe['punctuation'] = dataframe[punct_columns].sum(axis=1)
    dataframe['inputs'] = dataframe[input_columns].sum(axis=1)
    dataframe['unidentified'] = dataframe[unidnty_columns].sum(axis=1)
    dataframe['functions'] = dataframe[func_columns].sum(axis=1)
    dataframe['mouse_clicks'] = dataframe[mouse_columns].sum(axis=1)
    dataframe['keyboard_clicks'] = dataframe[keyboard_columns].sum(axis=1)
    dataframe['redundant'] = dataframe[redundant_columns].sum(axis=1)

    columns_to_drop = list(punct_columns) + list(input_columns) + list(unidnty_columns) + list(func_columns) + list(mouse_columns) + list(keyboard_columns) + list(redundant_columns) 
    
    dataframe = dataframe.drop(columns=columns_to_drop)
    dataframe = dataframe[[
        'id', 'punctuation', 'inputs', 'unidentified', 'functions', 'mouse_clicks', 'keyboard_clicks', 'redundant']]

    return dataframe.reset_index(drop=True)

In [6]:
def get_clean_data(X: pd.DataFrame, feature_list: list, rename_dict: dict) -> pd.DataFrame:
    X.loc[(X['up_event'] != X['down_event']) & (X['activity'] == 'Nonproduction'), 'down_event'] = 'NoEvent'
    X.loc[(X['up_event'] != X['down_event']) & (X['activity'] == 'Nonproduction'), 'up_event'] = 'NoEvent'
    
    X.loc[(X['up_event'] != X['down_event']) & (X['activity'] == 'Input'), 'up_event'] = 'q'
    X.loc[(X['up_event'] != X['down_event']) & (X['activity'] == 'Replace'), 'up_event'] = 'q'

    X.loc[X['activity'].str.contains('Move From'), 'activity'] = 'MoveSection'

    X = X.drop(columns=feature_list)
    X = X.rename(columns=rename_dict)

    return X

def rounded_rmse(y, y_pred, **kwargs):
    return mean_squared_error(y, np.round(y_pred * 2) / 2, squared=False)

In [7]:
class FeatureEngineering:

    @staticmethod
    def get_capitalized_letters(X: pd.DataFrame) -> pd.DataFrame:
        X['previous_event_type'] = X['event_type'].shift()
        X['capitalize_letters'] = (X['activity'] == 'Input') & (X['previous_event_type'] == 'Shift') & (X['event_type'] == 'q')
        
        X = X.drop(columns=['previous_event_type'])
        
        return X

    @staticmethod
    def get_temporal_features(X: pd.DataFrame) -> pd.DataFrame:
        X['previous_up_time'] = X['up_time'].shift().fillna(X['down_time'].iloc[0])
        X['time_between_events'] = X['down_time'] - X['previous_up_time']
        
        X['cumulative_writing_time'] = (X['action_time'] + X['time_between_events']).cumsum()

        X['warning_issued'] = X['time_between_events'] >= 120000
        X = X.drop(columns=['previous_up_time'])
        
        return X

    @staticmethod
    def get_cursor_features(X: pd.DataFrame) -> pd.DataFrame:
        X['previous_cursor_position'] = X['cursor_position'].shift().fillna(0)
        X['cursor_move_distance'] = X['cursor_position'] - X['previous_cursor_position']
        X['cursor_move_distance'] = X['cursor_move_distance'].abs()

        X = X.drop(columns=['previous_cursor_position'])

        return X

    @staticmethod
    def get_word_change_features(X: pd.DataFrame) -> pd.DataFrame:
        X['previous_word_count'] = X['word_count'].shift().fillna(0)
        X['word_count_change'] = X['word_count'] - X['previous_word_count']
        X['word_count_change'] = X['word_count_change'].abs()

        X = X.drop(columns=['previous_word_count'])

        return X

In [8]:
def calculate_features(unique_dataset):
    feature_list = [
        'id', 'total_number_of_events', 'final_number_of_words', 'number_of_warnings_issued',
        'total_time_taken', 'total_pause_time', 'average_pause_length', 'proportion_pause_time',
        'non_productive_events', 'input_events', 'deletion_events', 'addition_events', 'replacement_events', 'string_move_events',
        'number_of_sentences', 'average_action_time', 'median_action_time', 'min_action_time', 'max_action_time',
        'std_action_time', 'sum_action_time', 'average_cursor_distance', 'max_cursor_distance', 'total_cursor_distance', 'std_cursor_distance', 
        'avg_word_count_btw_events', 'min_time_between_events', 'max_time_between_events', 'std_time_between_events'
    ]
    
    data_values = []

    data_values.append(unique_dataset['id'].iloc[0])
    data_values.append(unique_dataset['event_id'].iloc[-1])
    data_values.append(unique_dataset['word_count'].iloc[-1])
    data_values.append(unique_dataset['warning_issued'].sum())
    data_values.append(unique_dataset['cumulative_writing_time'].iloc[-1])
    data_values.append(unique_dataset['time_between_events'].sum())

    data_values.append(unique_dataset['time_between_events'].mean())
    data_values.append(unique_dataset['time_between_events'].sum() / unique_dataset['cumulative_writing_time'].iloc[-1])

    data_values.extend([
        unique_dataset[unique_dataset['activity'] == 'Nonproduction'].shape[0],
        unique_dataset[unique_dataset['activity'] == 'Input'].shape[0],
        unique_dataset[unique_dataset['activity'] == 'Remove/Cut'].shape[0],
        unique_dataset[unique_dataset['activity'] == 'Paste'].shape[0],
        unique_dataset[unique_dataset['activity'] == 'Replace'].shape[0],
        unique_dataset[unique_dataset['activity'] == 'MoveSection'].shape[0],
    ])

    data_values.append(unique_dataset[unique_dataset['event_type'] == '.'].shape[0])
    data_values.append(unique_dataset['action_time'].mean())
    data_values.append(unique_dataset['action_time'].median())
    data_values.append(unique_dataset['action_time'].min())
    data_values.append(unique_dataset['action_time'].max())
    data_values.append(unique_dataset['action_time'].std())
    data_values.append(unique_dataset['action_time'].sum())

    data_values.append(unique_dataset['cursor_move_distance'].mean())
    data_values.append(unique_dataset['cursor_move_distance'].max())
    data_values.append(unique_dataset['cursor_move_distance'].sum())
    data_values.append(unique_dataset['cursor_move_distance'].std())
    
    data_values.append(unique_dataset['word_count_change'].mean())
    
    data_values.append(unique_dataset['time_between_events'].min())
    data_values.append(unique_dataset['time_between_events'].max())
    data_values.append(unique_dataset['time_between_events'].std())

    return pd.Series(data_values, index=feature_list)

In [9]:
def create_master_data(input_data: pd.DataFrame, config: dict) -> pd.DataFrame:
    print('Cleaning Train Dataset!')
    
    sent_paragh_data = get_sentance_level_data(input_data.copy())
    activity_count_data = get_activity_counts(input_data.copy())
    
    cleaned_data = get_clean_data(input_data, config['redundant_features'], config['feature_rename'])

    print('Preprocessing Train Data!')
    cleaned_data = cleaned_data.groupby('id', group_keys=False, sort=False).apply(FeatureEngineering.get_capitalized_letters)
    cleaned_data = cleaned_data.groupby('id', group_keys=False, sort=False).apply(FeatureEngineering.get_temporal_features)
    cleaned_data = cleaned_data.groupby('id', group_keys=False, sort=False).apply(FeatureEngineering.get_cursor_features)
    cleaned_data = cleaned_data.groupby('id', group_keys=False, sort=False).apply(FeatureEngineering.get_word_change_features)

    master_data = cleaned_data.groupby('id').apply(calculate_features).reset_index(drop=True)

    master_data = pd.merge(master_data, sent_paragh_data, on='id')
    master_data = pd.merge(master_data, activity_count_data, on='id')

    master_data['total_writing_time'] = master_data['total_time_taken'] - master_data['total_pause_time']

    master_data['proportion_np_events'] = master_data['non_productive_events'] / master_data['total_number_of_events']
    master_data['proportion_input_events'] = master_data['input_events'] / master_data['total_number_of_events']
    master_data['proportion_delete_events'] = master_data['deletion_events'] / master_data['total_number_of_events']
    master_data['proportion_addition_events'] = master_data['addition_events'] / master_data['total_number_of_events']
    master_data['proportion_replace_events'] = master_data['replacement_events'] / master_data['total_number_of_events']
    master_data['proportion_moving_events'] = master_data['string_move_events'] / master_data['total_number_of_events']

    print('Preprocessing Complete!')
    
    return master_data

In [10]:
master_data = create_master_data(input_data=input_dataset, config=config)
master_data = pd.merge(master_data, y_train, on='id')

master_data.to_csv('./data/master_data_v2.csv', index=False)

Cleaning Train Dataset!
Preprocessing Train Data!
Preprocessing Complete!


In [11]:
master_data

Unnamed: 0,id,total_number_of_events,final_number_of_words,number_of_warnings_issued,total_time_taken,total_pause_time,average_pause_length,proportion_pause_time,non_productive_events,input_events,deletion_events,addition_events,replacement_events,string_move_events,number_of_sentences,average_action_time,median_action_time,min_action_time,max_action_time,std_action_time,sum_action_time,average_cursor_distance,max_cursor_distance,total_cursor_distance,std_cursor_distance,avg_word_count_btw_events,min_time_between_events,max_time_between_events,std_time_between_events,sent_count,sent_len_mean,sent_len_min,sent_len_max,sent_len_first,sent_len_last,sent_len_q1,sent_len_median,sent_len_q3,sent_len_sum,sent_word_count_mean,sent_word_count_min,sent_word_count_max,sent_word_count_first,sent_word_count_last,sent_word_count_q1,sent_word_count_median,sent_word_count_q3,sent_word_count_sum,paragraph_count,paragraph_len_mean,paragraph_len_min,paragraph_len_max,paragraph_len_first,paragraph_len_last,paragraph_len_q1,paragraph_len_median,paragraph_len_q3,paragraph_len_sum,paragraph_word_count_mean,paragraph_word_count_min,paragraph_word_count_max,paragraph_word_count_first,paragraph_word_count_last,paragraph_word_count_q1,paragraph_word_count_median,paragraph_word_count_q3,paragraph_word_count_sum,punctuation,inputs,unidentified,functions,mouse_clicks,keyboard_clicks,redundant,total_writing_time,proportion_np_events,proportion_input_events,proportion_delete_events,proportion_addition_events,proportion_replace_events,proportion_moving_events,score
0,001519c8,2557,255,1,1797443.00,1500200.00,586.70,0.83,120,2010,417,0,7,3,21,116.25,112.00,0,2259,91.80,297243,4.16,1350.00,10632.00,43.17,0.17,-142.00,154136.00,4293.20,15,99.07,0,196,31,0,69.00,119.00,125.00,1486,17.13,0,29,6,0,11.00,21.00,22.00,257,5,304.80,0,654,390,480,0.00,390.00,480.00,1524,51.40,0,107,69,81,0.00,69.00,81.00,257,37.00,1619.00,0.00,0.00,92.00,809.00,0.00,297243.00,0.05,0.79,0.16,0.00,0.00,0.00,3.50
1,0022f953,2454,320,1,1758346.00,1482955.00,604.30,0.84,254,1938,260,1,1,0,15,112.22,115.00,0,1758,55.43,275391,9.82,1581.00,24087.00,84.77,0.17,-166.00,145899.00,4896.32,16,100.94,0,226,19,0,45.50,87.00,146.25,1615,20.19,0,45,3,0,9.00,18.50,30.50,323,7,238.43,0,462,240,0,200.50,240.00,283.00,1669,46.29,0,90,52,0,34.50,52.00,56.50,324,53.00,1490.00,0.00,0.00,56.00,855.00,0.00,275391.00,0.10,0.79,0.11,0.00,0.00,0.00,3.50
2,0042269b,4136,404,1,1767228.00,1346027.00,325.44,0.76,175,3515,439,0,7,0,21,101.84,94.00,0,3005,82.38,421201,6.53,1862.00,27007.00,71.78,0.17,-250.00,153886.00,3936.89,20,127.15,0,189,139,0,103.75,135.50,161.00,2543,20.40,0,29,21,0,17.00,21.00,26.25,408,11,234.27,0,568,491,296,0.00,296.00,444.50,2577,37.18,0,88,79,45,0.00,45.00,73.50,409,49.00,2904.00,0.00,0.00,130.00,1053.00,0.00,421201.00,0.04,0.85,0.11,0.00,0.00,0.00,6.00
3,0059420b,1556,206,0,1363074.00,1173478.00,754.16,0.86,99,1304,151,1,1,0,13,121.85,110.00,0,806,113.77,189596,1.46,357.00,2267.00,9.92,0.18,-516.00,101690.00,4240.83,14,80.64,0,144,99,0,56.00,80.00,99.00,1129,14.86,0,27,17,0,11.00,14.50,17.75,208,3,384.00,347,449,347,356,351.50,356.00,402.50,1152,69.00,61,81,61,65,63.00,65.00,73.00,207,21.00,1044.00,0.00,0.00,19.00,472.00,0.00,189596.00,0.06,0.84,0.10,0.00,0.00,0.00,2.00
4,0075873a,2531,252,0,1584002.00,1270300.00,501.90,0.80,72,1942,517,0,0,0,23,123.94,129.00,0,701,62.08,313702,2.80,643.00,7094.00,24.25,0.17,-158.00,110688.00,3895.45,17,81.71,0,182,75,0,60.00,73.00,106.00,1389,15.00,0,35,11,0,11.00,12.00,18.00,255,9,157.44,0,627,351,23,0.00,23.00,292.00,1417,28.33,0,114,61,3,0.00,3.00,52.00,255,67.00,1541.00,0.00,0.00,33.00,890.00,0.00,313702.00,0.03,0.77,0.20,0.00,0.00,0.00,4.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2466,ffb8c745,4739,273,1,1769182.00,1269512.00,267.89,0.72,189,3588,960,0,2,0,43,105.44,113.00,0,3323,63.62,499670,1.99,1262.00,9432.00,27.21,0.20,-117.00,128570.00,3458.46,14,112.43,0,180,79,0,83.25,120.50,143.75,1574,19.50,0,33,16,0,15.25,22.50,24.00,273,4,407.75,301,514,372,301,354.25,408.00,461.50,1631,68.25,54,78,64,54,61.50,70.50,77.25,273,88.00,2844.00,0.00,0.00,24.00,1783.00,0.00,499670.00,0.04,0.76,0.20,0.00,0.00,0.00,3.50
2467,ffbef7e5,2604,438,1,1777442.00,1563221.00,600.32,0.88,148,2395,60,0,1,0,31,82.27,80.00,0,1144,36.18,214221,2.82,1124.00,7341.00,35.61,0.18,-64.00,267869.00,5629.56,30,75.70,0,175,143,0,49.00,66.00,103.00,2271,14.77,0,33,27,0,9.25,13.00,19.00,443,13,178.69,0,648,144,0,0.00,0.00,412.00,2323,34.08,0,119,27,0,0.00,0.00,80.00,443,63.00,1874.00,0.00,0.00,38.00,629.00,0.00,214221.00,0.06,0.92,0.02,0.00,0.00,0.00,4.00
2468,ffccd6fd,3063,201,2,1935881.00,1704301.00,556.42,0.88,126,2849,88,0,0,0,5,75.61,70.00,0,564,63.49,231580,1.34,427.00,4117.00,8.51,0.07,-87.00,229804.00,5397.25,5,221.60,0,359,223,0,200.00,223.00,326.00,1108,40.60,0,61,42,0,41.00,42.00,59.00,203,7,393.57,0,2002,426,2002,0.00,0.00,376.50,2755,29.00,0,83,83,61,0.00,0.00,60.00,203,7.00,969.00,0.00,0.00,9.00,2078.00,0.00,231580.00,0.04,0.93,0.03,0.00,0.00,0.00,1.50
2469,ffec5b38,3242,413,1,1488619.00,1199180.00,369.89,0.81,71,2895,276,0,0,0,31,89.28,85.00,0,1388,54.52,289439,1.81,563.00,5881.00,16.15,0.16,-132.00,127733.00,3461.54,28,89.29,0,176,79,0,61.00,96.00,105.25,2500,14.93,0,29,11,0,10.75,14.50,18.25,418,5,509.60,380,672,672,380,394.00,540.00,562.00,2548,83.40,62,111,111,62,66.00,85.00,93.00,417,71.00,2361.00,0.00,0.00,14.00,796.00,0.00,289439.00,0.02,0.89,0.09,0.00,0.00,0.00,5.00


In [None]:
print('Creating X and y Dataframes!')

master_data = pd.read_csv('./data/master_data_v2.csv')
master_data = master_data.set_index('id')

y = master_data['score']
X = master_data.drop(columns=['score'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

scalar = StandardScaler()
transformer = PowerTransformer()

X_train = transformer.fit_transform(scalar.fit_transform(X_train))
X_test = transformer.transform(scalar.transform(X_test))

In [10]:
class RegressorEnsemble(BaseEstimator, RegressorMixin):

    def __init__(self, model_params: dict, models_list: list = None):
        self.models_list = [
            ('gbr', GradientBoostingRegressor(random_state=0, **model_params['gbr'])),
            ('rfr', RandomForestRegressor(random_state=0, **model_params['rfr'])),
            ('lgbm', LGBMRegressor(
                metric='rmse', objective='regression', random_state=0, **model_params['lgbm'])),
        ] if models_list is None else models_list
        
        self.blending_model = None

    def fit(self, X, y=None):
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=0)
        meta_X = list()
        
        for _, model_object in self.models_list:
            model_object.fit(X_train, y_train)
            yhat = model_object.predict(X_val)
            
            yhat = yhat.reshape(len(yhat), 1)
            meta_X.append(yhat)
            
        self.blending_model = LinearRegression().fit(np.hstack(meta_X), y_val)
        
        return self
    
    def predict(self, X, y=None):
        meta_X = list()
        
        for _, model_object in self.models_list:
            yhat = model_object.predict(X)
            
            yhat = yhat.reshape(len(yhat), 1)
            meta_X.append(yhat)
            
        return self.blending_model.predict(np.hstack(meta_X))

In [11]:
class OptunaTuning:
    
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
        self.scalar_dict = {'StandardScaler': StandardScaler, 'RobustScaler': RobustScaler, 'MinMaxScaler': MinMaxScaler}

    def get_model_params(self, trial):
        return {
            'gbr': {                
                'n_estimators': trial.suggest_int('gbr_n_estimators', 50, 1000),
                'learning_rate': trial.suggest_float('gbr_learning_rate', 0.01, 0.1),
                'max_depth': trial.suggest_int('gbr_max_depth', 3, 10),
                
                'min_samples_split': trial.suggest_float('gbr_min_samples_split', 0.1, 1.0),
                'min_samples_leaf': trial.suggest_float('gbr_min_samples_leaf', 0.1, 0.5),
                'subsample': trial.suggest_float('gbr_subsample', 0.5, 1.0)
            },
            
            'rfr': {                
                'n_estimators': trial.suggest_int('rfr_n_estimators', 10, 1000),
                'max_depth': trial.suggest_int('rfr_max_depth', 1, 32),
                'min_samples_split': trial.suggest_float('rfr_min_samples_split', 0.1, 1.0),
                'min_samples_leaf': trial.suggest_float('rfr_min_samples_leaf', 0.1, 0.5)
            },
            
            'lgbm': {                
                'boosting_type': trial.suggest_categorical('lgbm_boosting_type', ['gbdt', 'dart']),
                'n_estimators': trial.suggest_int('lgbm_n_estimators', 50, 1000),
                'learning_rate': trial.suggest_float('lgbm_learning_rate', 0.01, 0.1),
                
                'num_leaves': trial.suggest_int('lgbm_num_leaves', 10, 100),
                'max_depth': trial.suggest_int('lgbm_max_depth', 3, 15),
                'min_child_samples': trial.suggest_int('lgbm_min_child_samples', 5, 50),
                
                'subsample': trial.suggest_float('lgbm_subsample', 0.5, 1.0),
                'colsample_bytree': trial.suggest_float('lgbm_colsample_bytree', 0.5, 1.0),
                
                'reg_alpha': trial.suggest_float('lgbm_reg_alpha', 0.0, 1.0),
                'reg_lambda': trial.suggest_float('lgbm_reg_lambda', 0.0, 1.0)
            }
        }

    def objective(self, trial):
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, self.y, test_size=0.20, random_state=0, stratify=y, shuffle=True)
        
        scalar_object = self.scalar_dict[trial.suggest_categorical('scalar_object', list(self.scalar_dict.keys()))]()
        transformer_object = PowerTransformer()

        X_train = scalar_object.fit_transform(X_train)
        X_train = transformer_object.fit_transform(X_train)

        X_test = scalar_object.transform(X_test)
        X_test = transformer_object.transform(X_test)
        
        model_params = self.get_model_params(trial)
        regressor_model = RegressorEnsemble(model_params=model_params).fit(X_train, y_train)

        y_pred = np.round(regressor_model.predict(X_test), 3)
        
        return round(mean_squared_error(y_test, y_pred, squared=False), 4)

In [12]:
def remove_corr_features(dataframe: pd.DataFrame) -> list:
    corr_matrix = dataframe.corr().abs()
    
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
    
    return to_drop

master_data = pd.read_csv('./data/master_data_v2.csv')
master_data = master_data.set_index('id')

y = master_data['score']
X = master_data.drop(columns=['score'])

drop_columns = remove_corr_features(X)
X = X.drop(columns=drop_columns)

print(f'No. of Features: {X.shape[1]}')
print(f'Feature List: {X.columns}')

tuning_object = OptunaTuning(X, y)
    
study = optuna.create_study(direction='minimize')
study.optimize(tuning_object.objective, n_trials=500, n_jobs=-1)

[I 2024-01-09 18:30:10,735] A new study created in memory with name: no-name-bdcddbda-e78b-409c-bb01-f9f91871b274


No. of Features: 56
Feature List: Index(['total_number_of_events', 'final_number_of_words',
       'average_pause_length', 'proportion_pause_time',
       'non_productive_events', 'input_events', 'deletion_events',
       'addition_events', 'replacement_events', 'string_move_events',
       'number_of_sentences', 'average_action_time', 'median_action_time',
       'min_action_time', 'max_action_time', 'sum_action_time',
       'average_cursor_distance', 'max_cursor_distance',
       'total_cursor_distance', 'std_cursor_distance',
       'avg_word_count_btw_events', 'min_time_between_events',
       'max_time_between_events', 'std_time_between_events', 'sent_count',
       'sent_len_mean', 'sent_len_min', 'sent_len_max', 'sent_len_first',
       'sent_len_last', 'sent_len_q1', 'sent_len_median', 'paragraph_count',
       'paragraph_len_mean', 'paragraph_len_min', 'paragraph_len_max',
       'paragraph_len_first', 'paragraph_len_last', 'paragraph_len_q1',
       'paragraph_len_median', '

[I 2024-01-09 18:30:20,101] Trial 10 finished with value: 0.6576 and parameters: {'scalar_object': 'RobustScaler', 'gbr_n_estimators': 432, 'gbr_learning_rate': 0.05793560401948634, 'gbr_max_depth': 10, 'gbr_min_samples_split': 0.6330091181178303, 'gbr_min_samples_leaf': 0.11896496756587559, 'gbr_subsample': 0.5204582374510369, 'rfr_n_estimators': 761, 'rfr_max_depth': 4, 'rfr_min_samples_split': 0.9358733076564069, 'rfr_min_samples_leaf': 0.4358419357581127, 'lgbm_boosting_type': 'dart', 'lgbm_n_estimators': 199, 'lgbm_learning_rate': 0.08642162317338865, 'lgbm_num_leaves': 85, 'lgbm_max_depth': 15, 'lgbm_min_child_samples': 26, 'lgbm_subsample': 0.8111718696567025, 'lgbm_colsample_bytree': 0.5221016650931041, 'lgbm_reg_alpha': 0.9536554198880397, 'lgbm_reg_lambda': 0.95785493851586}. Best is trial 10 with value: 0.6576.
[I 2024-01-09 18:30:21,515] Trial 3 finished with value: 0.6866 and parameters: {'scalar_object': 'RobustScaler', 'gbr_n_estimators': 814, 'gbr_learning_rate': 0.0417

In [13]:
study.best_params

{'scalar_object': 'StandardScaler',
 'gbr_n_estimators': 409,
 'gbr_learning_rate': 0.07361180235738161,
 'gbr_max_depth': 10,
 'gbr_min_samples_split': 0.6482497101518737,
 'gbr_min_samples_leaf': 0.31972085330435496,
 'gbr_subsample': 0.9055662628017166,
 'rfr_n_estimators': 425,
 'rfr_max_depth': 9,
 'rfr_min_samples_split': 0.4596197102513105,
 'rfr_min_samples_leaf': 0.22695246564074448,
 'lgbm_boosting_type': 'dart',
 'lgbm_n_estimators': 115,
 'lgbm_learning_rate': 0.0221079754217087,
 'lgbm_num_leaves': 96,
 'lgbm_max_depth': 12,
 'lgbm_min_child_samples': 7,
 'lgbm_subsample': 0.9063815116324397,
 'lgbm_colsample_bytree': 0.5191190249507341,
 'lgbm_reg_alpha': 0.7733601728487565,
 'lgbm_reg_lambda': 0.745926710711842}

In [14]:
gbr, rfr, lgbm = {}, {}, {}

for key, value in study.best_params.items():

    if key[:3] == 'gbr':
        gbr[key[4:]] = value
    if key[:3] == 'rfr':
        rfr[key[4:]] = value
    if key[:4] == 'lgbm':
        lgbm[key[5:]] = value

{'gbr': gbr, 'rfr': rfr, 'lgbm': lgbm}

{'gbr': {'n_estimators': 409,
  'learning_rate': 0.07361180235738161,
  'max_depth': 10,
  'min_samples_split': 0.6482497101518737,
  'min_samples_leaf': 0.31972085330435496,
  'subsample': 0.9055662628017166},
 'rfr': {'n_estimators': 425,
  'max_depth': 9,
  'min_samples_split': 0.4596197102513105,
  'min_samples_leaf': 0.22695246564074448},
 'lgbm': {'boosting_type': 'dart',
  'n_estimators': 115,
  'learning_rate': 0.0221079754217087,
  'num_leaves': 96,
  'max_depth': 12,
  'min_child_samples': 7,
  'subsample': 0.9063815116324397,
  'colsample_bytree': 0.5191190249507341,
  'reg_alpha': 0.7733601728487565,
  'reg_lambda': 0.745926710711842}}

In [6]:
X_train

Unnamed: 0_level_0,total_number_of_events,final_number_of_words,number_of_warnings_issued,total_time_taken,total_pause_time,average_pause_length,proportion_pause_time,non_productive_events,input_events,deletion_events,addition_events,replacement_events,string_move_events,number_of_sentences,average_action_time,median_action_time,min_action_time,max_action_time,sum_action_time,average_cursor_distance,max_cursor_distance,total_cursor_distance,std_cursor_distance,avg_word_count_btw_events,min_time_between_events,max_time_between_events,std_time_between_events,sent_count,sent_len_mean,sent_len_min,sent_len_max,sent_len_first,sent_len_last,sent_len_q1,sent_len_median,paragraph_count,paragraph_len_mean,paragraph_len_min,paragraph_len_max,paragraph_len_first,paragraph_len_last,paragraph_len_q1,paragraph_len_median,paragraph_len_q3,punctuation,unidentified,functions,mouse_clicks,keyboard_clicks,redundant,proportion_np_events,proportion_input_events,proportion_delete_events,proportion_addition_events,proportion_replace_events,proportion_moving_events
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1
4854cde9,4533,558,0,1777357.00,1485482.00,327.70,0.84,101,3772,660,0,0,0,35,64.39,60.00,0,492,291875,1.07,106.00,4872.00,2.79,0.17,-62.00,55792.00,1661.41,37,82.03,0,235,97,0,58.00,73.00,11,282.00,0,756,503,169,0.00,169.00,542.00,106.00,0.00,0.00,10.00,1414.00,0.00,0.02,0.83,0.15,0.00,0.00,0.00
5e25026d,2617,231,0,1811246.00,1534282.00,586.28,0.85,177,1869,571,0,0,0,17,105.83,96.00,0,1511,276964,1.23,256.00,3212.00,7.33,0.20,-357.00,51864.00,2483.28,17,69.53,0,193,91,0,45.00,63.00,14,91.79,0,546,50,1,0.00,3.00,116.00,27.00,0.00,0.00,34.00,1210.00,0.00,0.07,0.71,0.22,0.00,0.00,0.00
df74824f,3672,467,0,1832587.00,1609345.00,438.27,0.88,80,3218,370,3,1,0,28,60.80,54.00,0,4978,223242,3.30,1416.00,12127.00,38.19,0.18,-42.00,68473.00,2236.85,26,106.27,0,271,61,0,69.25,80.50,19,147.95,0,585,249,221,0.00,180.00,235.00,62.00,0.00,0.00,50.00,955.00,0.00,0.02,0.88,0.10,0.00,0.00,0.00
fe5f3bdb,3765,281,1,1686428.00,1182561.00,314.09,0.70,70,2660,1035,0,0,0,27,133.83,147.00,0,508,503867,1.27,268.00,4780.00,7.24,0.17,-309.00,121784.00,2757.18,22,71.36,0,156,69,0,50.25,63.00,4,405.50,267,508,344,267,324.75,423.50,504.25,53.00,0.00,0.00,17.00,1558.00,0.00,0.02,0.71,0.27,0.00,0.00,0.00
40145912,2400,353,2,1784190.00,1629391.00,678.91,0.91,100,2177,122,0,1,0,19,64.50,60.00,8,2328,154799,1.56,360.00,3734.00,10.92,0.17,-55.00,164604.00,4796.42,19,103.32,0,199,101,0,78.00,100.00,3,668.00,359,1172,359,1172,416.00,473.00,822.50,25.00,0.00,0.00,57.00,565.00,0.00,0.04,0.91,0.05,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213d79d9,1073,201,1,1656005.00,1444395.00,1346.13,0.87,22,1027,24,0,0,0,22,197.21,198.00,1,500,211610,2.14,612.00,2291.00,23.12,0.20,-128.00,1213150.00,37042.70,23,41.22,8,106,49,8,27.00,38.00,8,124.50,0,393,139,393,24.00,91.50,166.50,34.00,0.00,0.00,8.00,253.00,0.00,0.02,0.96,0.02,0.00,0.00,0.00
60d574e2,3727,482,1,1769552.00,1458570.00,391.35,0.82,427,3055,245,0,0,0,30,83.44,79.00,0,8233,310982,3.46,2206.00,12912.00,51.77,0.15,-69.00,156628.00,2967.62,26,104.54,0,238,119,0,62.00,94.00,12,233.25,0,847,345,8,0.00,4.00,396.75,60.00,0.00,0.00,45.00,1201.00,0.00,0.11,0.82,0.07,0.00,0.00,0.00
54687e6e,5223,754,0,1770306.00,1284057.00,245.85,0.73,414,4453,356,0,0,0,53,93.10,92.00,0,520,486249,1.31,814.00,6865.00,13.46,0.17,-202.00,38174.00,1020.27,66,60.06,0,162,67,0,30.25,56.00,11,371.55,119,594,265,380,290.50,380.00,444.00,143.00,0.00,0.00,13.00,1593.00,0.00,0.08,0.85,0.07,0.00,0.00,0.00
8e884143,3549,408,1,1685103.00,1363778.00,384.27,0.81,116,2929,504,0,0,0,26,90.54,84.00,0,4049,321325,4.20,1943.00,14901.00,49.44,0.18,-140.00,125549.00,3083.03,23,95.87,0,281,34,0,52.50,79.00,7,320.71,0,658,658,544,0.00,515.00,536.00,82.00,0.00,0.00,64.00,1075.00,0.00,0.03,0.83,0.14,0.00,0.00,0.00


In [None]:
class Utility:

    @staticmethod
    def q1(x: pd.Series) -> Union[int, float]:
        return x.quantile(0.25)

    @staticmethod
    def q3(x: pd.Series) -> Union[int, float]:
        return x.quantile(0.75)

In [None]:
class EssayLevelAggregations:

    @staticmethod
    def get_essay_paragraph(dataframe: pd.DataFrame) -> pd.Series:
        textInputDf = dataframe[['id', 'activity', 'cursor_position', 'text_change']].copy()
        textInputDf = textInputDf[textInputDf.activity != 'Nonproduction']
    
        def apply_actions(group: pd.DataFrame) -> str:
            essayText = str()
            
            for activity, cursor_position, text_change in zip(group['activity'], group['cursor_position'], group['text_change']):
                if activity == 'Replace':
                    replaceTxt = text_change.split(' => ')
                    essayText = essayText[:cursor_position - len(replaceTxt[1])] + \
                                replaceTxt[1] + \
                                essayText[cursor_position - len(replaceTxt[1]) + len(replaceTxt[0]):]
                
                elif activity == 'Paste':
                    essayText = essayText[:cursor_position - len(text_change)] + \
                                text_change + \
                                essayText[cursor_position - len(text_change):]
                
                elif activity == 'Remove/Cut':
                    essayText = essayText[:cursor_position] + \
                                essayText[cursor_position + len(text_change):]
                
                elif "M" in activity:
                    move_info = activity[activity.index('[') + 1:activity.index(']')]
                    move_from, move_to = [int(val) for val in move_info.split(',')]
                    
                    if move_from != move_to:
                        if move_from < move_to:
                            essayText = essayText[:move_from] + \
                                        essayText[move_to:move_to + len(text_change)] + \
                                        essayText[move_from:move_to] + \
                                        essayText[move_to + len(text_change):]
                        
                        else:
                            essayText = essayText[:move_to] + \
                                        essayText[move_from:move_from + len(text_change)] + \
                                        essayText[move_to:move_from] + \
                                        essayText[move_from + len(text_change):]
                else:
                    essayText = essayText[:cursor_position - len(text_change)] + \
                                text_change + \
                                essayText[cursor_position - len(text_change):]
            
            return essayText
    
        essaySeries = textInputDf.groupby('id').apply(apply_actions).to_frame().rename(columns={0: 'essay'}).squeeze()
    
        return essaySeries

    @staticmethod
    def split_and_aggregate_sentences(dataframe: pd.DataFrame) -> pd.DataFrame:
        AGGREGATIONS = ['count', 'mean', 'min', 'max', 'first', 'last', Utility.q1, 'median', Utility.q3, 'sum']
    
        dataframe['sent'] = dataframe['essay'].str.split('\\.|\\?|\\!')
        dataframe = dataframe.explode('sent')
        dataframe['sent'] = dataframe['sent'].str.replace('\n', '').str.strip()
        dataframe['sent_len'] = dataframe['sent'].str.len()
        dataframe['sent_word_count'] = dataframe['sent'].str.split().str.len()
    
        agg_df = pd.concat([
            dataframe.groupby('id')[['sent_len']].agg(AGGREGATIONS),
            dataframe.groupby('id')[['sent_word_count']].agg(AGGREGATIONS)
        ], axis=1)
    
        agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns]
        agg_df.index.name = 'id'
    
        return agg_df.drop(columns=['sent_word_count_count']).rename(columns={
            'sent_len_count': 'sent_count'})

    @staticmethod
    def split_and_aggregate_paragraphs(dataframe: pd.DataFrame) -> pd.DataFrame:
        AGGREGATIONS = ['count', 'mean', 'min', 'max', 'first', 'last', q1, 'median', q3, 'sum']
    
        dataframe['paragraph'] = dataframe['essay'].str.split('\n')
        dataframe = dataframe.explode('paragraph')
    
        dataframe['paragraph_len'] = dataframe['paragraph'].str.len() 
        dataframe['paragraph_word_count'] = dataframe['paragraph'].str.split().str.len()
    
        agg_df = pd.concat([
            dataframe.groupby('id')[['paragraph_len']].agg(AGGREGATIONS),
            dataframe.groupby('id')[['paragraph_word_count']].agg(AGGREGATIONS)
        ], axis=1)
    
        agg_df.columns = ['_'.join(col).strip() for col in agg_df.columns] 
        agg_df.index.name = 'id'
    
        return agg_df.drop(columns=['paragraph_word_count_count']).rename(columns={
            'paragraph_len_count': 'paragraph_count'})

    @staticmethod
    def compile_data(dataframe: pd.DataFrame) -> pd.DataFrame:
        dataframe = EssayLevelAggregations.get_essay_paragraph(dataframe=dataframe)
        dataframe = pd.DataFrame({'id': dataframe.index, 'essay': dataframe.values})
        
        sentance_data = EssayLevelAggregations.split_and_aggregate_sentences(dataframe=dataframe.copy())
        paragraph_data = EssayLevelAggregations.split_and_aggregate_paragraphs(dataframe=dataframe.copy())

        return pd.merge(sentance_data, paragraph_data, on='id')

In [None]:
input_dataset

In [None]:
EssayLevelAggregations.compile_data(dataframe=input_dataset.copy())

In [None]:




def get_sentance_level_data(dataframe: pd.DataFrame) -> pd.DataFrame:
    paragh_data = get_essay_paragh(dataframe.copy())
    paragh_data = pd.DataFrame({'id': paragh_data.index, 'essay': paragh_data.values})
    
    sentance_agg = split_and_aggregate_sentences(paragh_data.copy())
    paragh_agg = split_and_aggregate_paragraphs(paragh_data.copy())

    master_data = pd.merge(sentance_agg, paragh_agg, on='id')

    return master_data

def get_activity_counts(dataframe: pd.DataFrame) -> pd.DataFrame:
    unidentified_columns = [
        '\x80', '\x96', '\x97', '\x9b', '¡', '¿', 'Â´', 'Ä±', 'Å\x9f', 'Ë\x86', 'â\x80\x93', 'ä', 'Unidentified', 'Dead', '0', 
        '1', '2', '5', 'AltGraph', 'Cancel', 'Clear', 'Meta', 'ContextMenu', 'ModeChange', 'OS', 'Pause', 'Process']
    function_clicks = ['F1', 'F10', 'F11', 'F12', 'F15', 'F2', 'F3', 'F6']
    mouse_clicks = ['Leftclick', 'Unknownclick', 'Rightclick', 'Middleclick']
    keyboard_clicks = [
    'Alt', 'ArrowDown', 'ArrowLeft', 'ArrowRight', 'ArrowUp', 'Backspace', 'CapsLock', 
    'Control','Delete', 'End', 'Enter', 'Escape', 'Home', 'Insert', 'NumLock', 'PageDown', 'PageUp', 
    'ScrollLock', 'Shift', 'Space', 'Tab']
    redundent_activity = [
    'AudioVolumeDown', 'AudioVolumeMute', 'AudioVolumeUp','MediaPlayPause', 'MediaTrackNext', 'MediaTrackPrevious']
    
    dataframe = dataframe.groupby(['id', 'down_event']).size().reset_index(name='count')
    dataframe = dataframe.pivot_table(index='id', columns='down_event', values='count', fill_value=0).reset_index()
    
    punct_columns = dataframe.columns[dataframe.columns.isin(list(string.punctuation))]
    input_columns = dataframe.columns[dataframe.columns.isin(list(string.ascii_lowercase) + list(string.ascii_uppercase))]
    unidnty_columns = dataframe.columns[dataframe.columns.isin(unidentified_columns)]
    func_columns = dataframe.columns[dataframe.columns.isin(function_clicks)]
    mouse_columns = dataframe.columns[dataframe.columns.isin(mouse_clicks)]
    keyboard_columns = dataframe.columns[dataframe.columns.isin(keyboard_clicks)]
    redundant_columns = dataframe.columns[dataframe.columns.isin(redundent_activity)]
    
    dataframe['punctuation'] = dataframe[punct_columns].sum(axis=1)
    dataframe['inputs'] = dataframe[input_columns].sum(axis=1)
    dataframe['unidentified'] = dataframe[unidnty_columns].sum(axis=1)
    dataframe['functions'] = dataframe[func_columns].sum(axis=1)
    dataframe['mouse_clicks'] = dataframe[mouse_columns].sum(axis=1)
    dataframe['keyboard_clicks'] = dataframe[keyboard_columns].sum(axis=1)
    dataframe['redundant'] = dataframe[redundant_columns].sum(axis=1)

    columns_to_drop = list(punct_columns) + list(input_columns) + list(unidnty_columns) + list(func_columns) + list(mouse_columns) + list(keyboard_columns) + list(redundant_columns) 
    
    dataframe = dataframe.drop(columns=columns_to_drop)

    return dataframe.reset_index(drop=True)