Linking Writing Process to Writing Quality - RandomForest

by KimSeungHee

# 1. Import libraries and read data

In [44]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [45]:
# Read the training logs, the training scores and the test logs (in that order).
train_logs, train_scores, test_logs = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train_logs.csv',
        './data/train_scores.csv',
        './data/test_logs.csv',
    )
)

# 2. Feature Engineering by KimSeungHee

In [46]:
class FeatureEngineering:
    """Builds per-``id`` aggregate features on an event-level log DataFrame.

    Each ``calculate_*`` method derives one statistic per ``id`` group and
    left-merges it back onto ``self.data``, so every row of an ``id`` carries
    the same feature value. ``self.data`` is rebound to a new frame by each
    step; the frame handed to the constructor is never modified in place.
    """

    # Characters of a matched value that are rewritten when the value becomes
    # part of a column name (e.g. 'Remove/Cut' -> 'Remove_Cut', '.' -> 'punctuation').
    _LABEL_TRANSLATION = str.maketrans({
        '/': '_',
        '.': 'punctuation',
        ',': 'comma',
        '\'': 'Apostrophe',
        '\"': 'quote',
    })

    def __init__(self, data):
        """Store the DataFrame to engineer; it must contain an ``id`` column."""
        self.data = data

    def _merge_per_id(self, per_id_values, column_name):
        """Left-merge a Series indexed by ``id`` onto ``self.data`` as ``column_name``."""
        self.data = self.data.merge(per_id_values.rename(column_name), on='id', how='left')

    def calculate_max(self, column):
        """Add ``{column}_max``: the maximum of ``column`` within each ``id``."""
        self._merge_per_id(self.data.groupby('id')[column].max(), f"{column}_max")

    def calculate_mean(self, column):
        """Add ``{column}_mean``: the mean of ``column`` within each ``id``."""
        self._merge_per_id(self.data.groupby('id')[column].mean(), f"{column}_mean")

    def calculate_sum(self, column):
        """Add ``{column}_sum``: the sum of ``column`` within each ``id``."""
        self._merge_per_id(self.data.groupby('id')[column].sum(), f"{column}_sum")

    def calculate_percentage(self, column, value):
        """Add the share of each ``id``'s rows whose ``column`` equals ``value``.

        The result is a fraction between 0 and 1 (not multiplied by 100) stored
        in ``{column}_percentage_of_{label}``, where ``label`` is ``value`` with
        '/', '.', ',', "'" and '"' rewritten to column-name-friendly text.
        """
        counts = self.data[self.data[column] == value].groupby('id').size()
        total_counts = self.data.groupby('id').size()
        # ids without any matching row are missing from `counts`; the aligned
        # division yields NaN for them, which is turned into a share of 0.
        percentages = (counts / total_counts).fillna(0)
        replaced_value = value.translate(self._LABEL_TRANSLATION)
        self._merge_per_id(percentages, f"{column}_percentage_of_{replaced_value}")

    def calculate_diff(self, column, diff_threshold=30000):
        """Add row-to-row differences of ``column`` and a per-``id`` count of large ones.

        Creates two columns:

        * ``{column}_diff`` - difference to the previous row of the same ``id``
          (in the current row order); the first row of each ``id`` gets 0.
        * ``{column}_diff_count`` - number of rows per ``id`` whose difference is
          at least ``diff_threshold``; ids with none get 0.
        """
        diff_name = f'{column}_diff'
        column_name = f"{column}_diff_count"

        row_diffs = self.data.groupby('id')[column].diff().fillna(0)
        # assign() returns a new frame, so a DataFrame supplied by the caller
        # is not modified even when this is the first step executed.
        self.data = self.data.assign(**{diff_name: row_diffs})

        break_counts = (
            self.data[self.data[diff_name] >= diff_threshold]
            .groupby('id')
            .size()
            .reset_index()
        )
        break_counts.columns = ['id', column_name]
        self.data = self.data.merge(break_counts, on='id', how='left')
        # ids that never reach the threshold are absent from break_counts -> NaN -> 0
        self.data[column_name] = self.data[column_name].fillna(0)

    def run(self):
        """Compute the full feature set, in a fixed column order.

        Reads the columns ``event_id``, ``action_time``, ``down_time``,
        ``activity``, ``down_event`` and ``word_count`` of ``self.data``.
        """
        self.calculate_max('event_id')
        self.calculate_mean('event_id')
        self.calculate_mean('action_time')
        self.calculate_sum('action_time')

        # calculate_diff must run first: it creates the 'down_time_diff' column
        # that the following max/mean aggregate.
        self.calculate_diff('down_time')
        self.calculate_max('down_time_diff')
        self.calculate_mean('down_time_diff')

        for activity in ('Input', 'Remove/Cut', 'Nonproduction', 'Replace', 'Paste'):
            self.calculate_percentage('activity', activity)

        for key in ('q', 'Space', 'Backspace', 'Shift', 'ArrowRight', 'Leftclick',
                    '.', ',', 'Enter', "'", '"', 'Tab'):
            self.calculate_percentage('down_event', key)

        self.calculate_max('word_count')
        self.calculate_mean('word_count')
        self.calculate_sum('word_count')

    def get_data(self):
        """Return the current DataFrame, including every feature added so far."""
        return self.data
        

In [47]:
# Add the per-id aggregate feature columns to every row of the training logs.
# NOTE: `train_logs` is rebound to the engineered frame, so re-running this cell
# without reloading the data would feed already-engineered columns back in.
fe = FeatureEngineering(train_logs)
fe.run()
train_logs = fe.get_data()

In [48]:
# Display the engineered training logs; the per-id features repeat on every event row.
train_logs

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,...,down_event_percentage_of_Leftclick,down_event_percentage_of_punctuation,down_event_percentage_of_comma,down_event_percentage_of_Enter,down_event_percentage_of_Apostrophe,down_event_percentage_of_quote,down_event_percentage_of_Tab,word_count_max,word_count_mean,word_count_sum
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,...,0.035980,0.008213,0.004693,0.001564,0.001173,0.0,0.0,256,128.116152,327593
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,...,0.035980,0.008213,0.004693,0.001564,0.001173,0.0,0.0,256,128.116152,327593
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,...,0.035980,0.008213,0.004693,0.001564,0.001173,0.0,0.0,256,128.116152,327593
3,001519c8,4,106686,106777,91,Input,q,q,q,1,...,0.035980,0.008213,0.004693,0.001564,0.001173,0.0,0.0,256,128.116152,327593
4,001519c8,5,107196,107323,127,Input,q,q,q,2,...,0.035980,0.008213,0.004693,0.001564,0.001173,0.0,0.0,256,128.116152,327593
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8405893,fff05981,3615,2063944,2064440,496,Nonproduction,Leftclick,Leftclick,NoChange,1031,...,0.012158,0.004145,0.005250,0.005803,0.001105,0.0,0.0,241,105.105278,380376
8405894,fff05981,3616,2064497,2064497,0,Nonproduction,Shift,Shift,NoChange,1031,...,0.012158,0.004145,0.005250,0.005803,0.001105,0.0,0.0,241,105.105278,380376
8405895,fff05981,3617,2064657,2064765,108,Replace,q,q,q => q,1031,...,0.012158,0.004145,0.005250,0.005803,0.001105,0.0,0.0,241,105.105278,380376
8405896,fff05981,3618,2069186,2069259,73,Nonproduction,Leftclick,Leftclick,NoChange,1028,...,0.012158,0.004145,0.005250,0.005803,0.001105,0.0,0.0,241,105.105278,380376


In [49]:
# Attach the score of each id to its log rows (inner join on 'id').
train_data = train_logs.merge(train_scores, on='id')

In [50]:
# Display the merged frame; it now ends with the 'score' column.
train_data

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,...,down_event_percentage_of_punctuation,down_event_percentage_of_comma,down_event_percentage_of_Enter,down_event_percentage_of_Apostrophe,down_event_percentage_of_quote,down_event_percentage_of_Tab,word_count_max,word_count_mean,word_count_sum,score
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,...,0.008213,0.004693,0.001564,0.001173,0.0,0.0,256,128.116152,327593,3.5
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,...,0.008213,0.004693,0.001564,0.001173,0.0,0.0,256,128.116152,327593,3.5
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,...,0.008213,0.004693,0.001564,0.001173,0.0,0.0,256,128.116152,327593,3.5
3,001519c8,4,106686,106777,91,Input,q,q,q,1,...,0.008213,0.004693,0.001564,0.001173,0.0,0.0,256,128.116152,327593,3.5
4,001519c8,5,107196,107323,127,Input,q,q,q,2,...,0.008213,0.004693,0.001564,0.001173,0.0,0.0,256,128.116152,327593,3.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8405893,fff05981,3615,2063944,2064440,496,Nonproduction,Leftclick,Leftclick,NoChange,1031,...,0.004145,0.005250,0.005803,0.001105,0.0,0.0,241,105.105278,380376,4.0
8405894,fff05981,3616,2064497,2064497,0,Nonproduction,Shift,Shift,NoChange,1031,...,0.004145,0.005250,0.005803,0.001105,0.0,0.0,241,105.105278,380376,4.0
8405895,fff05981,3617,2064657,2064765,108,Replace,q,q,q => q,1031,...,0.004145,0.005250,0.005803,0.001105,0.0,0.0,241,105.105278,380376,4.0
8405896,fff05981,3618,2069186,2069259,73,Nonproduction,Leftclick,Leftclick,NoChange,1028,...,0.004145,0.005250,0.005803,0.001105,0.0,0.0,241,105.105278,380376,4.0


In [51]:
# Keep one row per id and discard the raw per-event columns, leaving the
# aggregate features plus the score.
# NOTE(review): the per-event 'down_time_diff' column is not listed below, so the
# value from each id's first row stays in the frame and is used as a model feature.

drop_columns = [
    'event_id',
    'down_time',
    'up_time',
    'action_time',
    'activity',
    'down_event',
    'up_event',
    'text_change',
    'cursor_position',
    'word_count',
]

train_data = (
    train_data
    .drop_duplicates(subset='id')
    .drop(columns=drop_columns)
    .reset_index(drop=True)
)

In [52]:
# Summarise the per-id training frame: row count, column names, non-null counts and dtypes.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2471 entries, 0 to 2470
Data columns (total 30 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   id                                    2471 non-null   object 
 1   event_id_max                          2471 non-null   int64  
 2   event_id_mean                         2471 non-null   float64
 3   action_time_mean                      2471 non-null   float64
 4   action_time_sum                       2471 non-null   int64  
 5   down_time_diff                        2471 non-null   float64
 6   down_time_diff_count                  2471 non-null   float64
 7   down_time_diff_max                    2471 non-null   float64
 8   down_time_diff_mean                   2471 non-null   float64
 9   activity_percentage_of_Input          2471 non-null   float64
 10  activity_percentage_of_Remove_Cut     2471 non-null   float64
 11  activity_percenta

# 3. Split Data - Training and Validation

In [53]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split reproducible.
train_set, val_set = train_test_split(train_data, test_size=0.2, random_state=42)

In [54]:
# Separate the model inputs from the target: 'id' is only an identifier and
# 'score' is the value to predict, so neither belongs in the feature matrix.
non_feature_columns = ['id', 'score']

X_train, y_train = train_set.drop(columns=non_feature_columns), train_set['score']
X_val, y_val = val_set.drop(columns=non_feature_columns), val_set['score']

# 4. Train a Random Forest Model

In [55]:
# Fit a forest of 1000 trees, each limited to depth 5, with a fixed seed.
model = RandomForestRegressor(n_estimators=1000, max_depth=5, random_state=42)
model.fit(X_train, y_train)

In [66]:
importances = model.feature_importances_

# Print the importance of every feature, scaled by 100, in training-column order.
for position, feature in enumerate(X_train.columns):
    importance = importances[position]
    print(f"{feature}: {importance*100}")

event_id_max: 1.300251370850409
event_id_mean: 1.3291860190937312
action_time_mean: 0.4208681904036464
action_time_sum: 0.5822938186363246
down_time_diff: 0.0
down_time_diff_count: 0.3693933397232274
down_time_diff_max: 0.5576191589162689
down_time_diff_mean: 2.6035241429110028
activity_percentage_of_Input: 0.30161731748334336
activity_percentage_of_Remove_Cut: 0.250015914068914
activity_percentage_of_Nonproduction: 1.1119147689338253
activity_percentage_of_Replace: 0.10746907267760522
activity_percentage_of_Paste: 0.029135489892011614
down_event_percentage_of_q: 0.9331230789736045
down_event_percentage_of_Space: 1.8475748412177757
down_event_percentage_of_Backspace: 0.27516876230499004
down_event_percentage_of_Shift: 1.589205991855978
down_event_percentage_of_ArrowRight: 0.12160822416418583
down_event_percentage_of_Leftclick: 0.6362034985325297
down_event_percentage_of_punctuation: 1.5518698832862137
down_event_percentage_of_comma: 8.70667271866214
down_event_percentage_of_Enter: 1.46

# 5. Validation

In [57]:
val_predictions = model.predict(X_val)

# RMSE is the square root of the MSE. It is computed explicitly because the
# `squared=False` flag of mean_squared_error was deprecated in scikit-learn 1.4
# and removed in 1.6; this form works on every version.
rmse = np.sqrt(mean_squared_error(y_val, val_predictions))
print(f'rmse on Validation Set: {rmse}')

# Validation RMSE recorded for the hyper-parameter settings tried so far:
# (n_estimators=500, random_state=42) : 0.598080750265701
# (n_estimators=500, max_depth=4, random_state=42) : 0.6182429179862099
# (n_estimators=100, random_state=42) : 0.6005615638044046
# (n_estimators=1000, random_state=42) : 0.5973577879024234
# (n_estimators=1000, max_depth=5, random_state=42) : 0.609084895701057

rmse on Validation Set: 0.609084895701057


# 6. Predict for Test Set

In [58]:
# Build the same feature columns for the test logs as were built for training.
# Note that `fe` is rebound here, replacing the training-time instance.
fe = FeatureEngineering(test_logs)
fe.run()
test_data = fe.get_data()

In [59]:
# Display the engineered test logs (still one row per event).
test_data

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,...,down_event_percentage_of_Leftclick,down_event_percentage_of_punctuation,down_event_percentage_of_comma,down_event_percentage_of_Enter,down_event_percentage_of_Apostrophe,down_event_percentage_of_quote,down_event_percentage_of_Tab,word_count_max,word_count_mean,word_count_sum
0,0000aaaa,1,338433,338518,85,Input,Space,Space,,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0
1,0000aaaa,2,760073,760160,87,Input,Space,Space,,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0
2,2222bbbb,1,711956,712023,67,Input,q,q,q,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,2
3,2222bbbb,2,290502,290548,46,Input,q,q,q,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,2
4,4444cccc,1,635547,635641,94,Input,Space,Space,,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.5,1
5,4444cccc,2,184996,185052,56,Input,q,q,q,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.5,1


In [60]:
# Remove the columns that are not model features ('id' plus the raw per-event columns).
# Rows are not deduplicated here, so the frame keeps one row per event and the
# model predicts once per row.

drop_columns = ['id',
    'event_id', 'down_time', 'up_time', 
    'action_time', 'activity', 'down_event', 
    'up_event', 'text_change', 'cursor_position', 
    'word_count'
    ]

new_test_data = test_data.drop(columns=drop_columns).reset_index(drop=True)

In [61]:
# Display the feature-only test frame that is passed to the model.
new_test_data

Unnamed: 0,event_id_max,event_id_mean,action_time_mean,action_time_sum,down_time_diff,down_time_diff_count,down_time_diff_max,down_time_diff_mean,activity_percentage_of_Input,activity_percentage_of_Remove_Cut,...,down_event_percentage_of_Leftclick,down_event_percentage_of_punctuation,down_event_percentage_of_comma,down_event_percentage_of_Enter,down_event_percentage_of_Apostrophe,down_event_percentage_of_quote,down_event_percentage_of_Tab,word_count_max,word_count_mean,word_count_sum
0,2,1.5,86.0,172,0.0,1.0,421640.0,210820.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0
1,2,1.5,86.0,172,421640.0,1.0,421640.0,210820.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0
2,2,1.5,56.5,113,0.0,0.0,0.0,-210727.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,2
3,2,1.5,56.5,113,-421454.0,0.0,0.0,-210727.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1.0,2
4,2,1.5,75.0,150,0.0,0.0,0.0,-225275.5,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.5,1
5,2,1.5,75.0,150,-450551.0,0.0,0.0,-225275.5,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.5,1


In [62]:
# Summarise the test feature frame: row count, column names, non-null counts and dtypes.
new_test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 28 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   event_id_max                          6 non-null      int64  
 1   event_id_mean                         6 non-null      float64
 2   action_time_mean                      6 non-null      float64
 3   action_time_sum                       6 non-null      int64  
 4   down_time_diff                        6 non-null      float64
 5   down_time_diff_count                  6 non-null      float64
 6   down_time_diff_max                    6 non-null      float64
 7   down_time_diff_mean                   6 non-null      float64
 8   activity_percentage_of_Input          6 non-null      float64
 9   activity_percentage_of_Remove_Cut     6 non-null      float64
 10  activity_percentage_of_Nonproduction  6 non-null      float64
 11  activity_percentage_of_

In [63]:
# Predict a score for every row of the test feature frame (one prediction per event row).
test_predictions = model.predict(new_test_data)

In [64]:
# Pair every row-level prediction with its id, average the predictions of each
# id into a single score, and write the result without the index column.
submission = (
    pd.DataFrame({'id': test_data['id'], 'score': test_predictions})
    .groupby('id')
    .mean()
    .reset_index()
)
submission.to_csv('submission.csv', index=False)

In [65]:
# Read the written file back to check its contents: one row per id with its score.
pd.read_csv('submission.csv')

Unnamed: 0,id,score
0,0000aaaa,1.57573
1,2222bbbb,1.69403
2,4444cccc,1.699614
