In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside 

In [None]:
# Folder holding the competition CSV files. It is named DATA_DIR rather than
# `dir` so the built-in dir() stays usable for the rest of the kernel session.
DATA_DIR = '/kaggle/input/linking-writing-processes-to-writing-quality/'

# The trailing slash in DATA_DIR makes plain concatenation a valid path.
train_logs = pd.read_csv(f'{DATA_DIR}train_logs.csv')
train_scores = pd.read_csv(f'{DATA_DIR}train_scores.csv')
test_logs = pd.read_csv(f'{DATA_DIR}test_logs.csv')


In [None]:
# Show the first five rows of the training logs as a quick sanity check of the load.
train_logs.head(n=5)


In [None]:
# Order the training rows by event_id (ascending) before aggregating them.
train_logs = train_logs.sort_values('event_id')
def agg_data(df):
    """Aggregate event-level log rows into one feature row per ``id``.

    The input frame is left untouched: the normalised ``activity`` and
    ``text_change`` columns are built on a copy, so calling the function
    repeatedly on the same frame always returns the same features.

    Parameters
    ----------
    df : pandas.DataFrame
        Event-level rows containing the columns ``id``, ``action_time``,
        ``up_time``, ``down_time``, ``word_count``, ``down_event``,
        ``up_event``, ``text_change`` and ``activity``.

    Returns
    -------
    pandas.DataFrame
        One row per ``id`` (``id`` is a regular column, not the index) with
        the aggregates declared below followed by the derived columns
        ``w2t_ratio``, ``diff_mean_max_word``, ``diff_up_down_time`` and
        ``diff_min_max_word``.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.
    """
    # Every activity label containing "Move" is collapsed into the single
    # label "Move Cursor"; all other labels are kept as they are.
    is_move = df['activity'].astype(str).str.contains("Move", regex=False, na=False)
    activity = df['activity'].mask(is_move, "Move Cursor")

    # text_change becomes a 0/1 flag: 0 when the value contains "NoChange", else 1.
    is_no_change = df['text_change'].astype(str).str.contains("NoChange", regex=False, na=False)
    text_change = (~is_no_change).astype('int64')

    # assign() returns a new frame, so the caller's columns are not overwritten.
    events = df.assign(activity=activity, text_change=text_change)

    new_features = events.groupby('id').agg(
        sum_of_action_time=('action_time', 'sum'),
        mean_action_time=('action_time', 'mean'),
        min_action_time=('action_time', 'min'),
        max_action_time=('action_time', 'max'),
        sum_of_up_time=('up_time', 'sum'),
        mean_up_time=('up_time', 'mean'),
        min_up_time=('up_time', 'min'),
        max_up_time=('up_time', 'max'),
        std_up_time=('up_time', 'std'),
        sum_of_down_time=('down_time', 'sum'),
        mean_down_time=('down_time', 'mean'),
        min_down_time=('down_time', 'min'),
        max_down_time=('down_time', 'max'),
        std_down_time=('down_time', 'std'),
        std_word_count=('word_count', 'std'),
        mean_word_count=('word_count', 'mean'),
        max_word_count=('word_count', 'max'),
        min_word_count=('word_count', 'min'),
        sum_word_count=('word_count', 'sum'),
        down_event_count=('down_event', 'count'),
        up_event_count=('up_event', 'count'),
        text_change_sum=('text_change', 'sum'),
        activity_count=('activity', 'count'),
        unique_activity=('activity', 'nunique'),
        unique_up_event=('up_event', 'nunique'),
        unique_down_event=('down_event', 'nunique'),
    )

    word_sum = new_features['sum_word_count']
    # An id whose text_change_sum is 0 yields inf (or NaN for 0/0) here.
    new_features['w2t_ratio'] = word_sum / new_features['text_change_sum']
    # Despite its name this column is built from the *summed* word count; the
    # definition is kept for compatibility. The mean is taken over the ids of
    # this frame only, so the values depend on which frame was passed in.
    new_features['diff_mean_max_word'] = word_sum.mean() - word_sum
    new_features['diff_up_down_time'] = new_features['sum_of_up_time'] - new_features['sum_of_down_time']
    new_features['diff_min_max_word'] = new_features['max_word_count'] - new_features['min_word_count']

    return new_features.reset_index()

In [None]:
# Build one aggregated feature row per id for the training and the test logs.
# NOTE: test_logs is rebound to its aggregated form here, so this cell is not
# re-runnable without reloading the raw test logs first.
new_features, test_logs = agg_data(train_logs), agg_data(test_logs)
new_features.head()


In [None]:
# Left-join train_scores onto the feature rows by id, keeping every feature row.
# NOTE: new_features is rebound, so re-running this cell joins train_scores again.
new_features = new_features.merge(train_scores, how='left', on='id')

In [None]:
# Every column except the id key takes part in the Pearson correlation matrix.
features_t = new_features.columns.drop('id')
df_corr = new_features.loc[:, features_t].corr(method='pearson')

In [None]:
# Annotated heatmap of df_corr on a 16x16 figure, colour scale fixed to
# [-1, 1] and centred on 0.
plt.figure(figsize=(16, 16))
heatmap = sns.heatmap(
    df_corr,
    vmin=-1,
    vmax=1,
    center=0,
    cmap="cool",
    annot=True,
    fmt=".1g",
    linewidths=1,
    linecolor="black",
)
heatmap.set_title("Correlation Heatmap Between Variables")
# Turn the x tick labels vertical.
heatmap.set_xticklabels(heatmap.get_xticklabels(), rotation=90)

In [None]:
# One fixed seed for the split and for every stochastic estimator, so the
# hold-out score is reproducible under Restart & Run All.
SEED = 42

# Feature matrix: every correlated column except the target itself.
x = new_features[features_t.drop('score')].values
y = new_features[['score']].values

# Three tree ensembles whose predictions are blended by a linear model.
estimators = [
    ('gbr', GradientBoostingRegressor(random_state=SEED)),
    ('rfr', RandomForestRegressor(n_estimators=1400, random_state=SEED)),
    ('xgb', XGBRegressor(n_estimators=1400, random_state=SEED)),
]
model = StackingRegressor(
    estimators=estimators,
    final_estimator=LinearRegression(),
)

# 6% of the rows are held out for the evaluation cell.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.06, random_state=SEED)
# y is a single-column 2-D array; the estimators are fitted on its 1-D view.
model.fit(xtrain, ytrain.ravel())

In [None]:
# Predict the held-out rows and report the mean squared error.
pred = model.predict(xtest)
# mean_squared_error takes (y_true, y_pred); ytest is a single-column 2-D
# array, so it is flattened to match the 1-D predictions.
print("Mean squared error: %.2f" % mean_squared_error(ytest.ravel(), pred))

In [None]:

# Score the aggregated test rows with the same feature columns, in the same
# order, that the model was fitted on.
pred_y = model.predict(test_logs[features_t.drop('score')].values)

# Store the predictions next to their ids and write the two-column submission file.
test_logs['score'] = pred_y
outp = test_logs[["id", "score"]]
outp.to_csv("submission.csv", index=False)