# Import Library

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import mstats
from scipy.stats import zscore
from scipy.stats import skew
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet,LinearRegression
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.ensemble import VotingRegressor, RandomForestRegressor,GradientBoostingRegressor

# Data Preparation

In [None]:
# Read the three competition CSV files from the shared Kaggle input folder.
DATA_DIR = "/kaggle/input/linking-writing-processes-to-writing-quality"
train_logs, test_logs, train_scores = (
    pd.read_csv(f"{DATA_DIR}/{stem}.csv")
    for stem in ("train_logs", "test_logs", "train_scores")
)
# Show the first five rows of the training logs.
train_logs.head(5)

In [None]:
# Show the first five rows of the test logs.
test_logs.head(5)

In [None]:
# Show the first five rows of the training scores.
train_scores.head(5)

In [None]:
# Build one row of aggregate features per essay `id` for each log file.
#
# Every log frame is grouped by `id` a single time and all summary statistics
# are taken from that one grouping (named aggregation), instead of re-grouping
# the whole frame for each output column.
TRACKED_KEYS = ['Leftclick', 'Backspace', 'Shift', 'Space', 'Enter', 'Tab', 'CapsLock', 'Control', 'Delete']

data_frames = [train_logs, test_logs]
result_data = []

for logs in data_frames:
    features = logs.groupby('id').agg(
        events=('event_id', 'max'),
        up_time_sum=('up_time', 'sum'),
        up_time_mean=('up_time', 'mean'),
        down_time_sum=('down_time', 'sum'),
        down_time_mean=('down_time', 'mean'),
        action_time_sum=('action_time', 'sum'),
        action_time_mean=('action_time', 'mean'),
        word_count=('word_count', 'max'),
    )

    # Number of times each tracked key appears as `down_event`, per id.
    # Ids that never produced the key are absent from the count, so they are
    # filled in with 0 when aligning to the feature index.
    for key in TRACKED_KEYS:
        presses = logs.loc[logs['down_event'] == key].groupby('id')['down_event'].count()
        features[f"{key}_event"] = presses.reindex(features.index, fill_value=0).astype("int64")

    result_data.append(features)

# Same order as `data_frames`: training features first, test features second.
train_data, test_data = result_data

# Attach the score to each training row; `id` is only needed for the join.
df = pd.merge(train_data, train_scores, on="id")
df.drop(["id"], inplace=True, axis=1)

df.head(10)

In [None]:
# Print column dtypes and non-null counts of the merged training frame.
df.info()

# EDA

## Correlation Analysis

In [None]:
def analyze_variable_correlation(df, variable_groups, target_variable):
    """Show how a list of columns correlates with each other and with the target.

    Draws an annotated heatmap of the correlation matrix of the selected
    columns plus the target, then prints the correlation of every selected
    column with the target, formatted to three decimals.

    Args:
        df: DataFrame holding all referenced columns.
        variable_groups: list of column names to analyse.
        target_variable: name of the target column.
    """
    selected_columns = variable_groups + [target_variable]
    subset = df[selected_columns]
    correlations = subset.corr()

    plt.figure(figsize=(8, 15))
    sns.heatmap(correlations, annot=True, cmap="OrRd")
    plt.title("Correlation Between Variable")
    plt.show()

    target_series = subset[target_variable]
    for name in variable_groups:
        coefficient = subset[name].corr(target_series)
        print(f"Correlation Between {name} And {target_variable}: {coefficient:.3f}")

# Column groupings that the correlation analysis inspects one at a time.

# Closest candidates
variable_group_cl = ['action_time_mean', 'word_count']
# Per-essay means
variable_group_mean = [f"{stat}_mean" for stat in ('up_time', 'action_time', 'down_time')]
# Key-event counts
variable_group_event = [
    f"{key}_event"
    for key in ('Leftclick', 'Backspace', 'Shift', 'Space', 'Enter', 'Tab', 'CapsLock', 'Control', 'Delete')
]
# Per-essay sums
variable_group_sum = [f"{stat}_sum" for stat in ('up_time', 'down_time', 'action_time')]
# Target
target_variable = 'score'

In [None]:
# Heatmap and per-column correlations for action_time_mean and word_count against the score.
analyze_variable_correlation(df,variable_group_cl, target_variable)

In this analysis, we found that the variable 'action_time_mean' has a moderate negative correlation with the variable 'word_count' (-0.555) and a low negative correlation with the variable 'score' (-0.049). Simply put, it means that if someone spends more time typing, the number of words used in the essay is likely to decrease slightly. However, it's important to note that this relationship is not very strong and does not have a significant influence in this context.

Furthermore, there is no significant relationship between the average typing time ('action_time_mean') and the essay score ('score'). So, how long someone types is not directly related to the achieved essay score.

On the other hand, the variable 'word_count' has a moderate positive correlation with the variable 'score' (0.64). This means that using more words in the essay is likely to result in a higher essay score. This correlation indicates a stronger relationship compared to other variables in this analysis.

Additionally, the variable 'score' has a low negative correlation with the variable 'action_time_mean' (-0.049) and a moderate positive correlation with the variable 'word_count' (0.64). This means that the longer the average typing time, the lower the essay score tends to be, though only very slightly. As mentioned earlier, this relationship is not very strong and does not have a significant influence in this context.

Based on the correlation matrix and heatmap, we can conclude that the variable 'word_count' has a more significant influence on the essay score compared to the variable 'action_time_mean'. So, if you want to improve your essay score, focus on using more words in your writing. However, it's important to remember that the influence of average typing time on the essay score is relatively weak and not significant.

In [None]:
# Heatmap and per-column correlations for the *_mean columns against the score.
analyze_variable_correlation(df,variable_group_mean, target_variable)

The analysis results indicate that there is a weak and negative relationship between typing behaviors such as lifting fingers from the keyboard (up_time_mean), typing action time (action_time_mean), and pressing keyboard keys (down_time_mean) with essay scores.

In simpler terms, the time spent lifting fingers, performing typing actions, or pressing keyboard keys while writing essays has a very small influence on the essay scores. In this case, the longer someone takes to lift their fingers, perform typing actions, or press keyboard keys while writing, the more likely the essay score is to be slightly lower.

In [None]:
# Heatmap and per-column correlations for the *_sum columns against the score.
analyze_variable_correlation(df,variable_group_sum, target_variable)

The correlation results indicate a positive relationship between three variables, namely "up_time_sum," "down_time_sum," and "action_time_sum," with the "score" variable.

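When the "up" total is larger (up_time_sum adds up the timestamps at which keys or mouse buttons were released, so it grows with both the number of events and the length of the writing session), it is likely that our "score" will also tend to be higher.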

The same applies to the "down" total (down_time_sum adds up the timestamps at which keys or mouse buttons were pressed). If the "down" total is larger, it is likely that our "score" will also tend to be higher.

Furthermore, if the total "action" time (action_time_sum, the duration between each "down" and "up" summed over all events) is larger, there is a tendency for our "score" to be higher.

In conclusion, the more time we spend on the "up" action, the "down" action, or both actions combined, the higher the likelihood of achieving a higher "score."

In [None]:
# Heatmap and per-column correlations for the key-event count columns against the score.
analyze_variable_correlation(df,variable_group_event, target_variable)

When it comes to typing, the way we perform certain actions can have varying effects on our overall "score." Let's take a closer look at these actions and their influence.

Firstly, the act of "left-clicking" doesn't seem to significantly impact our "score." Therefore, the manner in which we click the mouse doesn't hold much weight in determining our performance.

On the other hand, using the backspace key to delete characters exhibits a moderate influence on our "score." The more frequently we rely on this key, the higher the likelihood of achieving a better "score."

Similarly, pressing the Shift key also holds moderate significance. Although its impact may not be substantial, the way we employ the Shift key can subtly affect our overall "score."

Interestingly, the spacebar emerges as a powerful factor in determining our "score." The more we utilize this key, the higher our chances of achieving a better performance.

Meanwhile, pressing the Enter key doesn't seem to have a notable impact on our "score." Consequently, the specific manner in which we use this key isn't particularly crucial.

The Tab key, while having a weaker influence, still holds some sway over our "score." Although its effect may not be prominent, the way we navigate using the Tab key can slightly impact our overall performance.

When it comes to the Caps Lock key, its usage doesn't significantly affect our "score." In other words, the manner in which we utilize Caps Lock isn't of great importance.

As for the Control key, it doesn't possess a direct correlation with our "score." Consequently, the act of using the Control key doesn't impact our overall performance.

Lastly, the Delete key doesn't significantly influence our "score." This implies that the specific manner in which we delete text doesn't play a significant role in determining our performance.

In summary, these correlation findings shed light on the relationship between certain actions and our "score." While some actions, such as using the backspace key and spacebar, hold greater sway over our performance, others have minimal impact on our overall results.

### Correlation Conclusion

In this final conclusion, I will challenge myself to create an engaging and well-crafted narrative essay. I hope my story will captivate your interest and keep you awake throughout :D. So, without further ado, let's dive into the world of storytelling!


Once upon a time, there was a group of individuals who participated in a typing competition. They were all eager to achieve the highest possible "score" and showcase their typing skills. As they embarked on this journey, researchers closely observed their actions and analyzed the data to uncover the factors that influenced their performance.

The researchers noticed that some actions had a significant impact on the participants' "score," while others seemed less influential. Let's explore these findings together.

First, the researchers discovered that the average time it took for participants to perform each action had a weak negative correlation with their "score." This means that spending too much time on individual actions didn't necessarily lead to higher scores. It was more about finding a balance between speed and accuracy.

On the other hand, the number of words participants typed had a strong positive correlation with their "score." This finding emphasized the importance of productivity and being able to type a higher number of words within the given time.

The researchers also examined the average time participants spent on upward and downward movements while typing. Surprisingly, there was a weak negative correlation between the duration of these movements and the participants' "score." It seemed that excessive time spent on moving their fingers up and down didn't contribute significantly to their performance.

However, when the researchers looked at the total duration of these movements, they found a moderate positive correlation with the participants' "score." This indicated that participants who had a good balance between efficient movements and overall typing speed tended to achieve higher scores.

As the researchers delved deeper into the analysis, they explored the participants' usage of specific keys. They discovered that certain key events had varying degrees of correlation with the "score."

For example, the count of backspace events showed a moderate positive correlation with the "score." This suggested that participants who made use of the backspace key to correct their mistakes tended to have higher scores.

Similarly, the count of space events had a strong positive correlation with the "score." It appeared that participants who utilized the spacebar more frequently, indicating better sentence structure and spacing, tended to achieve higher scores.

Other key events, such as left-click, shift, enter, tab, caps lock, control, and delete, showed weak to moderate positive correlations with the "score." While these events had some influence on participants' performance, their impact was not as pronounced as the backspace and space events.

In a nutshell, the researchers concluded that in this typing competition, factors like word count, total duration of movements, and the counts of backspace and space events played significant roles in determining the participants' "score." Other variables, such as the average time for individual actions and counts of other key events, had weaker correlations or no substantial influence on the overall performance.

These findings provided valuable insights for the participants and anyone seeking to improve their typing skills. It highlighted the importance of finding a balance between speed and accuracy, focusing on productivity, and utilizing key functions like backspace and space effectively.

## Statistical Analysis

In [None]:
# Work on a copy of the features rounded to two decimals; show its first five rows.
df1 = df.round(2)
df1.head(5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column.
df1.describe()

In [None]:
# Distribution And Skewness
# One histogram (with a KDE overlay) per column of df1; each subplot title
# shows the column's skewness.
grid_cols = 4
# Ceiling division: enough rows to give every column its own subplot.
grid_rows = -(-len(df1.columns) // grid_cols)

plt.figure(figsize=(16, 12))
for i, column in enumerate(df1.columns):
    plt.subplot(grid_rows, grid_cols, i+1)
    sns.histplot(df1[column], kde=True)
    plt.xlabel(column)
    # histplot's default statistic is the number of rows per bin, not a density.
    plt.ylabel('Count')
    plt.title(f'Skewness: {skew(df1[column]):.2f}')
plt.tight_layout()
plt.show()

In [None]:
lower_limit = 0.05
upper_limit = 0.95

# Threshold
# Print the lower/upper percentile of every column of df1.
#
# The previous version also called mstats.winsorize(limits=[0.05, 0.95]) here.
# `limits` is the fraction to clip from EACH end, so 0.95 asked for 95% of the
# values to be clipped from the top, and the result was never used. The call
# is removed; the printed thresholds are unchanged.
for column in df1.columns:
    lower_bound, upper_bound = np.percentile(
        df1[column], [lower_limit*100, upper_limit*100]
    )
    print(f"Threshold for {column}:")
    print(f"Lower: {lower_bound}")
    print(f"Upper: {upper_bound}")
    print()

In [None]:
# Delete Outlier
# Inclusive (lower, upper) limits per column. A row of df1 is kept only when
# every listed column lies inside its limits.
outlier_limits = {
    'events': (1524.5, 6387.5),
    'up_time_sum': (716237508.0, 5796753380.0),
    'up_time_mean': (352267.1, 1096763.4),
    'down_time_sum': (716034787.0, 5796393767.5),
    'down_time_mean': (352180.2, 1096669.6),
    'action_time_sum': (140468.5, 615196.5),
    'action_time_mean': (65.7, 141.6),
    'word_count': (204.0, 724.0),
    'Leftclick_event': (4.0, 93.0),
    'Backspace_event': (68.5, 1001.5),
    'Shift_event': (0.0, 527.5),
    'Space_event': (234.5, 910.5),
    'Enter_event': (2.0, 18.0),
    'Tab_event': (0.0, 2.0),
    'CapsLock_event': (0.0, 42.0),
    'Control_event': (0.0, 6.0),
    'Delete_event': (0.0, 17.0),
    'score': (1.0, 5.5),
}

# Start from "keep everything" and AND in one column's range check at a time.
inside_limits = pd.Series(True, index=df1.index)
for column_name, (low, high) in outlier_limits.items():
    values = df1[column_name]
    inside_limits &= (values >= low) & (values <= high)

data = df1[inside_limits]

print(f'Before cleaning: {len(df1)} line')
print(f'After cleaning: {len(data)} line')

In [None]:
# Same distribution/skewness overview as before, now for the filtered frame:
# one histogram (with a KDE overlay) per column of `data`.
grid_cols = 4
# Ceiling division: enough rows to give every column its own subplot.
grid_rows = -(-len(data.columns) // grid_cols)

plt.figure(figsize=(16, 12))
for i, column in enumerate(data.columns):
    plt.subplot(grid_rows, grid_cols, i+1)
    sns.histplot(data[column], kde=True)
    plt.xlabel(column)
    # histplot's default statistic is the number of rows per bin, not a density.
    plt.ylabel('Count')
    plt.title(f'Skewness: {skew(data[column]):.2f}')
plt.tight_layout()
plt.show()

I think I've cut a lot of data, but that's okay, since I can still add rows back :D

In [None]:
# Stack ten copies of the filtered frame on top of each other and renumber the index.
# NOTE(review): this adds no new information, and because the random train/test
# split happens afterwards, identical copies of a row can land in both splits,
# which makes the hold-out error look better than it is. Consider removing.
data = pd.concat([data] * 10, ignore_index=True)

In [None]:
# Report, per column, how many values are missing and what share of all rows
# that represents.
missing_values = data.isnull().sum()

print("Count of missing value:")
print(missing_values)

total_rows = data.shape[0]
missing_percentage = (missing_values / total_rows) * 100
# The label below previously contained invisible zero-width characters
# between "Values" and "in"; they are removed so the output is plain text.
print("Percentage of Missing Values in Each Column:")
print(missing_percentage)

In [None]:
# Print column dtypes and non-null counts of the frame used for modelling.
data.info()

# Linear Regression with Another ~~Love~~ model

In [None]:
# Split the modelling frame into the value to predict and the model inputs.

# Target
y = data['score']
# Feature: every column other than the target is kept
X = data.drop(columns='score')

In [None]:
# Normalization
# Rescale every feature column with MinMaxScaler; the result is a NumPy array.
# NOTE(review): the scaler is fitted on all rows, before the train/test split,
# so hold-out rows influence the scaling. Consider fitting on the training
# split only.
scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_normalized, y, test_size=0.2, random_state=42)

In [None]:
# Feature Selection using selector
# The selector is fitted on the training split only and then applied unchanged
# to the hold-out split.
selector = SelectKBest(score_func=f_regression, k=10)  # keep the 10 best features
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

In [None]:
# Candidate linear models, keyed by the display name that also indexes the
# hyper-parameter grid.
models = {
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "ElasticNet": ElasticNet(),
    "Linear Regression": LinearRegression()
}

In [None]:
# Hyper-parameter grid per model; keys match the names in `models`.
param_grid = {
    "Lasso": {'alpha': [0.1, 0.5, 1.0], 'fit_intercept': [True, False],'copy_X': [True, False]},
    "Ridge": {'alpha': [0.1, 0.5, 1.0], 'fit_intercept': [True, False], 'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'saga'],'copy_X': [True, False]},
    "ElasticNet": {'alpha': [0.1, 0.5, 1.0], 'l1_ratio': [0.2, 0.5, 0.8], 'fit_intercept': [True, False],'copy_X': [True, False]},
    "Linear Regression": {'fit_intercept': [True, False],'copy_X': [True, False]}
}

# Shuffled 5-fold cross-validation with a fixed seed so runs are comparable.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

for model_name, model in models.items():
    grid_search = GridSearchCV(model, param_grid[model_name], cv=kfold, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)

    print("Model:", model_name)
    print("Best Parameter:", grid_search.best_params_)

    # best_score_ is the negated mean squared error, so flipping the sign and
    # taking the square root yields an RMSE. The label previously said "MSE".
    best_score = np.sqrt(-grid_search.best_score_)
    print("Best RMSE score:", best_score)
    print("---------------------------------------")

In [None]:
# Models with hand-set hyper-parameters, to be compared on the train and
# hold-out splits.
# NOTE(review): ElasticNet uses alpha=0.01, which is not among the alpha values
# listed in param_grid — confirm this is intentional.
models_final = {
    "Linear Regression": LinearRegression(fit_intercept=True),
    "Ridge": Ridge(alpha=0.1, fit_intercept=True, solver='lsqr'),
    "Lasso": Lasso(alpha=0.1, fit_intercept=True),
    "ElasticNet": ElasticNet(alpha=0.01, l1_ratio=0.2, fit_intercept=True),
}

In [None]:
# Fit every final model on the training split and report its RMSE on both the
# training and the hold-out split.
SEPARATOR = "---------------------------------------"

for model_name, model in models_final.items():
    model.fit(X_train, y_train)

    # Training split first, hold-out split second.
    train_rmse, test_rmse = (
        np.sqrt(mean_squared_error(actual, model.predict(inputs)))
        for inputs, actual in ((X_train, y_train), (X_test, y_test))
    )

    print("Model:", model_name)
    print("Train RMSE:", train_rmse)
    print("Test RMSE:", test_rmse)

    # A lower error on the training split than on the hold-out split is
    # reported as overfitting.
    verdict = (
        "Overfitting detected: Train RMSE is lower than test RMSE"
        if train_rmse < test_rmse
        else "No overfitting detected"
    )
    print(verdict)
    print(SEPARATOR)

In [None]:
# Plain linear regression used as the stand-alone model and inside the ensemble.
reg = LinearRegression(fit_intercept=True, copy_X=True)

In [None]:
# Fit the linear regression on the training split.
reg.fit(X_train, y_train)

In [None]:
# Predict the target for the hold-out split
y_pred = reg.predict(X_test)

# Mean squared error and its square root. The printed label says RMSE, so the
# root is what gets reported (previously the raw MSE was printed).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print("RMSE SCORE:", rmse)

In [None]:
# Average the linear regression with two tree ensembles. The tree models are
# seeded like the rest of the notebook (random_state=42) so results repeat.
rf_model = RandomForestRegressor(random_state=42)
gb_model = GradientBoostingRegressor(random_state=42)

ensemble_model = VotingRegressor([('reg', reg), ('rf', rf_model), ('gb', gb_model)])
ensemble_model.fit(X_train, y_train)
ensemble_pred = ensemble_model.predict(X_test)
# Square root of the MSE, so the variable really holds an RMSE (previously the
# raw MSE was stored under this name).
ensemble_rmse = np.sqrt(mean_squared_error(y_test, ensemble_pred))

In [None]:
# Report the ensemble's hold-out error stored in ensemble_rmse.
print("Improved ensemble RMSE with Linear Regression:", ensemble_rmse)

In [None]:
# Wrap the ensemble's hold-out predictions in a one-column frame and show the first five.
y_inf = pd.DataFrame(ensemble_pred, columns=['score'])
y_inf.head(5)

In [None]:
# Score the essays from the TEST logs.
#
# The previous version glued together per-event ids from the training logs, the
# training feature frame and the hold-out predictions, so `testing` never
# contained a prediction for a test essay. Here the test features go through
# the same steps as the training data: round to two decimals, scale with the
# fitted scaler, keep the selected columns, then predict with the ensemble.
test_features = test_data[X.columns].round(2)
test_selected = selector.transform(scaler.transform(test_features))

# One row per test essay: `id` (restored from the index), the features, and
# the predicted score.
testing = test_features.reset_index()
testing['score'] = ensemble_model.predict(test_selected)
testing.head(10)

# Final Submission

In [None]:
# Keep only the id and predicted score columns for the submission file.
# NOTE(review): head(20) caps the submission at 20 rows; if `testing` holds one
# row per test essay and there are more than 20 essays, the rest are dropped —
# confirm whether the cap should be removed.
submission = testing[['id','score']].head(20)
submission.head(5)

In [None]:
# Write the submission to submission.csv with a header row and without the index column.
submission.to_csv("submission.csv",index=False,header=True)

# Message

Several conclusions from my modeling and evaluation:

I've conducted numerous experiments varying the amount of data, and it's clear that the RMSE results are influenced by the dataset size. However, I've found that beyond 10,000 data points, the processing time becomes too long for me to wait :D...

If you found this information helpful, please consider giving it a vote... Thank You...