In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split



In [None]:
# Read the raw fixed-text keystroke CSV (the path is relative to the notebook's
# working directory) and preview the first rows.
df = pd.read_csv("../data/keystrokes/raw/fixed-text.csv")
df.head()

In [None]:
# Independent copy of the loaded data; it is used later as the "raw" variant
# (no outlier rows removed) next to the cleaned one.
df_raw = df.copy()

In [None]:
# Timing columns are selected by position: everything after the first three
# columns and before the last one.
# NOTE(review): presumably the first three columns are identifiers and the last
# one is 'total time' — confirm against the CSV header.
timing_cols = df.columns[3:-1]

In [None]:
# --- Data cleaning: drop only rows holding sentinel outliers; small negatives stay ---

# Any timing value below this cutoff is treated as a corrupted sentinel reading.
OUTLIER_THRESHOLD = -1_000_000

# Work on an independent copy so `df` itself is left untouched.
df_clean = df.copy()

# Labels of the timing columns, taken from the `timing_cols` defined earlier.
float_cols_clean = df_clean.loc[:, timing_cols].columns

# True for every row with at least one timing value under the cutoff.
outlier_rows_mask = df_clean[float_cols_clean].lt(OUTLIER_THRESHOLD).any(axis=1)

# Keep only the rows that are NOT flagged by the mask.
df_clean = df_clean.loc[~outlier_rows_mask].copy()

removed_rows_count = outlier_rows_mask.sum()
print(f"Removed {removed_rows_count} row(s) containing corrupted sentinel negative values from df_clean.")

# Slightly negative timings are intentionally left unchanged; the original
# author attributes them to key overlap.

In [None]:
def create_features(df):
    """Add per-row summary statistics of the timing columns and a WPM column.

    The timing columns are chosen by position: every column after the first
    three and before the last one, ignoring any column this function itself
    added on an earlier call. Ignoring those derived columns makes the function
    safe to run again on its own output (for example when the notebook cell
    ``df = create_features(df)`` is executed twice). Without that, a second
    call would silently fold 'total time' and the earlier summary columns into
    the statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains a 'total time' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with 'mean_timing', 'std_timing', 'min_timing',
        'max_timing', 'median_timing' and 'WPM' added (or overwritten).
    """
    # Columns produced by this function; they must never count as timing data.
    derived_cols = (
        'mean_timing', 'std_timing', 'min_timing',
        'max_timing', 'median_timing', 'WPM',
    )
    base_cols = [col for col in df.columns if col not in derived_cols]
    timing_cols = base_cols[3:-1]

    # Row-wise statistics over the timing columns only.
    timings = df[timing_cols]
    df['mean_timing'] = timings.mean(axis=1)
    df['std_timing'] = timings.std(axis=1)
    df['min_timing'] = timings.min(axis=1)
    df['max_timing'] = timings.max(axis=1)
    df['median_timing'] = timings.median(axis=1)

    # Per the original author's note, 120 / total time is the WPM for a
    # 10-character, 2-word phrase.
    # NOTE(review): this presumes 'total time' is in seconds — confirm.
    df['WPM'] = 120 / df['total time']

    return df

In [None]:
# Add the summary-statistic and WPM columns to both the raw and the cleaned frame.
df_raw = create_features(df_raw)
df_clean = create_features(df_clean)

In [None]:
# Preview the first rows of the raw frame.
df_raw.head()

In [None]:
# Preview the first rows of the cleaned frame.
df_clean.head()

In [None]:
# Histograms (30 bins each) of the summary features and the total time in the
# cleaned data; `column=` restricts the plot to the listed columns.
df_clean.hist(
    column=[
        "mean_timing",
        "std_timing",
        "min_timing",
        "max_timing",
        "median_timing",
        "total time",
    ],
    bins=30,
    figsize=(12, 8),
)
plt.tight_layout()
plt.show()

In [None]:
# Model inputs and regression target.
# NOTE(review): 'WPM' is computed as 120 / 'total time' in create_features, and
# 'total time' is also one of the features below — confirm that predicting a
# target that is a direct function of an input is intended.
feature_cols = [
    'mean_timing',
    'std_timing',
    'min_timing',
    'max_timing',
    'median_timing',
    'total time',
]
target_col = 'WPM'

# Raw data (no outlier rows removed): hold out 30% for testing, fixed seed.
X_raw, y_raw = df_raw[feature_cols], df_raw[target_col]
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X_raw,
    y_raw,
    random_state=42,
    test_size=0.3,
)

# Cleaned data: identical split settings.
X_clean, y_clean = df_clean[feature_cols], df_clean[target_col]
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(
    X_clean,
    y_clean,
    random_state=42,
    test_size=0.3,
)

## Data Preprocessing for Fixed Text Keystroke Dynamics
### Cleaning Data, Handling Negative Values (Overlap), and Calculating Target Variable (WPM)

In [None]:
# --- Save Cleaned Data to CSV ---

# df_clean has the sentinel-outlier rows removed and the feature columns
# (including WPM) added.
output_filename = "../data/keystrokes/processed/fixed_text_cleaned_for_ml.csv"

# to_csv does not create missing directories, so make sure the target folder
# exists; otherwise this cell fails on a checkout without a processed/ folder.
Path(output_filename).parent.mkdir(parents=True, exist_ok=True)
df_clean.to_csv(output_filename, index=False)

print(f"Cleaned and preprocessed data saved to '{output_filename}'.")