In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt



In [None]:
# Read the raw fixed-text keystroke recordings and preview the first rows.
df = pd.read_csv(filepath_or_buffer="../data/keystrokes/raw/fixed-text.csv")
df.head(n=5)

In [None]:
# Keep an independent (deep) copy of the loaded frame as the "raw" variant.
df_raw = df.copy(deep=True)

In [None]:
# Timing columns are selected by position: everything from the fourth column
# up to (but excluding) the last one.
# NOTE(review): presumably the first three columns are identifiers and the
# last one is 'total time' -- confirm against the CSV layout.
timing_cols = df.columns[slice(3, -1)]

In [None]:
# Build the "clean" variant from a fresh copy of the loaded frame.
df_clean = df.copy()

# For the timing columns only:
#   1. coerce every value to a number (unparseable entries become NaN),
#   2. raise negative values to 0,
#   3. replace the remaining NaN values with 0.
df_clean[timing_cols] = (
    df_clean[timing_cols]
    .apply(pd.to_numeric, errors='coerce')
    .clip(lower=0)
    .fillna(0)
)

In [None]:
def create_features(df):
    """Return a copy of ``df`` with per-row timing statistics and WPM added.

    The timing columns are selected by position: every column from the fourth
    up to (but excluding) the last one, evaluated on the frame *without* the
    columns this function itself adds. That makes the function safe to call
    again on its own output (e.g. when the calling cell is re-run); the
    statistics are never computed over previously derived columns.

    The input frame is not modified.

    Added columns:
        mean_timing, std_timing, min_timing, max_timing, median_timing:
            row-wise statistics over the timing columns.
        WPM: ``120 / df['total time']``.
            NOTE(review): the constant 120 presumably encodes a 10-character
            text (2 five-character "words") and a total time in seconds --
            confirm. Rows whose total time is 0 yield an infinite WPM.

    Parameters:
        df: DataFrame containing a 'total time' column and the timing columns
            at positions ``3:-1``.

    Returns:
        A new DataFrame with the columns listed above added (or refreshed).
    """
    derived_cols = [
        'mean_timing', 'std_timing', 'min_timing',
        'max_timing', 'median_timing', 'WPM',
    ]

    # Work on a copy so the caller's frame is left untouched.
    result = df.copy()

    # Slice positions on the original layout only: ignore any columns a
    # previous call of this function has already appended.
    base_columns = result.columns[~result.columns.isin(derived_cols)]
    timing_cols = base_columns[3:-1]
    timings = result[timing_cols]

    result['mean_timing'] = timings.mean(axis=1)
    result['std_timing'] = timings.std(axis=1)
    result['min_timing'] = timings.min(axis=1)
    result['max_timing'] = timings.max(axis=1)
    result['median_timing'] = timings.median(axis=1)

    result['WPM'] = 120 / result['total time']

    return result

In [None]:
# Add the summary-statistic features and the WPM target to both variants.
df_raw = create_features(df=df_raw)
df_clean = create_features(df=df_clean)

In [None]:
# Preview the first five rows of the raw variant, including the new features.
df_raw.head(n=5)

In [None]:
# Preview the first five rows of the cleaned variant, including the new features.
df_clean.head(n=5)

In [None]:
# Histogram grid (30 bins each) of the engineered features and the total time
# in the cleaned variant.
df_clean.hist(
    column=[
        "mean_timing",
        "std_timing",
        "min_timing",
        "max_timing",
        "median_timing",
        "total time",
    ],
    bins=30,
    figsize=(12, 8),
)
plt.tight_layout()
plt.show()

In [None]:
# Model inputs: the per-row timing statistics.
# 'total time' is intentionally NOT a feature: the target is computed as
# 120 / total time, so including it would leak the target into the inputs and
# make any evaluation on the test split meaningless.
feature_cols = ['mean_timing', 'std_timing', 'min_timing', 'max_timing', 'median_timing']
target_col = 'WPM'

# RAW
X_raw = df_raw[feature_cols]
y_raw = df_raw[target_col]

# 70/30 split with a fixed seed so the result is reproducible.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X_raw, y_raw, test_size=0.3, random_state=42
)

# CLEANED
X_clean = df_clean[feature_cols]
y_clean = df_clean[target_col]

# Same test_size and random_state as the raw split, so the raw and cleaned
# variants are partitioned with identical settings.
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(
    X_clean, y_clean, test_size=0.3, random_state=42
)

## Data Preprocessing for Fixed Text Keystroke Dynamics
### Cleaning Data, Handling Negative Values (Overlap), and Calculating Target Variable (WPM)

In [None]:
# --- Data Preprocessing for Fixed Text Keystroke Dynamics ---
#
# Reloads the raw recordings, drops rows that carry corrupted sentinel values,
# derives the WPM target and writes an ML-ready CSV next to the notebook.

# --- 1. Load Data ---
# Try the current working directory first, then the project-relative location.
# If neither exists the cell fails loudly: continuing after a failed load
# would silently reuse whatever 'df' an earlier cell left in the kernel (or
# crash later with a NameError on a fresh kernel).
FIXED_TEXT_CANDIDATE_PATHS = (
    "fixed-text.csv",
    "../data/keystrokes/raw/fixed-text.csv",
)

for candidate_path in FIXED_TEXT_CANDIDATE_PATHS:
    try:
        df = pd.read_csv(candidate_path)
    except FileNotFoundError:
        continue
    break
else:
    raise FileNotFoundError(
        "Error: 'fixed-text.csv' not found. Please adjust the file path. "
        f"Tried: {', '.join(FIXED_TEXT_CANDIDATE_PATHS)}"
    )

# Identify all float columns for outlier checking and feature usage.
# NOTE(review): columns that were parsed as a non-float dtype are not covered
# by the outlier check below and do not become features -- confirm that all
# timing columns load as float64.
float_cols = df.select_dtypes(include=['float64']).columns

# --- 2. Handle Corrupted Negative Outliers ---
# Small negative values (representing key overlap) are kept AS-IS, as they are meaningful features.
# Only the massive negative values (sentinel codes for corrupted data) are removed.

OUTLIER_THRESHOLD = -1000000

# Create a mask to identify rows where ANY float column is below the outlier threshold
outlier_rows_mask = (df[float_cols] < OUTLIER_THRESHOLD).any(axis=1)

# Remove the identified outlier rows (copy so later column assignments do not
# write into a view of 'df')
df_cleaned = df[~outlier_rows_mask].copy()

# Report on the cleaning step
removed_rows_count = outlier_rows_mask.sum()
print(f"Removed {removed_rows_count} row(s) containing corrupted sentinel negative values.")
print(f"Remaining rows in dataset: {len(df_cleaned)}")


# --- 3. Calculate Target Variable: Words Per Minute (WPM) ---
# The fixed text is "vp wjkeurkb".
# NOTE(review): 10 counts the letters only; the text has 11 characters when
# the space is included -- confirm which count is intended.
NUM_CHARS = 10

# WPM calculation: (Characters / 5) / (Total Time in minutes)
# NOTE(review): assumes 'total time' is in seconds -- confirm. A total time of
# 0 yields an infinite WPM.
df_cleaned['WPM'] = (NUM_CHARS / 5) / (df_cleaned['total time'] / 60)


# --- 4. Prepare Final Feature Set and Save ---
# Exclude 'total time' from features as WPM is directly derived from it.
# The identifier columns are excluded too, so they cannot appear twice in the
# final frame should any of them have been parsed as float64.
ID_COLS = ['participant', 'session', 'repetition']
feature_cols = [col for col in float_cols if col not in ['total time'] + ID_COLS]

# Final DataFrame structure for ML
df_ml = df_cleaned[ID_COLS + ['WPM'] + feature_cols].copy()

# Save the Cleaned Data
output_filename = "fixed_text_cleaned_for_ml.csv"
df_ml.to_csv(output_filename, index=False)

print(f"\nCleaned and preprocessed data saved to '{output_filename}'.")
print("\nFirst 5 rows of the final ML-ready DataFrame with WPM:")
# Last expression of the cell: rendered with the notebook's rich table display.
df_ml.head()