In [10]:
import torch
import os
import torch.nn as nn
import pandas as pd
import numpy as np
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import KFold
import numpy as np
import sys
sys.path.insert(1, os.getcwd())

from data_processing.feature_engineering import *

# CFG


In [2]:
# Data
DATA_ROOT = "/Users/kaiqu/kaggle-datasets/linking-writing-processes-to-writing-quality"
MODEL_PATH = "../SavedModels/"
df = pd.read_csv(f'{DATA_ROOT}/train_logs.csv')
df_score = pd.read_csv(f'{DATA_ROOT}/train_scores.csv')

In [3]:
class CFG:
    random_state = 42
    
    model_name = 'cnn'
    lr = 0.001

# Data

In [4]:
# Getting features
def get_all_features(df):
    """
    Get all features for the given dataframe.

    Parameters:
    df (pandas.DataFrame): The input dataframe.

    Returns:
    pandas.DataFrame: The dataframe with all the features.
    """
    feature_df = get_paragraph_features(df)
    return feature_df


def get_word_features(df, essay_df=None):
    pass


def get_sentence_features(df, essay_df=None):
    pass


# ? Do we have to make sure that the passed df are not changed in anyway?
# ? What actions of df will return a new df?
def get_paragraph_features(df, essay_df=None):
    feature_df = df[["id"]].copy()
    # get the word length features
    # TODO: need to rewrite the below shit; figure out a more elegant way to make sure that the indices are aligned
    df_max_event = df.loc[df.groupby("id")["event_id"].idxmax()]
    feature_df = feature_df.merge(df_max_event, on="id", how="left")
    feature_df = feature_df[["word_count"]].copy()
    # print(feature_df.head())
    return feature_df

# Model

In [5]:
class KeystrokeCNN(nn.Module):
    def __init__(self, num_features):
        super(KeystrokeCNN, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv1d(in_channels=num_features, out_channels=64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2)
        )
        self.layer2 = nn.Sequential(
            nn.Conv1d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=2, stride=2)
        )
        self.drop_out = nn.Dropout()
        self.fc1 = nn.Linear(128, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)  # Flatten the output
        out = self.drop_out(out)
        out = self.fc1(out)
        out = self.fc2(out)
        return out

# Training

In [6]:
def train():
    # 5-fold cross validation
    train_data = df.merge(df_score, on="id")

    X = get_all_features(df)
    y = train_data["score"]
    kf = KFold(n_splits=5, shuffle=True, random_state=CFG.random_state)

    for fold, (train_index, valid_index) in enumerate(kf.split(X, y)):
        print(f"Training fold {fold + 1}")
        X_train, X_val = X.iloc[train_index], X.iloc[valid_index]
        y_train, y_val = y.iloc[train_index], y.iloc[valid_index]
        model = train_one_loop(X_train, X_val, y_train, y_val)
        torch.save(model.state_dict(), f'{MODEL_PATH}/{CFG.model_name}_{fold}.pth')


def train_one_loop(X_train, X_val, y_train, y_val):
    # Convert data to PyTorch tensors
    X_train_tensor = torch.Tensor(X_train)
    X_val_tensor = torch.Tensor(X_val)
    y_train_tensor = torch.Tensor(y_train)
    y_val_tensor = torch.Tensor(y_val)

    # Create Tensor datasets
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

    # Data loaders
    train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)

    # Model
    model = KeystrokeCNN(num_features=X_train.shape[1])
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Training Loop
    num_epochs = 10
    for epoch in range(num_epochs):
        model.train()
        for i, (inputs, labels) in enumerate(train_loader):
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs)
                val_loss += criterion(outputs, labels).item()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Validation Loss: {val_loss/len(val_loader):.4f}')
    return model


In [8]:
train()

Training fold 1


ValueError: could not determine the shape of object type 'DataFrame'

In [7]:
# Evaluation
# from math import sqrt

# model.eval()
# total_val_loss = 0
# with torch.no_grad():
#     for inputs, labels in val_loader:
#         outputs = model(inputs)
#         total_val_loss += criterion(outputs, labels).item()

# rmse = sqrt(total_val_loss / len(val_loader))
# print(f'RMSE: {rmse}')