In [14]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader

from hyperopt import fmin, tpe, hp, Trials

In [2]:
# Load the score table and the event-log table from the working directory.
scores = pd.read_csv("train_scores.csv")
train = pd.read_csv("train_logs.csv")

# 'activity2' is a simplified copy of 'activity': any label containing the
# substring 'Move' is collapsed to plain 'Move'; every other label is kept.
train["activity2"] = train['activity'].apply(
    lambda activity: activity if 'Move' not in activity else 'Move'
)

In [3]:
def final_word_count(df):
    """Return the 'word_count' value found in the last row of ``df``.

    Rows are taken in their current positional order; an empty frame raises
    ``IndexError`` from the positional lookup.
    """
    last_row = df.iloc[-1]
    return last_row['word_count']

def add_features(df):
    """Add per-event indicator/quantity columns to ``df`` and return it.

    ``df`` must provide the columns 'text_change', 'activity2', 'action_time'
    and 'word_count'. The new columns are written onto ``df`` itself (the
    frame is modified in place) and the same object is returned.

    Columns added (a "large" change is a 'text_change' longer than 50 chars):
      unsurity            1 for a large 'Remove/Cut' event, else 0
      structural_change   1 for a large 'Replace' event, else 0
      long_paste          1 for a large 'Paste' event, else 0
      unproductive_time   'action_time' for 'Nonproduction' events, else 0
      external_help       1 for a 'Paste' while 'word_count' < 10, else 0
      pasted_words_number whitespace-separated token count of 'text_change'
                          for 'Paste' events, else 0
      large_changes       1 for any large change, else 0
    """
    activity = df['activity2']
    is_paste = activity == "Paste"
    is_large = df['text_change'].str.len() > 50

    df["unsurity"] = (is_large & (activity == "Remove/Cut")).astype(int)
    df["structural_change"] = (is_large & (activity == "Replace")).astype(int)
    df["long_paste"] = (is_large & is_paste).astype(int)
    # Every feature is derived from the frame that was passed in, not from a
    # module-level DataFrame, so the function works for any compatible frame.
    df["unproductive_time"] = (activity == "Nonproduction") * df['action_time']
    df["external_help"] = ((df['word_count'] < 10) & is_paste).astype(int)
    # .str.len() on the token lists is vectorised and yields NaN (instead of
    # raising) for rows whose 'text_change' is missing.
    df["pasted_words_number"] = is_paste * df['text_change'].str.split().str.len()
    df["large_changes"] = is_large.astype(int)
    return df

def drop_unrelated_features(df, feat):
    """Return a copy of ``df`` without the column(s) named by ``feat``.

    ``feat`` is passed to ``DataFrame.drop`` as the labels to remove along
    axis 1 (columns); the frame passed in is left unchanged.
    """
    return df.drop(feat, axis=1)

In [4]:
# Count the events recorded for every (id, up_event) pair ...
grouped_data = train.groupby(['id', "up_event"])
down = grouped_data["event_id"].count()

# ... and spread those counts into one row per id with one column per
# up_event value. Pairs that never occur become 0.
event_counts_long = down.reset_index()
pivot = pd.pivot_table(
    event_counts_long,
    values="event_id",
    index="id",
    columns="up_event",
)
pivot = pivot.fillna(0)

# Attach the engineered per-event feature columns to the log table.
train = add_features(train)
train.head()

Unnamed: 0,id,event_id,down_time,up_time,action_time,activity,down_event,up_event,text_change,cursor_position,word_count,activity2,unsurity,structural_change,long_paste,unproductive_time,external_help,pasted_words_number,large_changes
0,001519c8,1,4526,4557,31,Nonproduction,Leftclick,Leftclick,NoChange,0,0,Nonproduction,0,0,0,31,0,0,0
1,001519c8,2,4558,4962,404,Nonproduction,Leftclick,Leftclick,NoChange,0,0,Nonproduction,0,0,0,404,0,0,0
2,001519c8,3,106571,106571,0,Nonproduction,Shift,Shift,NoChange,0,0,Nonproduction,0,0,0,0,0,0,0
3,001519c8,4,106686,106777,91,Input,q,q,q,1,1,Input,0,0,0,0,0,0,0
4,001519c8,5,107196,107323,127,Input,q,q,q,2,1,Input,0,0,0,0,0,0,0


In [5]:
def _final_word_count_row(events):
    """Wrap the last 'word_count' of one id's events in a one-entry Series."""
    return pd.Series({'Final Word Count': final_word_count(events)})


# One row per id holding the word count of that id's last logged event.
grouped_data = train.groupby('id')
user = grouped_data.apply(_final_word_count_row)

In [6]:
# One 0/1 indicator column per distinct 'activity2' label, named
# 'activity_<label>'.
one_hot_encoded = pd.get_dummies(train['activity2'], prefix='activity').astype(int)

# Append the indicator columns to the log table, then remove the two source
# columns that are no longer needed.
train = pd.concat([train, one_hot_encoded], axis=1).drop(["activity2", "up_event"], axis=1)


In [7]:
# Columns that are summed within each id; 'event_id' is counted instead.
summed_columns = [
    'action_time',
    'unsurity',
    'structural_change',
    'long_paste',
    'unproductive_time',
    'external_help',
    'pasted_words_number',
    'large_changes',
    'activity_Input',
    'activity_Move',
    'activity_Nonproduction',
    'activity_Paste',
    'activity_Remove/Cut',
    'activity_Replace',
]
agg_columns = {'event_id': 'count'}
agg_columns.update({name: 'sum' for name in summed_columns})

# Collapse the event-level table to one row per id.
user_level = train.groupby('id').agg(agg_columns)
user_level.head()

Unnamed: 0_level_0,event_id,action_time,unsurity,structural_change,long_paste,unproductive_time,external_help,pasted_words_number,large_changes,activity_Input,activity_Move,activity_Nonproduction,activity_Paste,activity_Remove/Cut,activity_Replace
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
001519c8,2557,297243,0,0,0,18506,0,0,0,2010,3,120,0,417,7
0022f953,2454,275391,0,0,0,13781,0,0,0,1938,0,254,1,260,1
0042269b,4136,421201,1,4,0,33951,0,0,5,3515,0,175,0,439,7
0059420b,1556,189596,0,0,0,3062,0,1,0,1304,0,99,1,151,1
0075873a,2531,313702,0,0,0,6988,0,0,0,1942,0,72,0,517,0


In [8]:
# Inner-join the three per-id tables (aggregates, up_event counts, final word
# count) on 'id', then turn 'id' back into an ordinary column.
merged_data = user_level.merge(pivot, on='id', how='inner')
final_merge = merged_data.merge(user, on='id', how='inner').reset_index()
# final_merge.to_csv("train_final.csv", index = None)

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns in the merged per-id table.
final_merge.describe()

Unnamed: 0,event_id,action_time,unsurity,structural_change,long_paste,unproductive_time,external_help,pasted_words_number,large_changes,activity_Input,...,,¡,¿,Â´,Ä±,Å,Ë,â,ä,Final Word Count
count,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,...,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0,2471.0
mean,3401.820316,333667.5,0.352084,0.163497,0.102388,17909.078106,0.004856,5.253339,0.628895,2722.297046,...,0.000405,0.000405,0.000809,0.000405,0.004047,0.000405,0.000405,0.001619,0.000405,386.1121
std,1578.850387,157520.2,0.887996,0.536878,0.424501,32568.430834,0.075129,42.045249,1.458841,1196.384644,...,0.020117,0.020117,0.028444,0.020117,0.14222,0.020117,0.020117,0.080468,0.020117,171.773394
min,262.0,13452.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,230.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.0
25%,2193.5,211148.0,0.0,0.0,0.0,3993.0,0.0,0.0,0.0,1786.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,250.0
50%,3082.0,304951.0,0.0,0.0,0.0,9308.0,0.0,0.0,0.0,2477.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,346.0
75%,4301.0,424814.0,0.0,0.0,0.0,19685.5,0.0,0.0,1.0,3397.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,477.0
max,12876.0,1210508.0,9.0,6.0,6.0,482115.0,2.0,1016.0,17.0,9091.0,...,1.0,1.0,1.0,1.0,5.0,1.0,1.0,4.0,1.0,1326.0


In [10]:
# Log-transform every column except the leading 'id' column. np.log1p is
# applied to the whole block at once instead of element by element.
transformed = final_merge.copy()
col = final_merge.columns
transformed[col[1:]] = np.log1p(final_merge[col[1:]])
# transformed.to_csv("transformed.csv")

# scores = pd.read_csv("train_scores.csv")
# Attach each id's score with an inner merge so that features and target are
# paired by id, not by row position.
final_v1 = pd.merge(transformed, scores, on='id', how='inner')

# final_v1.to_csv("Final_v1.csv")
# Both x and y are taken from the merged frame: pairing `transformed` with
# `scores["score"]` directly would only be correct if the two tables happened
# to contain exactly the same ids in exactly the same order.
feature_columns = [name for name in transformed.columns if name != "id"]
x = final_v1[feature_columns]
y = final_v1["score"]

# Hold out 10% of the rows as the test set, then 10% of the remainder as the
# validation set (both splits seeded for repeatability).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.1, random_state=1
)

x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.1, random_state=1
)

In [13]:
# First pass: fit a PCA that keeps every component, to see how the explained
# variance accumulates.
pca = PCA()
xtrain_pca = pca.fit_transform(x_train)
cumulative_explained_variance = pca.explained_variance_ratio_.cumsum()

# Smallest number of leading components whose cumulative explained variance
# reaches the threshold (argmax returns the first True position).
variance_threshold = 0.95
threshold_reached = cumulative_explained_variance >= variance_threshold
num_components = np.argmax(threshold_reached) + 1

# Second pass: refit on the training split with that many components and
# project the validation and test splits with the same fitted transform.
pca = PCA(n_components=num_components)
xtrain_pca = pca.fit_transform(x_train)
xval_pca = pca.transform(x_val)
xtest_pca = pca.transform(x_test)

# Wrap the projections in DataFrames with columns PC1..PCk.
pc_names = [f'PC{i}' for i in range(1, num_components + 1)]
xtrain_pca_df = pd.DataFrame(data=xtrain_pca, columns=pc_names)
xval_pca_df = pd.DataFrame(data=xval_pca, columns=pc_names)
xtest_pca_df = pd.DataFrame(data=xtest_pca, columns=pc_names)

# Report what was kept.
print("Explained Variance Ratio:", pca.explained_variance_ratio_)
print("Number of Components:", num_components)

Explained Variance Ratio: [0.30224046 0.14672836 0.07983854 0.05341544 0.0506591  0.03920866
 0.03177747 0.02834841 0.02593465 0.01816751 0.01742956 0.01639316
 0.01472954 0.01288028 0.01255977 0.0117721  0.01110459 0.0107669
 0.00875303 0.00839945 0.00647699 0.0062563  0.00602025 0.00575026
 0.00526908 0.00429347 0.0040662  0.00392673 0.00384198 0.00362141]
Number of Components: 30


In [16]:
def objective(params):
    """Hyperopt objective for the gradient-boosting regressor.

    Parameters
    ----------
    params : dict
        One sampled point of the search space with the keys 'learning_rate',
        'n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf'
        and 'subsample'. The integer-valued entries arrive as floats and are
        cast with ``int``.

    Returns
    -------
    float
        RMSE between ``y_train`` and the 5-fold cross-validated predictions
        on ``xtrain_pca_df`` (the value hyperopt minimises).
    """
    model = GradientBoostingRegressor(
        learning_rate=params['learning_rate'],
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        min_samples_split=int(params['min_samples_split']),
        min_samples_leaf=int(params['min_samples_leaf']),
        subsample=params['subsample'],
        # With subsample < 1 the fit is stochastic; a fixed seed makes the
        # score a function of the hyperparameters only, so trials are
        # comparable and the search is repeatable.
        random_state=42
    )

    # Out-of-fold predictions for every training row.
    predictions = cross_val_predict(model, xtrain_pca_df, y_train.values.flatten(), cv=5)

    # RMSE over all out-of-fold predictions.
    rmse = np.sqrt(mean_squared_error(y_train, predictions))

    return rmse

# Search space for the gradient-boosting hyperparameters. quniform with a
# step of 1 yields whole-number floats; loguniform(-5, 0) samples
# exp(uniform(-5, 0)).
space = dict(
    learning_rate=hp.loguniform('learning_rate', -5, 0),
    n_estimators=hp.quniform('n_estimators', 50, 200, 1),
    max_depth=hp.quniform('max_depth', 3, 10, 1),
    min_samples_split=hp.quniform('min_samples_split', 2, 10, 1),
    min_samples_leaf=hp.quniform('min_samples_leaf', 1, 5, 1),
    subsample=hp.uniform('subsample', 0.5, 1),
)

# Tree-structured Parzen Estimator as the suggestion algorithm, with a Trials
# object collecting the result of every evaluation.
tpe_algorithm = tpe.suggest
trials = Trials()

# Minimise the objective over 100 evaluations (adjust max_evals as needed).
best = fmin(
    fn=objective,
    space=space,
    algo=tpe_algorithm,
    max_evals=100,
    trials=trials,
)

print("Best Hyperparameters:", best)

In [None]:
# Rebuild the winning configuration; hyperopt reports the integer-valued
# hyperparameters as floats, so they are cast back to int.
best_params = {
    'learning_rate': best['learning_rate'],
    'n_estimators': int(best['n_estimators']),
    'max_depth': int(best['max_depth']),
    'min_samples_split': int(best['min_samples_split']),
    'min_samples_leaf': int(best['min_samples_leaf']),
    'subsample': best['subsample'],
    # Fixed seed: with subsample < 1 the fit is stochastic, so without it the
    # reported test RMSE would change from run to run.
    'random_state': 42
}


# Refit on the full training split with the selected hyperparameters.
best_model = GradientBoostingRegressor(**best_params)
best_model.fit(xtrain_pca_df, y_train.values.flatten())

# Make predictions on the test set
test_predictions = best_model.predict(xtest_pca_df)


# RMSE on the held-out test split.
test_rmse = np.sqrt(mean_squared_error(y_test, test_predictions))
print("Test RMSE:", test_rmse)

In [None]:
def objective(params):
    """Hyperopt objective for the random-forest regressor.

    Parameters
    ----------
    params : dict
        One sampled point of the search space with the keys 'n_estimators',
        'max_depth', 'min_samples_split', 'min_samples_leaf' and
        'max_features'. The integer-valued entries arrive as floats and are
        cast with ``int``.

    Returns
    -------
    float
        RMSE between ``y_train`` and the 5-fold cross-validated predictions
        on ``xtrain_pca_df`` (the value hyperopt minimises).
    """
    # RandomForestRegressor must be imported from sklearn.ensemble in the
    # notebook's import cell for this to run.
    model = RandomForestRegressor(
        n_estimators=int(params['n_estimators']),
        max_depth=int(params['max_depth']),
        min_samples_split=int(params['min_samples_split']),
        min_samples_leaf=int(params['min_samples_leaf']),
        max_features=params['max_features'],
        random_state=42  # Set a fixed random state for reproducibility
    )

    # Out-of-fold predictions for every training row.
    predictions = cross_val_predict(model, xtrain_pca_df, y_train.values.flatten(), cv=5)

    # RMSE over all out-of-fold predictions.
    rmse = np.sqrt(mean_squared_error(y_train, predictions))

    return rmse

# Search space for the random-forest hyperparameters. quniform with a step of
# 1 yields whole-number floats; max_features is sampled as a fraction.
space = dict(
    n_estimators=hp.quniform('n_estimators', 50, 200, 1),
    max_depth=hp.quniform('max_depth', 3, 10, 1),
    min_samples_split=hp.quniform('min_samples_split', 2, 10, 1),
    min_samples_leaf=hp.quniform('min_samples_leaf', 1, 5, 1),
    max_features=hp.uniform('max_features', 0.1, 1.0),
)

# Tree-structured Parzen Estimator as the suggestion algorithm, with a Trials
# object collecting the result of every evaluation.
tpe_algorithm = tpe.suggest
trials = Trials()

# Minimise the objective over 100 evaluations (adjust max_evals as needed).
best = fmin(
    fn=objective,
    space=space,
    algo=tpe_algorithm,
    max_evals=100,
    trials=trials,
)

print("Best Hyperparameters:", best)

In [None]:
# Rebuild the winning configuration: the integer-valued hyperparameters come
# back from hyperopt as floats and are cast to int; max_features is used as
# reported, and the seed is fixed for reproducibility.
integer_params = ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf')
best_params = {name: int(best[name]) for name in integer_params}
best_params['max_features'] = best['max_features']
best_params['random_state'] = 42

# Refit on the full training split with the selected hyperparameters.
best_model = RandomForestRegressor(**best_params)
best_model.fit(xtrain_pca_df, y_train.values.flatten())

# Score the held-out test split.
test_predictions = best_model.predict(xtest_pca_df)
test_mse = mean_squared_error(y_test, test_predictions)
test_rmse = np.sqrt(test_mse)
print("Test RMSE:", test_rmse)

In [None]:
class MLPRegression(nn.Module):
    """Fully connected regressor: input -> 1024 -> 512 -> 64 -> output.

    ReLU is applied after each of the three hidden layers; the final layer is
    linear (no activation).
    """

    def __init__(self, input_size, output_size):
        """Create the four linear layers for the given input/output widths."""
        super().__init__()
        self.fc1 = nn.Linear(input_size, 1024)
        self.fc2 = nn.Linear(1024, 512)
        self.fc3 = nn.Linear(512, 64)
        self.fc4 = nn.Linear(64, output_size)

    def forward(self, x):
        """Run ``x`` through the hidden layers and the linear output layer."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        return self.fc4(hidden)


# Fix the seed so weight initialisation and batch shuffling are repeatable.
torch.manual_seed(42)

# Targets are reshaped to (n, 1) so they match the model output. With (n,)
# targets MSELoss would broadcast (batch, 1) against (batch,) into a
# (batch, batch) matrix and optimise the wrong quantity.
X_train_tensor = torch.tensor(xtrain_pca_df.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1)

# NOTE: despite the "test" in their names, these two tensors hold the
# validation split (xval_pca_df / y_val).
X_test_tensor = torch.tensor(xval_pca_df.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_val.values, dtype=torch.float32).unsqueeze(1)

# Define the model
input_size = xtrain_pca_df.shape[1]
output_size = 1
model = MLPRegression(input_size, output_size)


criterion = nn.MSELoss()

# Define the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.0001)  # adjust the learning rate here

# Create a DataLoader for batch training
batch_size = 64
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


val_dataset = TensorDataset(X_test_tensor, y_test_tensor)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

epoch_list = []
training_error = []
validation_error = []

num_epochs = 200  # adjust the number of epochs here
for epoch in range(num_epochs):
    model.train()
    train_squared_error = 0.0
    epoch_list.append(epoch)

    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        batch_mse = criterion(outputs, targets)
        # The optimised loss is the per-batch RMSE.
        loss = torch.sqrt(batch_mse)
        loss.backward()
        optimizer.step()

        # Accumulate the summed squared error so the epoch figure is a true
        # RMSE over all samples rather than a mean of per-batch RMSEs.
        train_squared_error += batch_mse.item() * inputs.size(0)

    average_loss = (train_squared_error / len(train_dataset)) ** 0.5

    model.eval()
    val_squared_error = 0.0

    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            # .item() keeps plain floats in the history instead of tensors.
            val_squared_error += criterion(outputs, targets).item() * inputs.size(0)

    average_val_loss = (val_squared_error / len(val_dataset)) ** 0.5

    # print(f'Epoch {epoch + 1}/{num_epochs}, Training Loss: {average_loss:.4f} | Validation Loss: {average_val_loss:.4f}')
    training_error.append(average_loss)
    validation_error.append(average_val_loss)

# Learning curves for the training and validation splits.
plt.plot(epoch_list, training_error, label='Training Loss',linestyle='-')
plt.plot(epoch_list, validation_error, label='Validation Loss',linestyle='-')
plt.xlabel('Epoch')
plt.ylabel('Root Mean Squared Error')
plt.title('Training and Validation Loss Over Epochs')
plt.legend()
# Best (lowest) validation RMSE reached during training.
print(min(validation_error))
plt.show()


In [None]:
# Targets are reshaped to (n, 1) so they match the model output. With (n,)
# targets MSELoss would broadcast (n, 1) against (n,) into an (n, n) matrix
# and report a wrong RMSE.
X_test_tensor = torch.tensor(xtest_pca_df.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).unsqueeze(1)

# Put the model in evaluation mode
model.eval()

# Predict and score the test split without tracking gradients.
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    test_loss = torch.sqrt(criterion(test_outputs, y_test_tensor))

print("Test RMSE:", test_loss.item())