<a href="https://colab.research.google.com/github/lygitdata/aml_project/blob/main/project2/model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [3]:
import csv
import os
import biosppy.signals.ecg as ecg
import biosppy
import neurokit2 as nk
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
from scipy.signal import find_peaks, welch
from scipy.stats import kurtosis, skew
import pywt
import multiprocessing as mp
from tqdm import tqdm
import pickle
from imblearn import over_sampling, pipeline
import lightgbm as lgb
import catboost as cat
from xgboost import XGBClassifier
from sklearn import ensemble, model_selection, preprocessing, svm, linear_model
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic
from sklearn.metrics import r2_score, mean_squared_error

# Change this path to the folder that has your data
fpath = "aml_p2/data/"
RANDOM_STATE = 88

# Load data

In [4]:
X_train = np.load(f"{fpath}X_train.npy", allow_pickle=True)
X_test = np.load(f"{fpath}X_test.npy", allow_pickle=True)
y_train = np.load(f"{fpath}y_train.npy")

# Data processing

In [5]:
from sklearn.utils.class_weight import compute_class_weight

# Class labels
class_labels = np.unique(y_train)  # Assuming one-hot encoded y_train_updated

# Compute weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=class_labels,
    y=y_train  # Convert one-hot to class indices
)

# Convert to dictionary format
class_weights = dict(enumerate(class_weights))

print("Class Weights:", class_weights)


Class Weights: {0: 0.4221947194719472, 1: 2.8876975169300225, 2: 0.867876526458616, 3: 7.525}


In [6]:
print(
    "X_train shape: ",
    X_train.shape,
    "\nX_test shape",
    X_test.shape,
)

X_train shape:  (5117, 328) 
X_test shape (3411, 328)


In [7]:
# Replace None with np.nan
X_train = np.where(X_train == None, np.nan, X_train)
X_test = np.where(X_test == None, np.nan, X_test)

# Check for infinity or NaN values in X_train and replace them with a large finite number or the mean of the column
X_train = np.nan_to_num(X_train, nan=np.nanmean(X_train), posinf=np.finfo(np.float64).max, neginf=np.finfo(np.float64).min)
X_test = np.nan_to_num(X_test, nan=np.nanmean(X_test), posinf=np.finfo(np.float64).max, neginf=np.finfo(np.float64).min)

print("Class Weights:", class_weights)
print("X_train shape: ", X_train.shape, "\nX_test shape", X_test.shape)

Class Weights: {0: 0.4221947194719472, 1: 2.8876975169300225, 2: 0.867876526458616, 3: 7.525}
X_train shape:  (5117, 328) 
X_test shape (3411, 328)


In [8]:
X_train = X_train.astype(np.float64)
X_test = X_test.astype(np.float64)

# Ensure no infinity or too large values
X_train = np.nan_to_num(X_train, nan=np.nanmean(X_train), posinf=np.finfo(np.float64).max, neginf=np.finfo(np.float64).min)
X_test = np.nan_to_num(X_test, nan=np.nanmean(X_test), posinf=np.finfo(np.float64).max, neginf=np.finfo(np.float64).min)

print("Data types in X_train:", set(map(type, X_train.flatten())))
print("Data types in X_test:", set(map(type, X_test.flatten())))
print("Largest value in X_train:", np.max(X_train))
print("Largest value in X_test:", np.max(X_test))

# Find the indices of the largest values
largest_value_indices_train = np.unravel_index(np.argsort(X_train, axis=None)[-20:], X_train.shape)
largest_value_indices_test = np.unravel_index(np.argsort(X_test, axis=None)[-20:], X_test.shape)

print("Indices of the 20 largest values in X_train:", largest_value_indices_train)
print("Indices of the 20 largest values in X_test:", largest_value_indices_test)

# # Discard columns with the largest values
# columns_to_discard_train = np.unique(largest_value_indices_train[1])
# columns_to_discard_test = np.unique(largest_value_indices_test[1])

# X_train = np.delete(X_train, columns_to_discard_train, axis=1)
# X_test = np.delete(X_test, columns_to_discard_test, axis=1)

# print("X_train shape after discarding columns:", X_train.shape)
# print("X_test shape after discarding columns:", X_test.shape)
# Check the 20 largest values in X_train and X_test
largest_values_train = np.sort(X_train.flatten())[-20:]
largest_values_test = np.sort(X_test.flatten())[-20:]

print("20 largest values in X_train:", largest_values_train)
print("20 largest values in X_test:", largest_values_test)

Data types in X_train: {<class 'numpy.float64'>}
Data types in X_test: {<class 'numpy.float64'>}
Largest value in X_train: 13334195200.0
Largest value in X_test: 18054887424.0
Indices of the 20 largest values in X_train: (array([1147, 1476, 2949, 2370, 3078, 1380, 1520, 1579, 2626,  415, 4574,
        715, 3098, 2135, 1832, 4341, 1998, 4945,  840, 2369], dtype=int64), array([322, 322, 322, 322, 322, 322, 322, 322, 322, 322, 322, 322, 322,
       322, 322, 322, 322, 322, 322, 322], dtype=int64))
Indices of the 20 largest values in X_test: (array([1276, 1958,  225, 2934,  337,  619, 2937, 1207, 1670, 2865, 2600,
       2301, 2809, 2764, 2213,  980,  118,  612,  164, 2872], dtype=int64), array([322, 322, 322, 322, 322, 322, 322, 322, 322, 322, 322, 322, 322,
       322, 322, 322, 322, 322, 322, 322], dtype=int64))
20 largest values in X_train: [4.78489856e+09 4.81385574e+09 4.84008141e+09 5.07735296e+09
 5.38971853e+09 5.42992640e+09 5.47360512e+09 5.72462797e+09
 5.86773658e+09 5.9398031

# Model training

In [9]:
model = pipeline.make_pipeline(
    # over_sampling.RandomOverSampler(random_state=RANDOM_STATE),
    preprocessing.StandardScaler(),
    ensemble.StackingClassifier(
        estimators = [
            ("hgb", ensemble.HistGradientBoostingClassifier(l2_regularization=0.15, max_iter=400, random_state=0)),
            ("xgb", XGBClassifier(n_estimators=2000, learning_rate=0.11, max_depth=16, alphha=0.2, verbosity=0, random_state=0)),
            ("lgb", lgb.LGBMClassifier(n_estimators=2000, learning_rate=0.11, num_leaves=16, num_threads=128, verbose=0, random_state=0)),
        ],
        final_estimator=linear_model.RidgeClassifierCV()
    )
)

In [48]:
score = model_selection.cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=6,
    n_jobs=-1
)
print("Mean F1 score: ", score.mean(), "\nStd. F1 score: ", score.std())

Mean F1 score:  0.789718740998813 
Std. F1 score:  0.008571388027527448


# Generate prediction

In [10]:
def create_submission(model, X_train, y_train, X_test):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    pred = np.vstack((np.arange(X_test.shape[0]), pred)).T
    np.savetxt("submission.csv", pred, delimiter=",", header="id,y", comments="")


create_submission(model, X_train, y_train, X_test)

