<a href="https://colab.research.google.com/github/lygitdata/aml_project/blob/main/project2/model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [1]:
import csv
import os
import biosppy.signals.ecg as ecg
import biosppy
import neurokit2 as nk
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
from scipy import stats
from scipy.signal import find_peaks, welch
from scipy.stats import kurtosis, skew
import pywt
import multiprocessing as mp
from tqdm import tqdm
import pickle
from imblearn import over_sampling, pipeline
import lightgbm as lgb
import catboost as cat
from xgboost import XGBRegressor
from sklearn import ensemble, model_selection, preprocessing, svm, linear_model
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, Matern, RationalQuadratic
from sklearn.metrics import r2_score, mean_squared_error

# Change this path to the folder that has your data
fpath = "aml_p2/data/"
RANDOM_STATE = 88

# Load data

In [4]:
X_train = np.load(f"{fpath}X_train.npy", allow_pickle=True)
X_test = np.load(f"{fpath}X_test.npy", allow_pickle=True)
y_train = np.load(f"{fpath}y_train.npy")

# Data processing

In [5]:
from sklearn.utils.class_weight import compute_class_weight

# Class labels
class_labels = np.unique(y_train)  # Assuming one-hot encoded y_train_updated

# Compute weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=class_labels,
    y=y_train  # Convert one-hot to class indices
)

# Convert to dictionary format
class_weights = dict(enumerate(class_weights))

print("Class Weights:", class_weights)


Class Weights: {0: 0.4221947194719472, 1: 2.8876975169300225, 2: 0.867876526458616, 3: 7.525}


In [6]:
print(
    "X_train shape: ",
    X_train.shape,
    "\nX_test shape",
    X_test.shape,
)

X_train shape:  (5117, 359) 
X_test shape (3411, 359)


In [14]:
# Replace None with np.nan
X_train = np.where(X_train == None, np.nan, X_train)
X_test = np.where(X_test == None, np.nan, X_test)

# Check for infinity or NaN values in X_train and replace them with a large finite number or the mean of the column
X_train = np.nan_to_num(X_train, nan=np.nanmean(X_train), posinf=np.finfo(np.float64).max, neginf=np.finfo(np.float64).min)
X_test = np.nan_to_num(X_test, nan=np.nanmean(X_test), posinf=np.finfo(np.float64).max, neginf=np.finfo(np.float64).min)

print("Class Weights:", class_weights)
print("X_train shape: ", X_train.shape, "\nX_test shape", X_test.shape)

Class Weights: {0: 0.4221947194719472, 1: 2.8876975169300225, 2: 0.867876526458616, 3: 7.525}
X_train shape:  (5117, 359) 
X_test shape (3411, 359)


In [25]:
X_train = X_train.astype(np.float64)
X_test = X_test.astype(np.float64)

# Ensure no infinity or too large values
X_train = np.nan_to_num(X_train, nan=np.nanmean(X_train), posinf=np.finfo(np.float64).max, neginf=np.finfo(np.float64).min)
X_test = np.nan_to_num(X_test, nan=np.nanmean(X_test), posinf=np.finfo(np.float64).max, neginf=np.finfo(np.float64).min)

print("Data types in X_train:", set(map(type, X_train.flatten())))
print("Data types in X_test:", set(map(type, X_test.flatten())))
print("Largest value in X_train:", np.max(X_train))
print("Largest value in X_test:", np.max(X_test))
# Check the 20 largest values in X_train and X_test
largest_values_train = np.sort(X_train.flatten())[-20:]
largest_values_test = np.sort(X_test.flatten())[-20:]

print("20 largest values in X_train:", largest_values_train)
print("20 largest values in X_test:", largest_values_test)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


Data types in X_train: {<class 'numpy.float64'>}
Data types in X_test: {<class 'numpy.float64'>}
Largest value in X_train: 1.7976931348623157e+308
Largest value in X_test: 18054887424.0
20 largest values in X_train: [1.79769313e+308 1.79769313e+308 1.79769313e+308 1.79769313e+308
 1.79769313e+308 1.79769313e+308 1.79769313e+308 1.79769313e+308
 1.79769313e+308 1.79769313e+308 1.79769313e+308 1.79769313e+308
 1.79769313e+308 1.79769313e+308 1.79769313e+308 1.79769313e+308
 1.79769313e+308 1.79769313e+308 1.79769313e+308 1.79769313e+308]
20 largest values in X_test: [4.41139866e+09 4.62249216e+09 4.62445722e+09 4.64018125e+09
 4.69550490e+09 5.12602061e+09 5.23699661e+09 5.43251251e+09
 5.45307443e+09 6.16422400e+09 6.87561574e+09 7.21413530e+09
 7.69713510e+09 7.79243622e+09 8.43129702e+09 8.59020902e+09
 9.55752243e+09 1.11652229e+10 1.12790702e+10 1.80548874e+10]


# Model training

In [15]:
model = pipeline.make_pipeline(
    # over_sampling.RandomOverSampler(random_state=RANDOM_STATE),
    preprocessing.StandardScaler(),
    ensemble.StackingClassifier(
        estimators = [
            ("hgb", ensemble.HistGradientBoostingClassifier(l2_regularization=0.15, max_iter=400, random_state=0)),
            ("xgb", XGBRegressor(n_estimators=2000, learning_rate=0.11, max_depth=16, alphha=0.2, verbosity=0, random_state=0)),
            ("lgb", lgb.LGBMClassifier(n_estimators=2000, learning_rate=0.11, num_leaves=16, num_threads=128, verbose=0, random_state=0)),
        ],
        final_estimator=linear_model.RidgeClassifierCV()
    )
)

In [23]:
score = model_selection.cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=6,
    n_jobs=-1
)
print("Mean F1 score: ", score.mean(), "\nStd. F1 score: ", score.std())

ValueError: 
All the 6 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Python\Python310\lib\site-packages\imblearn\utils\fixes.py", line 85, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Python\Python310\lib\site-packages\imblearn\pipeline.py", line 329, in fit
    Xt, yt = self._fit(X, y, routed_params)
  File "c:\Python\Python310\lib\site-packages\imblearn\pipeline.py", line 255, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "c:\Python\Python310\lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
  File "c:\Python\Python310\lib\site-packages\imblearn\pipeline.py", line 1104, in _fit_transform_one
    res = transformer.fit_transform(X, y, **params.get("fit_transform", {}))
  File "c:\Python\Python310\lib\site-packages\sklearn\base.py", line 870, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "c:\Python\Python310\lib\site-packages\sklearn\preprocessing\_data.py", line 809, in fit
    return self.partial_fit(X, y, sample_weight)
  File "c:\Python\Python310\lib\site-packages\sklearn\preprocessing\_data.py", line 844, in partial_fit
    X = self._validate_data(
  File "c:\Python\Python310\lib\site-packages\sklearn\base.py", line 577, in _validate_data
    X = check_array(X, input_name="X", **check_params)
  File "c:\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 899, in check_array
    _assert_all_finite(
  File "c:\Python\Python310\lib\site-packages\sklearn\utils\validation.py", line 146, in _assert_all_finite
    raise ValueError(msg_err)
ValueError: Input X contains infinity or a value too large for dtype('float64').


# Generate prediction

In [34]:
def create_submission(model, X_train, y_train, X_test):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    pred = np.vstack((np.arange(X_test.shape[0]), pred)).T
    np.savetxt("submission.csv", pred, delimiter=",", header="id,y", comments="")


create_submission(model, X_train, y_train, X_test)