In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
import xgboost as xgb

In [1]:
def encoder(df, col_split):
    """One-hot encode the categorical columns and standardize the continuous ones.

    The columns of ``df`` are partitioned by position: the first ``col_split``
    columns are treated as categorical, every remaining column as continuous.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table. It is NOT modified; the previous implementation wrote
        the scaled values back into the caller's frame, which made a second
        call re-scale already-scaled data.
    col_split : int
        Positional index of the first continuous column.

    Returns
    -------
    scipy.sparse matrix or numpy.ndarray
        One-hot columns first, followed by the standardized continuous columns
        in their original order. ``ColumnTransformer`` decides between a sparse
        and a dense result based on its ``sparse_threshold``.
    """
    categorical_cols = df.columns[:col_split]
    continuous_cols = df.columns[col_split:]

    # A single ColumnTransformer handles both groups. Transformers are stacked
    # in the order listed, so the output layout matches "one-hot block, then
    # scaled block". Empty column selections are skipped by ColumnTransformer,
    # so col_split == 0 or col_split == len(df.columns) no longer raises.
    # The local is deliberately not called `encoder`, to avoid shadowing this
    # function's own name.
    transformer = ColumnTransformer(transformers=[
        ('cat', OneHotEncoder(sparse_output=True), categorical_cols),
        ('num', StandardScaler(), continuous_cols),
    ])
    return transformer.fit_transform(df)


In [None]:
def load_data(col_split=None, random_state=42):
    """Read the feature table and labels from disk and split them into train/test sets.

    Parameters
    ----------
    col_split : int, optional
        When given, the feature table is passed through
        ``encoder(df, col_split)`` (one-hot encoding of the first ``col_split``
        columns, standardization of the rest) before splitting. When ``None``
        (default) the table is used exactly as read from the CSV.
        NOTE(review): the right value depends on the CSV layout, which is not
        visible here -- confirm before enabling.
    random_state : int or None, default 42
        Seed forwarded to ``train_test_split``. A fixed seed means every cell
        that calls ``load_data()`` gets the same split, so parameters tuned in
        one cell are not later scored on rows they were tuned on. Pass ``None``
        for a fresh random split.

    Returns
    -------
    X_train, X_test, y_train, y_test
        The outputs of ``sklearn.model_selection.train_test_split``.
    """
    df = pd.read_csv('beta_dates/beta_data_7_60.csv', index_col=0)
    labels = pd.read_csv('beta_dates/true_labels.csv', index_col=0).values[:, 0]

    # Shift every label up by one.
    # NOTE(review): presumably the raw labels start at -1 and the classifiers
    # need 0-based class ids -- confirm against true_labels.csv.
    y = labels + 1

    # The original body split an undefined name `X` (NameError on every call);
    # the features come from the table that was just loaded.
    X = df if col_split is None else encoder(df, col_split)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, shuffle=True, random_state=random_state
    )
    return X_train, X_test, y_train, y_test


# XGBoost Grid Search

In [None]:
X_train, X_test, y_train, y_test = load_data()

# Pipeline: truncated-SVD dimensionality reduction followed by an XGBoost
# multiclass classifier. Both steps are seeded so reruns are reproducible
# (TruncatedSVD's default solver is randomized).
pipe = Pipeline([
        ('svd', TruncatedSVD(random_state=42)),
        ('xgb', xgb.XGBClassifier(objective='multi:softmax', random_state=42)),
        ])

# Hyper-parameter grid; keys use the "<step>__<param>" pipeline convention.
param_grid = {
        'svd__n_components': [3,4,5],  # number of SVD components kept
        'xgb__n_estimators': [80, 110, 200],  # number of boosted trees
        'xgb__learning_rate': [0.01, 0.5, 0.1, 1],  # boosting step size
        'xgb__max_depth': [3,4],  # maximum tree depth
        'xgb__min_child_weight': [2, 4, 7],  # minimum sum of instance weight in a child
        'xgb__gamma': [0.3, 0.5],  # minimum loss reduction required to split
        'xgb__subsample': [0.5, 0.7, 0.8],  # row subsample ratio per tree
        'xgb__colsample_bytree': [0.8, 1],  # column subsample ratio per tree
        'xgb__reg_alpha': [1,2,3],  # L1 regularization term
        'xgb__reg_lambda': [1,2,3],  # L2 regularization term
        }

# Exhaustive search: 23,328 parameter combinations x 3 folds = 69,984 fits.
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# GridSearchCV (refit=True by default) has already refit the best pipeline on
# the whole training set, so best_estimator_ is used as-is; fitting it again
# would only repeat that work.
best_model = grid_search.best_estimator_

# Evaluate on the held-out test split
predictions = best_model.predict(X_test)
print(classification_report(y_test, predictions))

print(grid_search.best_params_)
print(grid_search.best_score_)

# Show the ten best candidates as a table instead of dumping the raw
# cv_results_ dict (one array entry per candidate for every key).
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')
cv_results[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].head(10)

# Build the best model

In [None]:
X_train, X_test, y_train, y_test = load_data()

# Same SVD + XGBoost pipeline as in the grid search, seeded for reproducibility.
pipe = Pipeline([
        ('svd', TruncatedSVD(random_state=42)),
        ('xgb', xgb.XGBClassifier(objective='multi:softmax', random_state=42)),
        ])

# Fixed parameter set (not a grid: one value per key).
# NOTE(review): presumably copied from the XGBoost grid search's best_params_
# output -- update it if the search is rerun.
best_params = {
    'svd__n_components': 5,
    'xgb__colsample_bytree': 0.8,
    'xgb__gamma': 0.5,
    'xgb__learning_rate': 0.1,
    'xgb__max_depth': 3,
    'xgb__min_child_weight': 2,
    'xgb__n_estimators': 200,
    'xgb__reg_alpha': 1,
    'xgb__reg_lambda': 2,
    'xgb__subsample': 0.7,
}

# Baseline: pipeline with default parameters. The score is stored and printed
# because only the last expression of a cell is displayed automatically, so the
# original bare expression was computed and then silently discarded.
baseline_score = pipe.fit(X_train, y_train).score(X_test, y_test)
print(f'Test accuracy (default parameters): {baseline_score:.4f}')

# Same pipeline refit with the tuned parameters.
tuned_score = pipe.set_params(**best_params).fit(X_train, y_train).score(X_test, y_test)
print(f'Test accuracy (tuned parameters):   {tuned_score:.4f}')

# Random Forest Grid Search

In [None]:
X_train, X_test, y_train, y_test = load_data()

# Pipeline: truncated-SVD dimensionality reduction followed by a random forest.
# Both steps are seeded so reruns of the search give the same ranking.
pipe = Pipeline([
        ('svd', TruncatedSVD(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
])

# Hyper-parameter grid; keys use the "<step>__<param>" pipeline convention.
param_grid = {
        'svd__n_components': [3,4,5],  # number of SVD components kept
        'rf__n_estimators': [50, 100, 200],  # Number of trees in the forest
        'rf__max_depth': [10, 20, 30],  # Maximum depth of each tree
        'rf__min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
        'rf__min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at each leaf node
        'rf__max_features': ['sqrt', 'log2'],  # Number of features to consider when looking for the best split
}

# Exhaustive search: 486 parameter combinations x 3 folds = 1,458 fits.
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# GridSearchCV (refit=True by default) has already refit the best pipeline on
# the whole training set, so best_estimator_ is used as-is; fitting it again
# would only repeat that work.
best_model = grid_search.best_estimator_

# Evaluate on the held-out test split
predictions = best_model.predict(X_test)
print(classification_report(y_test, predictions))

print(grid_search.best_params_)
print(grid_search.best_score_)

# Show the ten best candidates as a table instead of dumping the raw
# cv_results_ dict (one array entry per candidate for every key).
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')
cv_results[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].head(10)

# Logistic Regression Grid Search

In [None]:
# NOTE: this cell reuses X_train / X_test / y_train / y_test from the cell
# above; it does not load the data itself.

# Pipeline: truncated-SVD dimensionality reduction followed by logistic
# regression. Both steps are seeded (the 'sag' and 'saga' solvers are stochastic).
pipe = Pipeline([
        ('svd', TruncatedSVD(random_state=42)),
        ('logreg', LogisticRegression(random_state=42)),
])

# The grid is split in two because not every solver supports every penalty:
# 'newton-cg', 'lbfgs' and 'sag' only handle 'l2', while 'saga' handles both.
# A single crossed grid makes 72 of its 192 candidates fail to fit (scored as
# NaN); the two sub-grids cover exactly the 120 valid candidates.
# `multi_class` is deprecated in recent scikit-learn releases; drop that entry
# if the installed version no longer accepts it.
param_grid = [
        {
                'svd__n_components': [3,4,5],
                'logreg__C': [0.1, 1, 10, 100],
                'logreg__penalty': ['l2'],
                'logreg__multi_class': ['ovr', 'multinomial'],
                'logreg__solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
        },
        {
                'svd__n_components': [3,4,5],
                'logreg__C': [0.1, 1, 10, 100],
                'logreg__penalty': ['l1'],
                'logreg__multi_class': ['ovr', 'multinomial'],
                'logreg__solver': ['saga'],
        },
]

# Exhaustive search over both sub-grids with 3-fold cross-validation.
grid_search = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# GridSearchCV (refit=True by default) has already refit the best pipeline on
# the whole training set, so best_estimator_ is used as-is; fitting it again
# would only repeat that work.
best_model = grid_search.best_estimator_

# Evaluate on the held-out test split
predictions = best_model.predict(X_test)
print(classification_report(y_test, predictions))

print(grid_search.best_params_)
print(grid_search.best_score_)

# Show the ten best candidates as a table instead of dumping the raw
# cv_results_ dict (one array entry per candidate for every key).
cv_results = pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')
cv_results[['params', 'mean_test_score', 'std_test_score', 'rank_test_score']].head(10)
