In [1]:
import pandas as pd
import numpy as np
%matplotlib inline

# Additional packages 
import random
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, mean_squared_error, accuracy_score, f1_score, balanced_accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import AdaBoostClassifier
from numpy import mean
from numpy import std
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.datasets import make_classification
###############################################################################
df = pd.read_csv('mle_assignment1_2024.csv')
df.info()

# Create Traning and Test Sets
df_train = df[:3000]
df_test = df[3000:]
n_test = df_test.shape[0]

df_train.info()
df_test.info()

# Create the dictionary variables to save your forecasts
# Make sure to overwrite the arrays with the predicted labels in {-1,1}

Y_pred = dict()
Y_pred['SVM'] = np.zeros((n_test,)) # Support Vector Machine
Y_pred['NN'] = np.zeros((n_test,))  # Shallow Neural Network
Y_pred['AB'] = np.zeros((n_test,))  # AdaBoost
Y_pred['AC'] = np.zeros((n_test,))  # Advanced Classifier

ModuleNotFoundError: No module named 'pandas'

# Basic Support Vector Machine

Let's start with a support vector machine as our baseline. Save your forecasts on the test set in `Y_pred['SVM']`.

In [2]:
# Separate X and y in train and test sets
X_train, y_train = df_train.drop(['target'], axis=1), df_train['target']
X_test = df_test.drop(['target'], axis=1)

In [3]:
random.seed(2024) # Set random seed
SVM = LinearSVC(max_iter=1000, loss='squared_hinge', dual=False, random_state=42)
SVM.fit(X_train, y_train)
Y_pred['SVM'] = SVM.predict(X_test); Y_pred['SVM'] # Save the predictions to the 'SVM' column

array([-1.,  1., -1., ...,  1.,  1.,  1.])

In [4]:
# Perform grid-search to find the optimal value of C and number of max iterations, and evaluate the optimized model

# Define scoring metrics
scoring = {
    'accuracy': 'accuracy',
    'balanced_accuracy': make_scorer(balanced_accuracy_score),
    'f1': make_scorer(f1_score)
}

# Define parameter grid for C and max_iter
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'max_iter': [6,10,20,100,500, 1000, 2000],
}

# Initialize the model
SVM = LinearSVC(loss='squared_hinge', dual=False, random_state=42)

# Set up grid search 
grid_search = GridSearchCV(estimator=SVM, param_grid=param_grid, scoring=scoring, refit='accuracy', cv=5, verbose=0, n_jobs=-1)

# Perform grid search
grid_search.fit(X_train, y_train)

# Extract the best parameters and scores
best_C = grid_search.best_params_['C']
best_max_iter = grid_search.best_params_['max_iter']
print(f"Best C: {best_C}")
print(f"Best Number of iterations: {best_max_iter}")

# Results for each scoring metric
print("\nCross-Validation Results for the Best Model:")
print(f"Best Accuracy: {grid_search.cv_results_['mean_test_accuracy'][grid_search.best_index_]:.2f}")
print(f"Best Balanced Accuracy: {grid_search.cv_results_['mean_test_balanced_accuracy'][grid_search.best_index_]:.2f}")
print(f"Best F1 Score: {grid_search.cv_results_['mean_test_f1'][grid_search.best_index_]:.2f}")

Best C: 0.001
Best Number of iterations: 6

Cross-Validation Results for the Best Model:
Best Accuracy: 0.62
Best Balanced Accuracy: 0.62
Best F1 Score: 0.65


# Basic Shallow Neural Network (one-hidden-layer neural network )

You can just use `Keras` to train your Network. Save your forecasts on the test set in `Y_pred['NN']`.

In [5]:
# Indicate we want to use a sequential neural network
model = Sequential()

# Add hidden layer
model.add(Dense(units=2, activation='sigmoid')) 

# Add output layer
model.add(Dense(units=1, activation='tanh'))

# Compile model
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])       

# Fit
model_fit = model.fit(X_train, y_train, epochs=1000, batch_size=400, verbose=0)       

# Save the predictions using test dataset
y_hat = model.predict(X_test) 



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



In [6]:
# Assign -1 for negative predictions, 1 for positive predictions, and randomly choose -1 or 1 if the prediction is exactly 0
Y_pred_NN = np.where(y_hat < 0, -1, np.where(y_hat > 0, 1, np.random.choice([-1, 1]))) 

# Make sure Y_pred_NN is a 1D array of shape (n,)
Y_pred_NN = Y_pred_NN.flatten()

# Save results to the 'NN' column
Y_pred['NN'] = Y_pred_NN; Y_pred['NN']

array([-1,  1, -1, ...,  1,  1,  1])

In [7]:
# Perform grid-search to find the optimal optimizer, activation function, batch size, and number of epochs 
# and evaluate the optimized model

# Convert to NumPy arrays (no need for .values)
X_train = X_train.values
y_train = y_train.values

# Define function to create the Keras model
def create_model(optimizer='rmsprop', activation='tanh'):
    model = Sequential()
    model.add(Dense(units=1, activation=activation))
    model.add(Dense(units=1, activation=activation))
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    return model

# Wrap the Keras model for use in scikit-learn
model = KerasClassifier(build_fn=create_model, epochs=150, batch_size=10, verbose=0)

# Define the hyperparameter grid
param_grid = {
    'optimizer': ['rmsprop', 'adam'],
    'activation': ['tanh', 'relu'],
    'batch_size': [10, 20],
    'epochs': [100, 150]
}

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Define scoring metrics for grid search
scoring = {
    'accuracy': 'accuracy',
    'balanced_accuracy': make_scorer(balanced_accuracy_score),
    'f1': make_scorer(f1_score)
}

# Perform grid search for hyperparameter tuning
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=cv, n_jobs=-1, refit='accuracy')
grid_result = grid_search.fit(X_train, y_train)

# Extract the best model
best_model = grid_search.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", grid_result.best_params_)

# Retrieve and print the best scores for accuracy, balanced accuracy, and F1
best_index = grid_result.best_index_
best_accuracy = grid_result.cv_results_['mean_test_accuracy'][best_index]
best_balanced_accuracy = grid_result.cv_results_['mean_test_balanced_accuracy'][best_index]
best_f1_score = grid_result.cv_results_['mean_test_f1'][best_index]

print("\nCross-Validation Results for the Best Model:")
print(f"Best Accuracy: {best_accuracy:.4f}")
print(f"Best Balanced Accuracy: {best_balanced_accuracy:.4f}")
print(f"Best F1 Score: {best_f1_score:.4f}")

Best Hyperparameters: {'activation': 'tanh', 'batch_size': 20, 'epochs': 150, 'optimizer': 'adam'}

Cross-Validation Results for the Best Model:
Best Accuracy: 0.6207
Best Balanced Accuracy: 0.6181
Best F1 Score: 0.6485


# Basic AdaBoost

Save your forecasts on the test set in `Y_pred['AB']`.

In [8]:
# Changing back values in the training set from 0 to -1
y_train[y_train == 0] = -1 

# Define the number of estimators to start with
n_estimators = 500

# Initialize a model
AdaB = AdaBoostClassifier(n_estimators = n_estimators, learning_rate = 1, random_state = 42)

# Fit the model to training data
AdaB.fit(X_train,y_train.ravel())

AdaBoostClassifier(learning_rate=1, n_estimators=500, random_state=42)

In [9]:
Y_pred['AB']= AdaB.predict(X_test); Y_pred['AB'] # Save results to the 'AB' column

array([-1.,  1., -1., ...,  1., -1., -1.])

In [10]:
# Perform grid-search to find the optimal number of estimators and learning rate, and evaluate the optimized model

# Define X and y
X = X_train
y = y_train

# Define the model
model = AdaBoostClassifier()

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [10, 50, 75, 100, 200, 300, 400, 500],
    'learning_rate': [0.0001, 0.001, 0.01, 0.05, 0.1, 1.0]
}

# Define cross-validation strategy
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Define scoring metrics for grid search
scoring = {
    'accuracy': 'accuracy',
    'balanced_accuracy': make_scorer(balanced_accuracy_score),
    'f1': make_scorer(f1_score)
}

# Perform grid search for hyperparameter tuning
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=cv, n_jobs=-1, refit='accuracy')
grid_result = grid_search.fit(X, y)

# Extract the best model
best_model = grid_result.best_estimator_

# Print the best hyperparameters
print("Best Hyperparameters:", grid_result.best_params_)

# Retrieve and print the best scores for accuracy, balanced accuracy, and F1
best_index = grid_result.best_index_
best_accuracy = grid_result.cv_results_['mean_test_accuracy'][best_index]
best_balanced_accuracy = grid_result.cv_results_['mean_test_balanced_accuracy'][best_index]
best_f1_score = grid_result.cv_results_['mean_test_f1'][best_index]

print("\nCross-Validation Results for the Best Model:")
print(f"Best Accuracy: {best_accuracy:.4f}")
print(f"Best Balanced Accuracy: {best_balanced_accuracy:.4f}")
print(f"Best F1 Score: {best_f1_score:.4f}")

Best Hyperparameters: {'learning_rate': 0.05, 'n_estimators': 300}

Cross-Validation Results for the Best Model:
Best Accuracy: 0.6560
Best Balanced Accuracy: 0.6515
Best F1 Score: 0.6914


# Advanced Classifier

**Choose one** of the above basic classifiers (which seems most promising to you) and optimize as much as possible. You can think of optimizing hyper parameters/regularization using (cross-)validation.

Please  make  sure  all  steps  are  well  motivated  and  presented  in  a  clear and structured way.

Save your forecasts on the test set in `Y_pred['AC']`.

In [11]:
n_estimators=300 # Optimized value

# Initialize a model
AdaB = AdaBoostClassifier(
    n_estimators=n_estimators, 
    learning_rate=0.05, # Optimized value
    random_state=42
)

AdaB.fit(X_train,y_train.ravel())

Y_pred['AC'] = AdaB.predict(X_test); Y_pred['AC'] # Save results to the 'AC' column

array([-1.,  1.,  1., ...,  1.,  1.,  1.])

**Explain** why you have chosen this classifier and how you improved the basic model(s).

Your answer: We evaluated three models — BSVM, NN, and AdaBoost — using accuracy, balanced accuracy, and F1 score as metrics, and employed cross-validation to assess their performance. Our results show that AdaBoost performed the best, with an accuracy of 0.65, balanced accuracy of 0.65, and an F1 score of 0.69. To further improve AdaBoost, we optimized its hyperparameters (number of estimators and learning rate) using grid search with cross-validation. We found that the optimized values were 300 estimators and a learning rate of 0.05. We then trained the Advanced Classifier using these optimized parameters.

# Finally, Save Your Estimates to a CSV File

Save estimates for the all $n_{test}=2000$ observations in the test set:
- Make sure each target estimate takes only the value -1 or 1
- You receive a zero mark on the model(s) for which your estimates contain missing value(s).
- Of course, you can't determine the accuracy
- Save your estimates in the dictionary `Y_pred` and save them into a csv file
- Your mark depends on how well your estimates are compared to those of other groups.
- Make sure to replace `group_nr' in the filename by your group number

In [12]:
Y_pred_pd=pd.DataFrame.from_dict(Y_pred) # please save your estimates in a pandas series 
Y_pred_pd.to_csv('mle_assignment1_group10.csv',index=False) # replace `group_nr' in the filename by your group number

Please upload both the completed Jupyter Notebook file and the csv file containing your estimates to Canvas (using separate links).