In [None]:
# Attach Google Drive to the Colab runtime so files stored there are reachable
# under /content/drive.
from google.colab import drive

drive.mount("/content/drive")

# Disabled: clone of the project repository.
#! git clone https://github.com/eengel7/comparison_NLP_classification_models.git

In [None]:
%cd drive/MyDrive/comparison_NLP_classification_models

In [None]:
! pip install sklearn joblib

In [2]:

import pickle

import joblib
import numpy as np
import scipy.sparse as sp
import wandb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    label_ranking_average_precision_score,
    make_scorer,
)
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import ClassifierChain
from sklearn.pipeline import Pipeline

from src.preprocessing.get_preprocessed_data import get_preprocessed_data

In [3]:
# Load the train / test / validation features and labels prepared for the
# logistic-regression experiments (seed 42). Existing preprocessed data is
# reused rather than overwritten.
(
    X_train, X_test, X_val,
    Y_train, Y_test, Y_val,
) = get_preprocessed_data(
    'logistic_regression',
    overwrite_data=False,
    random_seed=42,
)


preprocessed_data.pkl already exists at data/preprocessed/logistic_regression_en_all_levels_val_42.
Data already exists and will not be overwritten.


In [4]:
# Authenticate with Weights & Biases before opening a run.
wandb.login()

# Experiment identity: the model name and seed determine both the W&B project
# and the run name.
random_seed = 42
model_name = 'logistic_regression'

wandb_project = f'multi-label-{model_name}'
name_run = f'{model_name}_{random_seed}'
wandb_kwargs = {"name": name_run}

# Open the run; later cells log metrics and artifacts through `run`.
run = wandb.init(project=wandb_project, **wandb_kwargs)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33meengel7[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# Define logistic regression model
base_lr = LogisticRegression(solver='liblinear', random_state=random_seed, max_iter=1000)
# Define ClassifierChain for multi-label task
clf = ClassifierChain(base_lr, verbose=True)

# Set up hyperparameter grid for tuning (currently a single candidate)
param_grid = {
    'base_estimator__C': [0.01],
    #'penalty': ['l1', 'l2','elasticnet']
}

# Sample-averaged F1 is the model-selection metric.
scorer = make_scorer(f1_score, average='samples')

# GridSearchCV's `cv` must yield (train_indices, test_indices) pairs that index
# into the X/y handed to `fit` -- it cannot take the validation data itself.
# To score on the dedicated validation split, stack train and validation rows
# and describe the split by row position.
# X is kept sparse (it exposes `.toarray()`, so it is a sparse matrix):
# densifying it is not needed and is very costly in memory.
n_train, n_val = X_train.shape[0], X_val.shape[0]
X_train_val = sp.vstack([X_train, X_val]).tocsr()
# assumes Y_* are 2-D label-indicator matrices (dense or sparse) -- TODO confirm
if sp.issparse(Y_train):
    Y_train_val = sp.vstack([Y_train, Y_val]).tocsr()
else:
    Y_train_val = np.vstack([Y_train, Y_val])
holdout_split = [(np.arange(n_train), np.arange(n_train, n_train + n_val))]

# NOTE: with the default refit=True, `best_estimator_` is refit on every row
# passed to `fit`, i.e. on train + validation.
grid_search = GridSearchCV(clf, param_grid, cv=holdout_split, scoring=scorer)
grid_search.fit(X_train_val, Y_train_val)

: 

: 

In [None]:
# Pull the winning estimator and its hyperparameter setting out of the
# finished grid search.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

In [None]:
# Retrieve the best hyperparameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set.
# For multi-label targets accuracy_score is the exact-match (subset) accuracy:
# a sample counts as correct only if every label is predicted correctly.
y_pred_test = best_model.predict(X_test)
accuracy_test = accuracy_score(Y_test, y_pred_test)
print("Test Accuracy:", accuracy_test)

# Log the validation performance of each grid-search candidate using wandb.
# `mean_test_score` holds the value of the search's `scoring` metric, which is
# the sample-averaged F1 scorer -- not accuracy -- so it is labelled as such.
results = grid_search.cv_results_
for mean_score, params in zip(results["mean_test_score"], results["params"]):
    wandb.log({"Validation F1 (samples)": mean_score, "Params": params})
    print("Validation F1 (samples): {:.4f} with params {}".format(mean_score, params))

# Log the test accuracy using wandb
wandb.log({"Test Accuracy": accuracy_test})

# Do not call wandb.finish() here: the following cell still logs artifacts and
# metrics through `run` and closes the run itself.

In [None]:
# Log to wandb

# Directory holding the preprocessed data and its config for this seed.
data_dir = f"data/preprocessed/logistic_regression_en_all_levels_val_{random_seed}"

# log the data as an artifact
data_artifact = wandb.Artifact("data", type="dataset")
data_artifact.add_file(f"{data_dir}/preprocessed_data.pkl")
run.log_artifact(data_artifact)

# log the data config as an artifact
config_artifact = wandb.Artifact("config", type="config")
config_artifact.add_file(f"{data_dir}/data_args.json")
run.log_artifact(config_artifact)

# Test-set outputs of the tuned model. They are computed here so this cell does
# not depend on a `predictions` variable that is only created in a later cell.
predictions = best_model.predict(X_test)
# LRAP evaluates a *ranking* of the labels, so it needs non-thresholded scores;
# feeding it hard 0/1 predictions collapses the ranking into ties.
label_scores = best_model.predict_proba(X_test)

# log evaluation metrics
label_ranking_score = label_ranking_average_precision_score(Y_test, label_scores)
f1_score_avg = f1_score(Y_test, predictions, average='samples', zero_division=0)
f1_score_macro = f1_score(Y_test, predictions, average='macro', zero_division=0)
f1_score_micro = f1_score(Y_test, predictions, average='micro', zero_division=0)
run.log({"test_LRAP": label_ranking_score, "test_f1_score_avg": f1_score_avg, "test_f1_score_macro": f1_score_macro, "test_f1_score_micro": f1_score_micro})

# finish logging the data logging run
run.finish()

In [None]:
# Define logistic regression model (SAG solver, L2 penalty)
base_lr = LogisticRegression(solver='sag', random_state=random_seed, max_iter=1000, penalty='l2')
# Define ClassifierChain for multi-label task
clf = ClassifierChain(base_lr, verbose=True)

# Set up a pipeline around the classifier. It has a single step; the features
# passed in are used as they are.
pipeline = Pipeline([
    ('clf', clf)
])

# Set up hyperparameter grid for tuning
param_grid = {
    'clf__base_estimator__C': [0.001, 0.01, 0.1, 1, 10],
}

# Set up grid search with 5-fold cross-validation, selecting on micro-averaged F1
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='f1_micro')

# The search itself must be fitted: `best_estimator_` only exists on a fitted
# GridSearchCV, so fitting `clf` alone would make the lines below fail.
# This is expensive: 5 candidates x 5 folds, plus the final refit.
grid_search.fit(X_train, Y_train)

# Save the best model locally
best_model = grid_search.best_estimator_
joblib.dump(best_model, 'best_model.joblib')

# Persist the test-set predictions of the best model
predictions = best_model.predict(X_test)
joblib.dump(predictions, 'predictions.joblib')