# Baseline experiments

In [2]:

import logging
import os
import pickle

import joblib 
import pandas as pd
import wandb
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import hamming_loss, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.multioutput import ClassifierChain
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer, normalize

from config.global_args import GlobalArgs
from src.preprocessing.get_preprocessed_data import get_preprocessed_data

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [3]:
# Get data: run the preprocessing step, then load the train/test splits it produced.

# Experiment config:
model_type = 'logistic_regression'

# NOTE(review): overwrite_data=True regenerates the preprocessed data on every run
# of this cell; switch to False to reuse existing files if the helper supports it.
get_preprocessed_data(model_type, overwrite_data = True)

# Load the splits explicitly so that X_train / X_test / Y_train / Y_test exist after
# a fresh "Restart Kernel -> Run All" (the call above does not bind them here).
# The path is relative to the repository root instead of a user-specific absolute path.
# NOTE(review): the file name is taken from the previously commented-out loader and
# assumes get_preprocessed_data writes this pickle - confirm against the helper.
with open('data/preprocessed_for_logistic_regression/preprocessed_data_en_all_levels.pkl', 'rb') as file:
    dataset_dict = pickle.load(file)
X_train, X_test, Y_train, Y_test = dataset_dict["X_train"], dataset_dict["X_test"], dataset_dict["Y_train"], dataset_dict["Y_test"]

Data already exists but will be overwritten.


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/evaengel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Binarizer:data/preprocessed_for_logistic_regression/binarizer_all_levels.joblib
Saving new binarizer to data/preprocessed_for_logistic_regression/binarizer_all_levels.joblib...
305 classes were encoded by MultiLabelBinarizer.
  (0, 715271)	2
  (0, 1335824)	3
  (0, 555680)	4
  (0, 954463)	2
  (0, 322469)	1
  (0, 451989)	2
  (0, 667307)	2
  (0, 1063442)	2
  (0, 835369)	1
  (0, 276706)	1
  (0, 1172485)	1
  (0, 629683)	1
  (0, 371972)	3
  (0, 1462824)	3
  (0, 880396)	3
  (0, 1578491)	3
  (0, 1464807)	1
  (0, 44805)	1
  (0, 1148524)	7
  (0, 386176)	3
  (0, 730676)	2
  (0, 1581964)	2
  (0, 906389)	2
  (0, 1088993)	4
  (0, 884774)	1
  :	:
  (29396, 198386)	1
  (29396, 148197)	1
  (29396, 341847)	1
  (29396, 993228)	1
  (29396, 371360)	1
  (29396, 1081850)	1
  (29396, 128891)	1
  (29396, 148785)	1
  (29396, 1605853)	1
  (29396, 1375090)	1
  (29396, 457862)	3
  (29396, 148188)	1
  (29396, 105849)	3
  (29396, 148847)	1
  (29396, 311480)	1
  (29396, 1538960)	1
  (29396, 1149634)	1
  (29396, 46241

In [17]:
# Sanity check: dimensions of the training feature matrix and the training label matrix.
print(*(split.shape for split in (X_train, Y_train)))

(29397, 1637432) (29397, 305)


# Fit Model with fixed params

In [None]:
# Base estimator for the fixed-parameter baseline: logistic regression fitted
# with the 'sag' solver, seeded from the project-wide random seed.
base_lr = LogisticRegression(
    solver='sag',
    random_state=GlobalArgs.random_seed,
    max_iter=1000,
)
# Multi-label wrapper: a chain of classifiers built from the base estimator.
cc_clf = ClassifierChain(base_lr)

In [None]:
# Train the fixed-parameter classifier chain on the training split
# (no hyperparameter tuning happens in this cell).
cc_clf.fit(X_train, Y_train)

# Hyperparameter tuning

Candidate search strategies:

- RandomizedSearchCV
- GridSearchCV


In [20]:
# Define logistic regression model.
# 'saga' is used instead of 'sag' because the grid below searches both 'l1' and
# 'l2' penalties: 'sag' does not support 'l1', so every 'l1' candidate failed to
# fit, whereas 'saga' supports both.
# NOTE(review): max_iter=10 is very low and will likely stop before convergence;
# presumably chosen to keep the search fast - confirm before trusting the scores.
base_lr = LogisticRegression(solver='saga', random_state = GlobalArgs.random_seed, max_iter=10)
# Define ClassifierChain for multi-label task 
clf = ClassifierChain(base_lr)

# Wrap the classifier in a pipeline so the search parameters are addressed as
# 'clf__...' and further steps can be added later without renaming them.
pipeline = Pipeline([
    ('clf', clf)
])

# Hyperparameter grid for tuning (2 x 2 = 4 candidates).
# A larger grid that was defined here and immediately overwritten has been removed.
param_grid = {
    'clf__base_estimator__C': [0.1, 1],
    'clf__base_estimator__penalty': ['l1', 'l2']
}

# Set up grid search with 5-fold cross-validation, scored by micro-averaged F1
# and parallelised over all available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='f1_micro')


In [21]:
# Run the cross-validated grid search on the training split.
grid_search.fit(X_train, Y_train)



In [1]:
# Save the best model locally
best_model = grid_search.best_estimator_
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs('outputs/logistic_regression', exist_ok=True)
joblib.dump(best_model, 'outputs/logistic_regression/best_model.joblib')

NameError: name 'grid_search' is not defined

In [None]:
# Set up wandb: start a run in the 'multi_label_classification' project.
# The artifact and metric logging in the cells below need this run to be active.
wandb.init(project='multi_label_classification')

In [None]:
# Save the best model to wandb as a model artifact.
# The artifact is kept in a local variable rather than in wandb.run.summary,
# which is meant for summary metric values.
best_model_artifact = wandb.Artifact('best_model', type='model')
# Attach the file at the location where the model was written by joblib.dump
# ('best_model.joblib' in the working directory is never created).
best_model_artifact.add_file('outputs/logistic_regression/best_model.joblib')
wandb.log_artifact(best_model_artifact)

In [None]:
# Predict test-set labels with the tuned model selected by the grid search.
# NOTE(review): the variable name is misspelled ("predicitons"); it is kept because
# the evaluation cells read this exact name. The value is replaced when the later
# `predicitons = cc_clf.predict(X_test)` cell runs.
predicitons = best_model.predict(X_test)

# Evaluation

In [None]:
# Predict test-set labels with the fixed-parameter classifier chain.
# NOTE(review): this rebinds `predicitons`, discarding the predictions of `best_model`
# made earlier, so the metrics computed below describe `cc_clf` - confirm that is intended.
predicitons = cc_clf.predict(X_test)

In [None]:
# F1 averaged over labels, each label weighted by its support.
f1_average = metrics.f1_score(Y_test, predicitons, average= 'weighted')
print(f1_average)
# F1 computed per sample and then averaged. An explicit `average` is required:
# the default average='binary' raises ValueError for multilabel targets.
f1 = metrics.f1_score(Y_test, predicitons, average='samples')
print(f1)
# NOTE(review): label ranking average precision is a ranking metric defined on
# per-label scores; here it receives hard 0/1 predictions, so ties dominate the
# ranking. Consider passing predict_proba output instead.
lrap = metrics.label_ranking_average_precision_score(Y_test, predicitons)

print(lrap)

In [None]:
# Multi-label evaluation metrics on the test split.
hamming_loss_score = hamming_loss(Y_test, predicitons)
# Stored under a name that does not shadow the imported `accuracy_score` function;
# rebinding that name made this cell fail with a TypeError when run a second time.
# For multilabel targets this is exact-match (subset) accuracy.
subset_accuracy = accuracy_score(Y_test, predicitons)
f1_score_micro = f1_score(Y_test, predicitons, average='micro')
f1_score_macro = f1_score(Y_test, predicitons, average='macro')

# Log the evaluation metrics to wandb (metric keys unchanged).
wandb.log({'hamming_loss': hamming_loss_score,
           'accuracy_score': subset_accuracy,
           'f1_score_micro': f1_score_micro,
           'f1_score_macro': f1_score_macro})

# Finish the wandb run. The metrics are logged exactly once: a duplicate cell that
# logged them again after finish() was removed because no run is active at that point.
wandb.finish()