# Train models notebook

Notebook with the code needed to train and store models to disc

## 0. Set-up
Imports and basic set-up

In [4]:
# Library functions
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
#from xgboost import XGBClassifier
#from catboost import CatBoostClassifier
import multiprocessing
import os
import joblib
from tqdm import tqdm

# Our functions
from utils import TextPreprocessor, FeatureGenerator, remove_nan_questions, get_param_grid
# patch_sklearn()  # to speed up scikit-learn

[nltk_data] Downloading package punkt to /home/gcastro/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
_path_folder_quora = "~/Datasets/QuoraQuestionPairs"
MODELS_DIR = "model_artifacts"
SEED = 123

In [6]:
if not os.path.exists(MODELS_DIR):
    os.makedirs(MODELS_DIR)
    print(f"Folder '{MODELS_DIR}' created successfully.")
else:
    print(f"Folder '{MODELS_DIR}' already exists.")

Folder 'model_artifacts' already exists.


## 1. Load data

We load the data and split it into features and labels. It is necessary to drop the `nan` questions

In [7]:
train_df = pd.read_csv(os.path.join(_path_folder_quora, "quora_train_data.csv"))
x_train = train_df.loc[:, ["question1", "question2"]]
y_train = train_df.loc[:, "is_duplicate"]

x_train, y_train = remove_nan_questions(x_train, y_train)

x_train, x_test, y_train, y_test = train_test_split(
        x_train, y_train, test_size=0.05, random_state=SEED)
x_train, x_validation, y_train, y_validation = train_test_split(
        x_train, y_train, test_size=0.05, random_state=SEED)

Split into train and test

## 2. Define the model

### 2.1 Simple solution

Our simple solution performs basic text preprocessing, feature generation with CountVectorizer and Horizontal stacking followed by a logistic regression model.

In [25]:
pipe = Pipeline(
            [('preprocessor', TextPreprocessor(
                to_lower=True
            )),
             ('generator', FeatureGenerator(exts=('cv', ), aggs=('stack', ), extra_features=tuple())),
             ('classifier', LogisticRegression(max_iter=1000, solver="liblinear",random_state=SEED))],
            verbose=True)

pipe.fit(x_train, y_train)

[Pipeline] ...... (step 1 of 3) Processing preprocessor, total=   0.2s
[Pipeline] ......... (step 2 of 3) Processing generator, total=   5.8s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=  30.0s


In [26]:
joblib.dump(pipe, os.path.join(MODELS_DIR, f"simple_solution.pk1"))

['model_artifacts/simple_solution.pk1']

### 2.2 Improved solution

In the improved solution, we will distinguish two main pathways:
- Manual compilation: explicity indicate the preprocessing, feature generation and classifier steps to fit.
- Grid search: try different combinations of parameters.

Actually, if `GRID_SEARCH=True`, the latter path will be followed, which consists in automatically fitting a bunch of different models. However, as this is a brute force approach, the execution of this second approach would take many hours and we did not have the computing power to try all combinations.

In [17]:
GRID_SEARCH: bool = False  # True

In [18]:
if GRID_SEARCH:
    models = {
            "AdaBoostClassifier": AdaBoostClassifier(),
            "RandomForestClassifier": RandomForestClassifier(),
            "LinearDiscriminantAnalysis": LinearDiscriminantAnalysis(),
            "QuadraticDiscriminantAnalysis": QuadraticDiscriminantAnalysis(),
            "BernoulliNB": BernoulliNB(),
            "GaussianNB": GaussianNB(),
            "KNeighborsClassifier": KNeighborsClassifier(),
            "SVC": SVC(),
            "LogisticRegression": LogisticRegression(max_iter=1000, random_state = SEED),
            "GradientBoostingClassifier": GradientBoostingClassifier()
            #"XGBClassifier": XGBClassifier(n_jobs=multiprocessing.cpu_count() - 1),
            #"CatBoostClassifier": CatBoostClassifier(silent=True),
        }

#### 2.1. Manual compilation (`sklearn.pipeline.Pipeline`)

We will use the architecture we obtained the best results with

In [19]:
if not GRID_SEARCH:
    models = {"LogisticRegression": LogisticRegression(max_iter=1000, random_state = SEED)}

In [20]:
if not GRID_SEARCH:
    pipe = Pipeline(
            [('preprocessor', # TextPreprocessor()
              TextPreprocessor(
                 remove_stop_words = True,
                 remove_punctuation = True,
                 to_lower = True,
                 apply_stemming = True,
                 british = False)
            ),
             ('generator', FeatureGenerator(exts=('cv_2w', 'tf_idf_2w'), aggs=('stack', 'absolute'))),
             ('classifier', models['LogisticRegression'])],
            verbose=True)
    pipe.fit(x_train, y_train)

[Pipeline] ...... (step 1 of 3) Processing preprocessor, total= 2.4min
[Pipeline] ......... (step 2 of 3) Processing generator, total=  40.6s
[Pipeline] ........ (step 3 of 3) Processing classifier, total= 5.4min


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
if not GRID_SEARCH:
    joblib.dump(pipe, f'{MODELS_DIR}/improved_solution.joblib')

#### 2.2. Grid search

For each model, we define a `sklearn.pipeline.Pipeline` to order the transformations and train it using our custom classes

In [22]:
if GRID_SEARCH:
    fitted_models = {}
    scores = {}
    for name, model in tqdm(models.items()):
        # define pipeline given a model
        pipe = Pipeline(
            [('preprocessor', TextPreprocessor()),
             ('generator', FeatureGenerator()),
             ('classifier', model)],
            verbose=True)
        # get grid of parameters to search
        grid = get_param_grid(name, SEED)
        grid_search = GridSearchCV(
            pipe,
            param_grid=grid,
            scoring= "roc_auc",
            cv=2,
            verbose=10,
            n_jobs=multiprocessing.cpu_count() - 1,
            error_score="raise",
        )

        # fit grid search with pipeline and grid
        grid_search.fit(x_train, y_train)

        # save model
        fitted_models[name] = grid_search.best_estimator_
        scores[name] = grid_search.best_score_

        joblib.dump(grid_search, os.path.join(MODELS_DIR, f"fitted_{name}.pk1"))

We save the best model

In [23]:
if GRID_SEARCH:
    best_model_name = max(scores)
    best_model = fitted_models[best_model_name]
    print(f"Best model found in the grid search is {best_model_name}, with a CV score of {scores[best_model_name]:.4f}")
    fitted_pipe = best_model
    joblib.dump(fitted_pipe, f'{MODELS_DIR}/improved_solution.joblib')