In [1]:
# Attach Google Drive to the Colab VM so the project files stored there are
# reachable from the notebook's file system.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
%cd drive/MyDrive/graphite/keyword_category_prediction_api

/content/drive/MyDrive/graphite/keyword_category_prediction_api


In [3]:
# Data loading and preprocessing
from modeling.baseline_models.preprocessing import get_and_preprocess_data
from modeling.baseline_models.training_and_testing import train_models, test_models, set_model_and_vectorizer_params, save_trained_models, load_trained_models, build_dummy_dict
# Vectorizers
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Models
from sklearn.linear_model import LogisticRegression
from modeling.baseline_models.training_and_testing import load_hyperparams
# Saving and loading models
import joblib
# Other
import time
import json
import warnings
# NOTE(review): with no category/module argument this hides *every* warning
# for the rest of the session, which would also hide any solver or
# deprecation warnings raised during training — consider narrowing the filter
# to the specific categories that are known to be noise.
warnings.filterwarnings("ignore")

# Train

In [4]:
# Read the Logistic Regression hyperparameter profiles from disk.
# The path is relative, so it is resolved against the current working
# directory. The encoding is pinned to UTF-8 (the JSON interchange encoding)
# instead of relying on the platform's locale default.
with open("training_and_testing/logistic_regression/hyperparameters.json", encoding="utf-8") as json_file:
    lr_params = json.load(json_file)

## Model definition

In [5]:
# Display name of the model; later cells use it as the key into
# models_and_params and pass it as model_name to the training/testing helpers.
model_name = 'Logistic Regression'

# Pull the model and vectorizer settings out of the DEFAULT profile of the
# loaded hyperparameter file.
default_profile = lr_params['DEFAULT']
lr_params_dict = default_profile['MODEL']
lr_vectorizer_params_dict = default_profile['VECTORIZER']['PARAMS']

## Data fetching and preprocessing

In [6]:
# Request both the train and the test data in one call.
# NOTE(review): sampling=0.05 is forwarded as-is; presumably a 5% subsample of
# the data — confirm against modeling.baseline_models.preprocessing.
pd_train_dict, pd_test_dict, label_columns = get_and_preprocess_data(
    train=True,
    test=True,
    sampling=0.05,
)

In [7]:
# Start the per-model registry: a dict keyed by model name whose single entry
# initially holds the training data returned by get_and_preprocess_data.
models_and_params = dict()
models_and_params[model_name] = pd_train_dict

In [8]:
# Build the hyperparameter set from the DEFAULT model/vectorizer settings and
# the label columns; default=True is passed through to load_hyperparams.
hyperparams = load_hyperparams(
    model_params_dict=lr_params_dict,
    vectorizer_params_dict=lr_vectorizer_params_dict,
    label_columns=label_columns,
    default=True,
)

## Set the model, its hyperparameters, the vectorizer, and the vectorizer parameters

In [9]:
# Resolve the vectorizer class named in the hyperparameter file through an
# explicit lookup table rather than eval(): eval() would execute whatever
# expression the JSON file contains, whereas the table only ever yields one of
# the vectorizer classes imported by this notebook.
_VECTORIZERS_BY_NAME = {
    'CountVectorizer': CountVectorizer,
    'TfidfVectorizer': TfidfVectorizer,
}
vectorizer_name = lr_params['DEFAULT']['VECTORIZER']['NAME']
if vectorizer_name not in _VECTORIZERS_BY_NAME:
    raise ValueError(
        "Unsupported vectorizer %r in hyperparameters file; expected one of %s"
        % (vectorizer_name, sorted(_VECTORIZERS_BY_NAME))
    )

# Attach the model class, the hyperparameters and the vectorizer class to the
# registry entry for this model.
models_and_params = set_model_and_vectorizer_params(
    hyperparams=hyperparams,
    models_and_params=models_and_params,
    label_columns=label_columns,
    model=LogisticRegression,
    model_name=model_name,
    vectorizer=_VECTORIZERS_BY_NAME[vectorizer_name],
)

## Train all categories

In [10]:
# Record the wall-clock start so the elapsed training time can be reported
# from start_time afterwards.
start_time = time.time()

# Train the model for every category; progress is printed per category.
models_and_params = train_models(
    model_name=model_name,
    models_and_params=models_and_params,
)

Model: Logistic Regression, Category: Health - Training done
Model: Logistic Regression, Category: Vehicles - Training done
Model: Logistic Regression, Category: Hobbies & Leisure - Training done
Model: Logistic Regression, Category: Food & Groceries - Training done
Model: Logistic Regression, Category: Retailers & General Merchandise - Training done
Model: Logistic Regression, Category: Arts & Entertainment - Training done
Model: Logistic Regression, Category: Jobs & Education - Training done
Model: Logistic Regression, Category: Law & Government - Training done
Model: Logistic Regression, Category: Home & Garden - Training done
Model: Logistic Regression, Category: Finance - Training done
Model: Logistic Regression, Category: Computers & Consumer Electronics - Training done
Model: Logistic Regression, Category: Internet & Telecom - Training done
Model: Logistic Regression, Category: Sports & Fitness - Training done
Model: Logistic Regression, Category: Dining & Nightlife - Training d

In [11]:
# Wall-clock seconds elapsed since start_time was recorded.
training_elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % training_elapsed_seconds, flush=True)

--- 35.613802671432495 seconds ---


## Save models

In [12]:
# Persist the trained model and its vectorizer for every category.
save_trained_models(
    label_columns=label_columns,
    models_and_params=models_and_params,
)

Model 'logistic_regression' for category 'Health' saved
Vectorizer for model 'logistic_regression' for category 'Health' saved
Model 'logistic_regression' for category 'Vehicles' saved
Vectorizer for model 'logistic_regression' for category 'Vehicles' saved
Model 'logistic_regression' for category 'Hobbies & Leisure' saved
Vectorizer for model 'logistic_regression' for category 'Hobbies & Leisure' saved
Model 'logistic_regression' for category 'Food & Groceries' saved
Vectorizer for model 'logistic_regression' for category 'Food & Groceries' saved
Model 'logistic_regression' for category 'Retailers & General Merchandise' saved
Vectorizer for model 'logistic_regression' for category 'Retailers & General Merchandise' saved
Model 'logistic_regression' for category 'Arts & Entertainment' saved
Vectorizer for model 'logistic_regression' for category 'Arts & Entertainment' saved
Model 'logistic_regression' for category 'Jobs & Education' saved
Vectorizer for model 'logistic_regression' for c

## Test all categories

In [15]:
# Restart the wall-clock timer for the evaluation run.
start_time = time.time()

# Evaluate every category on the test data. Two result tables come back: the
# first holds the per-category average precision; the second is, judging by
# its name, the per-category AUC-ROC — confirm against test_models.
pd_avg_precision_results_lr, pd_auc_roc_results_lr = test_models(
    pd_data=pd_test_dict,
    models_and_params=models_and_params,
    model_name=model_name,
    label_columns=label_columns,
)

Model: Logistic Regression, Category: Health - Testing done
Model: Logistic Regression, Category: Vehicles - Testing done
Model: Logistic Regression, Category: Hobbies & Leisure - Testing done
Model: Logistic Regression, Category: Food & Groceries - Testing done
Model: Logistic Regression, Category: Retailers & General Merchandise - Testing done
Model: Logistic Regression, Category: Arts & Entertainment - Testing done
Model: Logistic Regression, Category: Jobs & Education - Testing done
Model: Logistic Regression, Category: Law & Government - Testing done
Model: Logistic Regression, Category: Home & Garden - Testing done
Model: Logistic Regression, Category: Finance - Testing done
Model: Logistic Regression, Category: Computers & Consumer Electronics - Testing done
Model: Logistic Regression, Category: Internet & Telecom - Testing done
Model: Logistic Regression, Category: Sports & Fitness - Testing done
Model: Logistic Regression, Category: Dining & Nightlife - Testing done
Model: Log

In [16]:
# Wall-clock seconds elapsed since start_time was recorded.
testing_elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % testing_elapsed_seconds, flush=True)

--- 3.480882406234741 seconds ---


## Results

In [19]:
# Column-wise mean, i.e. the average precision averaged over all categories
# for each model column.
pd_avg_precision_results_lr.mean(axis=0)

Logistic Regression    0.182386
dtype: float64

In [20]:
# Display the full per-category average-precision table (one row per category,
# one column per model).
pd_avg_precision_results_lr

Unnamed: 0,Logistic Regression
Average precision - Health,0.137936
Average precision - Vehicles,0.267105
Average precision - Hobbies & Leisure,0.165182
Average precision - Food & Groceries,0.143038
Average precision - Retailers & General Merchandise,0.076143
Average precision - Arts & Entertainment,0.287756
Average precision - Jobs & Education,0.241253
Average precision - Law & Government,0.076274
Average precision - Home & Garden,0.15056
Average precision - Finance,0.146048
