In [26]:
%%capture
!pip install scipy==1.6.0

In [27]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [28]:
%cd drive/MyDrive/graphite/keyword_category_prediction_api

[Errno 2] No such file or directory: 'drive/MyDrive/graphite/keyword_category_prediction_api'
/content/drive/MyDrive/graphite/keyword_category_prediction_api


In [29]:
# Data loading and preprocessing
from modeling.baseline_models.preprocessing import get_and_preprocess_data
from modeling.baseline_models.training_and_testing import train_models, set_model_and_vectorizer_params, \
    save_trained_models
# Vectorizers
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# Models
from sklearn.linear_model import LogisticRegression
from modeling.baseline_models.training_and_testing import load_hyperparams
# Other
import time
import json
import warnings

In [30]:
!pip list -v | grep scipy

scipy                         1.6.0          /usr/local/lib/python3.7/dist-packages pip


# Train

In [45]:
model_name = 'Logistic Regression'
hyperparams_path = 'logistic_regression'
default = True
sampling = .05
w = False

In [46]:
if w is False:
    warnings.filterwarnings("ignore")

In [40]:
# Get model and vectorizer hyperparameters
with open(f"training_and_testing/{hyperparams_path}/hyperparameters.json") as json_file:
  model_params = json.load(json_file)

## Model definition

In [41]:
# Model definition
if default:
  print('Using default model hyperparameters', flush=True)
  model_params_dict = model_params['DEFAULT']['MODEL']
  print('Using default vectorizer parameters', flush=True)
  vectorizer_params_dict = model_params['DEFAULT']['VECTORIZER']['PARAMS']
else:  # @TODO: add loading of optimized hyperparameters
  print('Using optimal model hyperparameters', flush=True)
  print('Using optimal vectorizer parameters', flush=True)

Using default model hyperparameters
Using default vectorizer parameters


## Data fetching and preprocessing

In [42]:
# Data fetching and preprocessing, and basic set up
pd_train_dict, pd_test_dict, label_columns = get_and_preprocess_data(train=True, test=True, sampling=sampling)
models_and_params = {model_name: pd_train_dict}
hyperparams = load_hyperparams(model_params_dict=model_params_dict,
                               vectorizer_params_dict=vectorizer_params_dict,
                               label_columns=label_columns,
                               default=default)

## Set model, model hyperparameters, vectorizer and vectorizer parameters. 

In [43]:
# Set model, model hyperparameters, vectorizer and vectorizer parameters
print('Setting up model', flush=True)
models_and_params = set_model_and_vectorizer_params(hyperparams=hyperparams,
                                                    models_and_params=models_and_params,
                                                    label_columns=label_columns,
                                                    model=LogisticRegression,
                                                    model_name=model_name,
                                                    vectorizer=eval(model_params['DEFAULT']['VECTORIZER']['NAME']))


Setting up model


## Train al categories

In [44]:
# Train categories
start_time = time.time()
print('Training model', flush=True)
models_and_params = train_models(models_and_params=models_and_params, model_name=model_name)

Training model
Model: Logistic Regression, Category: Health - Training done
Model: Logistic Regression, Category: Vehicles - Training done
Model: Logistic Regression, Category: Hobbies & Leisure - Training done
Model: Logistic Regression, Category: Food & Groceries - Training done
Model: Logistic Regression, Category: Retailers & General Merchandise - Training done
Model: Logistic Regression, Category: Arts & Entertainment - Training done
Model: Logistic Regression, Category: Jobs & Education - Training done
Model: Logistic Regression, Category: Law & Government - Training done
Model: Logistic Regression, Category: Home & Garden - Training done
Model: Logistic Regression, Category: Finance - Training done
Model: Logistic Regression, Category: Computers & Consumer Electronics - Training done
Model: Logistic Regression, Category: Internet & Telecom - Training done
Model: Logistic Regression, Category: Sports & Fitness - Training done
Model: Logistic Regression, Category: Dining & Nightli

In [11]:
print("The model trained in: %s seconds" % (time.time() - start_time), flush=True)

The model trained in: 35.86489939689636 seconds


## Save models

In [12]:
# Save models
print('Saving model', flush=True)
save_trained_models(models_and_params=models_and_params, label_columns=label_columns)

Saving model
Model 'logistic_regression' for category 'Health' saved
Vectorizer for model 'logistic_regression' for category 'Health' saved
Model 'logistic_regression' for category 'Vehicles' saved
Vectorizer for model 'logistic_regression' for category 'Vehicles' saved
Model 'logistic_regression' for category 'Hobbies & Leisure' saved
Vectorizer for model 'logistic_regression' for category 'Hobbies & Leisure' saved
Model 'logistic_regression' for category 'Food & Groceries' saved
Vectorizer for model 'logistic_regression' for category 'Food & Groceries' saved
Model 'logistic_regression' for category 'Retailers & General Merchandise' saved
Vectorizer for model 'logistic_regression' for category 'Retailers & General Merchandise' saved
Model 'logistic_regression' for category 'Arts & Entertainment' saved
Vectorizer for model 'logistic_regression' for category 'Arts & Entertainment' saved
Model 'logistic_regression' for category 'Jobs & Education' saved
Vectorizer for model 'logistic_regr

## Test all categories

In [13]:
start_time = time.time()
pd_avg_precision_results_lr, pd_auc_roc_results_lr = test_models(pd_data=pd_test_dict, models_and_params=models_and_params, 
                                                           model_name=model_name, label_columns=label_columns)

NameError: ignored

In [14]:
print("--- %s seconds ---" % (time.time() - start_time), flush=True)

--- 0.1678164005279541 seconds ---


## Results

In [19]:
pd_avg_precision_results_lr.mean()

Logistic Regression    0.182386
dtype: float64

In [20]:
pd_avg_precision_results_lr

Unnamed: 0,Logistic Regression
Average precision - Health,0.137936
Average precision - Vehicles,0.267105
Average precision - Hobbies & Leisure,0.165182
Average precision - Food & Groceries,0.143038
Average precision - Retailers & General Merchandise,0.076143
Average precision - Arts & Entertainment,0.287756
Average precision - Jobs & Education,0.241253
Average precision - Law & Government,0.076274
Average precision - Home & Garden,0.15056
Average precision - Finance,0.146048
