In [26]:
import sys
sys.path.insert(0, '../')
%load_ext autoreload
%autoreload 2

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/leonardogavaudan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/leonardogavaudan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/leonardogavaudan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
! ls ../models/

model_emb_binary.joblib model_emb_binary.sav    model_w2v_binary.joblib


# Train Model Locally (without removing)

In [28]:
from green_mood_tracker.data import get_data, clean
from green_mood_tracker.roberta_trainer import RobertaTrainer
from termcolor import colored

In [29]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Get and clean data
EXPERIMENT = "[GB] [London] [green_mood_tracker] RoBERTa"

params = dict(nrows=100,
              upload=False,
              local=False,
              rm=False,
              mlflow=True,  # set to True to log params to mlflow
              experiment_name=EXPERIMENT
              )

print("############   Loading Data   ############")
df = get_data(**params)
df = clean(df, 'text')
y_train = df.polarity
X_train = df.text
del df
print("shape: {}".format(X_train.shape))
print("size: {} Mb".format(X_train.memory_usage() / 1e6))
# Train and save model, locally and
t = RobertaTrainer(X=X_train, y=y_train, **params)
del X_train, y_train
print(colored("############  Training model   ############", "red"))
t.train()
print(colored("############  Evaluating model ############", "blue"))
t.evaluate()
print(colored("############   Saving model    ############", "green"))
t.save_model(**params)

############   Loading Data   ############
shape: (100,)
size: 0.00088 Mb
[31m############  Training model   ############[0m


Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaForSequenceClassification: ['lm_head']
- This IS expected if you are initializing TFRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


train 123.22
[34m############  Evaluating model ############[0m
[34maccuracy train: 0.7346938848495483 || accuracy test: 0.746835470199585[0m
[32m############   Saving model    ############[0m
[32mroBERTa.tf saved locally[0m


# Train Model on GCP

# Upload the Model (without removing)

In [51]:
from green_mood_tracker.gcp import storage_upload_models

In [52]:
storage_upload_models(bucket_name='green-mood-tracker-01', model_name='RoBERTa',
                      model_version='v0', model_filename='roBERTa.tf', rm=False)

Uploading roBERTa.tf!
[32m=> roBERTa.tf uploaded to bucket green-mood-tracker-01 inside models/RoBERTa/v0[0m


# Download the Model from GCP

In [46]:
from green_mood_tracker.gcp import download_model_files

In [50]:
download_model_files(bucket_name='green-mood-tracker-01', model_name='RoBERTa',
                     model_version='test', model_filename='roBERTa.tf')

[32m=> roBERTa.tf downloaded from storage[0m


# Load the model

In [41]:
from green_mood_tracker.gcp import load_model
from predict import generate_prediction, get_test_data

In [42]:
model = load_model(model_name='RoBERTa', model_filename='roBERTa.tf', rm=False)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at models/roBERTa.tf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


[32m=> loaded model roBERTa.tf[0m


In [38]:
data = get_test_data()

In [40]:
generate_prediction(data, download_files=False)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

All the layers of TFRobertaForSequenceClassification were initialized from the model checkpoint at models/roBERTa.tf.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


[32m=> loaded model roBERTa.tf[0m


array([0.28979734, 0.30191797, 0.29278377, 0.285399  , 0.29739454,
       0.29063857, 0.29380593, 0.2921131 , 0.28735808, 0.2986038 ,
       0.28978133, 0.29879144, 0.28313825, 0.2995036 , 0.30711553,
       0.30123916, 0.30399767, 0.3074397 , 0.29629147, 0.2955789 ,
       0.31266344, 0.29394478, 0.2926669 , 0.2871198 , 0.3176377 ,
       0.29562467, 0.3011247 , 0.30538484, 0.29414368, 0.27889526,
       0.28754598, 0.30617523, 0.31275502, 0.30208433, 0.311403  ,
       0.30160436, 0.30092445, 0.3151674 , 0.29466775, 0.29442784,
       0.30799472, 0.2906482 , 0.289969  , 0.30760434, 0.30768386,
       0.29333088, 0.2889149 , 0.3040571 , 0.29423466, 0.29803804,
       0.28599605, 0.28270513, 0.2977698 , 0.29553753, 0.30228886,
       0.28845158, 0.3013547 , 0.28848734, 0.2914635 , 0.29367706,
       0.29724872, 0.3052055 , 0.28994074, 0.27883455, 0.2863143 ,
       0.2826571 , 0.305679  , 0.27918622, 0.3002725 , 0.2986722 ,
       0.28175053, 0.29999682, 0.30123734, 0.29506105, 0.29157