In [19]:
import pandas as pd
import yaml

from sklearn.metrics import classification_report

from bert_demo.dataset import split_train_val
from bert_demo.train import train_model
from bert_demo.pipeline import SentimentPipeline, Predictor

import warnings
import logging

# Keep the notebook output readable by hiding all Python warnings.
warnings.filterwarnings('ignore')

# logging.disable(level) suppresses every record at `level` and below, so a single
# call at WARNING already covers INFO; ERROR and CRITICAL records still get through.
logging.disable(logging.WARNING)

### Read data

In [8]:
# Location of the training CSV read in the next cell.
# NOTE(review): absolute path — presumably a mounted artifacts volume; adjust when
# running the notebook in a different environment.
DATA_PATH = "/artifacts/data/20190110_train_4500.csv"

In [9]:
# Load the headline/sentiment table. The CSV's first column has no header, so pandas
# names it "Unnamed: 0" (visible in the preview further down).
dataset = pd.read_csv(DATA_PATH)

In [10]:
# Sentiment name -> integer class id.
# Entries are listed in ascending id order on purpose: anything that iterates the dict
# (e.g. label_mapping.keys() used as display names for classes 0, 1, 2) then lines up
# with the class ids instead of swapping "Neutral" and "Positive".
label_mapping = {"Negative": 0, "Neutral": 1, "Positive": 2}

In [11]:
# Encode the string sentiment as the integer class id defined in label_mapping.
dataset["label"] = dataset["sentiment"].map(label_mapping)

# Series.map leaves NaN for any value that is not a key of the dict. Fail here, with the
# offending values listed, rather than passing NaN labels on to the later cells.
unmapped_sentiments = dataset.loc[dataset["label"].isna(), "sentiment"].unique()
assert len(unmapped_sentiments) == 0, f"Unmapped sentiment values: {list(unmapped_sentiments)}"

In [12]:
dataset.head()

Unnamed: 0,title,sentiment,label
0,Bitcoin Market Has Run Out of Juice: Cryptocur...,Negative,0
1,Bitcoin Core 0.14.0 Speeds Up Blockchain Synci...,Positive,2
2,Thinking of Travelling With Bitcoin? With Thes...,Positive,2
3,Investors Carried Out Mental Gymnastics to Jus...,Negative,0
4,"Bitcoin Price Holds Above $8,500 as Market Fig...",Positive,2


### Config

In [13]:
# Run configuration, written as inline YAML and parsed with yaml.safe_load in the next cell.
# - `&model_name` is a YAML anchor: the checkpoint name is declared once and reused through
#   `*model_name` by both the tokenizer and the model sections, so the two cannot drift apart.
# - `num_labels: 3` matches the three classes in label_mapping.
# - NOTE(review): the `class` entries are dotted import paths, presumably resolved by the
#   bert_demo package — confirm there.
# - NOTE(review): transformers.AdamW appears to be deprecated in newer transformers releases
#   in favour of torch.optim.AdamW — confirm against the installed version.
cfg_str = """
epochs: 3
train_batch_size: 32
val_batch_size: 64
seed: 42
checkpoint_path: bert_demo/checkpoints/

model_name: &model_name distilbert-base-uncased

tokenizer:
    class: transformers.DistilBertTokenizer
    params:
        pretrained_model_name_or_path: *model_name
        model_max_length: 50
    call_params:
        truncation: True
        padding: True

model:
    class: transformers.DistilBertForSequenceClassification
    params:
        pretrained_model_name_or_path: *model_name
        num_labels: 3

optimizer:
    class: transformers.AdamW
    params:
        lr: 0.000023
        weight_decay: 0.001

scheduler:
    params:
        name: polynomial
        num_warmup_steps: 0
"""

In [14]:
cfg = yaml.safe_load(cfg_str)

### Split data

In [15]:
# Split the table into training and validation parts; test_size=0.2 presumably holds out
# 20% of the rows for validation — confirm in bert_demo.dataset.
# NOTE(review): no seed is passed here although cfg defines `seed: 42` — confirm that
# split_train_val is deterministic, otherwise the split changes between runs.
train_data, val_data, train_labels, val_labels = split_train_val(dataset, test_size=0.2)

### Train model

In [16]:
# output = train_model(
#     cfg, 
#     train_data,
#     train_labels,
#     val_data,
#     val_labels, 
#     return_predictions=True,
#     return_model=True
# )

In [17]:
# output.keys()

In [18]:
# output["scores"]

In [20]:
# print(classification_report(val_labels, output["pred_labels"], target_names=label_mapping.keys()))

### Inference

In [22]:
checkpoint_path = "/artifacts/models/bert.ckpt"

In [25]:
# Restore the pipeline from the saved checkpoint (the training cells above are commented
# out), handing the loader the same cfg that was parsed from cfg_str.
model = SentimentPipeline.load_from_checkpoint(checkpoint_path, cfg=cfg)

In [26]:
predictor = Predictor(model)

In [27]:
# Score a few example headlines with the loaded model. The call is the last expression of
# the cell, so the notebook displays whatever the predictor returns.
# NOTE(review): the output looks like one row per headline with one score per class id
# (0, 1, 2) — confirm against Predictor.
sample_headlines = [
    "btc does not drop by 10%",
    "EU Won't Ban Bitcoin After All",
    "BTC Price Tech Analysis for 08/02/2017  Back to Triangle Support",
]
predictor(sample_headlines)

array([[0.14236583, 0.11000765, 0.74762654],
       [0.1488722 , 0.18346722, 0.66766053],
       [0.01112188, 0.87461185, 0.11426619]], dtype=float32)