# Part 3:

In [5]:
import os
import torch
import numpy as np
import pandas as pd

%load_ext autoreload
%autoreload 2

import warnings
warnings.filterwarnings('ignore')


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Note:<br> The `q3/data` folder is where you put all the data files (train, test, etc.). <br> All trained models are saved in the `q3/models` folder.

In [6]:
# Make sure the `q3/data` and `q3/models` folders exist.
# exist_ok=True keeps the cell safe to re-run: the folder is created only when
# it is missing and nothing is raised when it is already there.
for folder in ('q3/data', 'q3/models'):
    os.makedirs(folder, exist_ok=True)

In [8]:
# Load the raw training tweets (tab-separated, no header row) and name the columns.
train_columns = ['id', 'handle', 'tweet', 'date', 'device']
df = pd.read_csv('q3/data/train_data_for_students.tsv', sep='\t', header=None)
df.columns = train_columns

# Show the frame size, then preview the first rows.
print(f'{df.shape=}')
df.head()


df.shape=(2682, 5)


Unnamed: 0,id,handle,tweet,date,device
0,845974102619906048,realDonaldTrump,Democrats are smiling in D.C. that the Freedom...,2017-03-26 15:21:58,iphone
1,846166053663191040,realDonaldTrump,General Kelly is doing a great job at the bord...,2017-03-27 04:04:42,iphone
2,835814988686233601,realDonaldTrump,"The race for DNC Chairman was, of course, tota...",2017-02-26 13:33:16,android
3,835817351178301440,realDonaldTrump,For first time the failing @nytimes will take ...,2017-02-26 13:42:39,android
4,835916511944523777,realDonaldTrump,"Russia talk is FAKE NEWS put out by the Dems, ...",2017-02-26 20:16:41,android


In [9]:
# Count how many tweets there are for each distinct value of the `device` column.
df.device.value_counts()


android                                                                                1683
iphone                                                                                  755
<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>                      201
<a href="http://www.twitter.com" rel="nofollow">Twitter for BlackBerry</a>               13
<a href="https://about.twitter.com/products/tweetdeck" rel="nofollow">TweetDeck</a>       9
<a href="http://twitter.com/#!/download/ipad" rel="nofollow">Twitter for iPad</a>         4
<a href="http://instagram.com" rel="nofollow">Instagram</a>                               3
<a href="https://periscope.tv" rel="nofollow">Periscope.TV</a>                            2
<a href="http://www.facebook.com/twitter" rel="nofollow">Facebook</a>                     1
Name: device, dtype: int64

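There is an issue with the label column: some values are neither `iphone` nor `android` (they are raw HTML source strings such as "Twitter Web Client"). We keep only the `iphone` / `android` rows.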

In [10]:
# Keep only the tweets whose device is one of the two target classes.
# .copy() makes the result an independent frame, so the columns added to `df`
# in later cells are set on a real DataFrame rather than on a slice of the
# unfiltered one (which is what triggers pandas' SettingWithCopyWarning).
df = df[(df.device == 'iphone') | (df.device == 'android')].copy()
df.device.value_counts()

android    1683
iphone      755
Name: device, dtype: int64

In [11]:
# Add a numeric label column:
#   android -> 1
#   iphone  -> 0
# (any other value also maps to 0, exactly as with a default-0 column)
df['label'] = (df['device'] == 'android').astype('int64')

In [12]:
from q3.data_processing import create_folds

# The classes are imbalanced (1683 android vs. 755 iphone tweets), so the
# folds are stratified (StratifiedKFold) on the device label.
NUMBER_OF_FOLDS = 3
FOLD_SEED = 11

df = create_folds(
    df,
    label_name='device',
    num_folds=NUMBER_OF_FOLDS,
    seed=FOLD_SEED,
)

input df shape : (2438, 6)
Number of folds: 3, total samples (after removing NaN): 2438
fold: 0, num samples: 813
fold: 1, num samples: 813
fold: 2, num samples: 812


In [13]:
from q3.data_processing import preprocess

# Remove URLs from the tweets, since every URL is a Twitter-shortened link.
df['tweet'] = df['tweet'].apply(preprocess)

In [17]:
# Finding HP

from torch.optim import AdamW
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from q3.model import Trainer
from tqdm.auto import tqdm


# Cross-validated grid search over learning rate and weight decay.
# The out-of-fold predictions of every (lr, wd) combination are cached in
# q3/data/predictions.csv, so the expensive search runs only once.
if not os.path.isfile('q3/data/predictions.csv'):
    lr_optios = [5e-5, 5e-4]
    weight_decay_optios = [5e-3, 1e-2, 2e-2]

    # One row per training sample; one prediction column is appended for every
    # (lr, wd) combination. The explicit copy makes `preds` an independent
    # frame, so adding columns never goes through a slice of `df`.
    preds = df[['id', 'label', 'fold']].copy()

    total = len(lr_optios)*len(weight_decay_optios)*NUMBER_OF_FOLDS
    with tqdm(total=total, desc = "CV steps") as pbar:
        for lr in lr_optios:
            for wd in weight_decay_optios:
                # Name of this combination. It is used as the prediction column
                # name and is parsed back into lr / wd when the best one is picked.
                # Defined before the fold loop because it is also needed after it.
                output_name = f'lr-{lr}_wd-{wd}'

                # Reset the predictions before running the folds of this combination.
                df.loc[:, 'y_pred'] = -1
                for fold in range(NUMBER_OF_FOLDS):
                    # Fresh pretrained weights and optimizer for every fold.
                    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
                    model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
                    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=wd)

                    trainer = Trainer(data=df,
                                      tokenizer=tokenizer,
                                      model=model,
                                      optimizer=optimizer,
                                      fold=fold,
                                      early_stopping=2,
                                      batch_size=8,
                                      num_epochs=6,
                                      output_name=output_name)

                    trainer.train()
                    trainer.final_eval()
                    # Carry the predictions over to the next fold.
                    # NOTE(review): presumably final_eval() fills trainer.data['y_pred']
                    # for the rows of the held-out fold -- confirm in q3.model.Trainer.
                    df.loc[:, 'y_pred'] = trainer.data.loc[:, 'y_pred']

                    pbar.update(1)
                # Store the predictions gathered over all folds for this combination.
                preds.loc[:, output_name] = df.loc[:, 'y_pred']

    preds.to_csv('q3/data/predictions.csv', index=False)

else:
    # Re-use the cached predictions from a previous run.
    preds = pd.read_csv('q3/data/predictions.csv')

In [18]:
# Pick the hyper-parameter combination with the highest cross-validated accuracy.
# Every column other than the bookkeeping ones holds the predictions of one
# (lr, wd) combination and is named 'lr-<lr>_wd-<wd>'.
cols = [c for c in preds.columns if c not in ['id','label', 'fold']]
if not cols:
    # Without this guard `best_col` would never be assigned and the parsing
    # below would fail with an unrelated NameError.
    raise ValueError('no prediction columns found in `preds`')

best_acc = 0
for col in cols:
    # Fraction of rows whose prediction equals the true label.
    acc = (preds.label == preds[col]).mean()
    if acc >= best_acc:  # '>=' keeps the last column among ties
        best_acc = acc
        best_col = col

# Recover the numeric values from the column name.
best_lr = float(best_col.split('_wd')[0].split('lr-')[1])
best_wd = float(best_col.split('_wd-')[1])


print(f'best lr: {best_lr} and wd: {best_wd} -> eval_acc: {best_acc:.4f}')


best lr: 5e-05 and wd: 0.01 -> eval_acc: 0.9065


In [19]:
# Build the final model and trainer using the best hyper-parameters found above.
output_name = 'final_train'
checkpoint = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
optimizer = AdamW(model.parameters(), lr=best_lr, weight_decay=best_wd)

# NOTE(review): fold=None together with only_training=True presumably means
# "train on all rows, no held-out fold" -- confirm in q3.model.Trainer.
trainer_kwargs = dict(
    data=df,
    tokenizer=tokenizer,
    model=model,
    optimizer=optimizer,
    fold=None,
    early_stopping=3,
    batch_size=8,
    num_epochs=10,
    output_name=output_name,
    only_training=True,
)
trainer = Trainer(**trainer_kwargs)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias', 'pre_classifier

Downloading and preparing dataset csv/default to C:\Users\itama\.cache\huggingface\datasets\csv\default-5e1d49eb6647d482\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to C:\Users\itama\.cache\huggingface\datasets\csv\default-5e1d49eb6647d482\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2438 [00:00<?, ?ex/s]

In [20]:
# If a final model was already saved, load it; otherwise train with the best
# hyper-parameters and save the result.
model_path = f'q3/models/{output_name}'

if not os.path.isfile(model_path):
    trainer.train()

    # Keep the weights together with the metrics recorded by the trainer.
    saved_state = dict(
        model_state = trainer.model.state_dict(),
        test_loss = trainer.test_loss,
        test_acc = trainer.test_acc,
    )

    torch.save(saved_state, model_path)
else:
    # map_location='cpu' lets a checkpoint that was saved from a GPU run be
    # loaded on a machine without CUDA; load_state_dict then copies the
    # tensors into the model's existing parameters.
    saved_state = torch.load(model_path, map_location='cpu')
    trainer.model.load_state_dict(saved_state['model_state'])

#### Make predictions for the test set

In [21]:
# Load the unlabeled test tweets (tab-separated, no header row) and name the columns.
test_columns = ['id', 'handle', 'tweet', 'date']
dtest = pd.read_csv('q3/data/test_data.tsv', sep='\t', header=None)
dtest.columns = test_columns

# Row count vs. number of distinct ids: the two match only when every id is unique.
n_rows = len(dtest.id)
n_unique_ids = len(np.unique(dtest.id))
n_rows, n_unique_ids

(867, 866)

We can see that one `id` appears twice in the test set (867 rows but only 866 unique ids). <br>
As a workaround, we temporarily give one of the two rows a placeholder id:

In [22]:
# Workaround for the duplicated test id: temporarily give the first of the two
# rows the placeholder id 11111 so that every row has a unique id.
# NOTE: the name `id` (which shadows the builtin) is kept on purpose, because a
# later cell reads it when restoring the original id.
v, c = np.unique(dtest.id, return_counts=True)
duplicated_ids = v[c > 1]

# Only touch the frame when a duplicate actually exists; with no duplicate the
# two-element assignment below would not fit a single matching row.
if len(duplicated_ids) > 0:
    # np.unique returns sorted values, so this is the smallest duplicated id.
    id = duplicated_ids[0]
    # Assumes the id appears exactly twice (867 rows vs. 866 unique ids here).
    dtest.loc[dtest.id == id, 'id'] = [11111, id]

assert len(dtest.id) == len(np.unique(dtest.id)), 'there are still duplicate ids in the dataset'

In [23]:
# Run the trainer's final evaluation on the test frame; the next cell reads
# the predictions back from trainer.test_data['y_pred'].
trainer.final_eval(dtest)

Using custom data configuration default-4acaf8426f67bae1


Downloading and preparing dataset csv/default to C:\Users\itama\.cache\huggingface\datasets\csv\default-4acaf8426f67bae1\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to C:\Users\itama\.cache\huggingface\datasets\csv\default-4acaf8426f67bae1\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/867 [00:00<?, ?ex/s]

eval epoch:   0%|          | 0/109 [00:00<?, ?it/s]

In [24]:
# Attach the predicted labels to the test frame.
dtest.loc[:, 'device'] = trainer.test_data['y_pred']  # take the label from the trainer

# Turn the numeric labels back into device names. A plain column assignment
# replaces the numeric column instead of writing strings into it in place.
dtest['device'] = dtest['device'].map({1: 'android', 0: 'iphone'})

# Restore the duplicated id: the row to fix is the one that was given the
# placeholder id 11111 (the other copy still carries the original id, so
# selecting on `dtest.id == id` would change nothing).
dtest.loc[dtest.id == 11111, 'id'] = id


In [25]:
# Save the submission file, after checking that it has the same shape as the
# provided dummy submission.
dummy_path = 'q3/data/dummy_submission.tsv'
submission_path = 'q3/data/submission.tsv'

dummy = pd.read_csv(dummy_path, sep='\t', header=None)
assert dummy.shape == dtest.shape
dtest.to_csv(submission_path, sep='\t', header=None, index=False)