In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import sys
sys.path.append('/content/drive/MyDrive/Colab Notebooks/ml-project2/scripts')

In [3]:
!pip install wordninja




In [4]:
!pip install git+https://github.com/huggingface/transformers.git

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-_6dqkupz
  Running command git clone -q https://github.com/huggingface/transformers.git /tmp/pip-req-build-_6dqkupz
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (PEP 517) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.1.0.dev0-cp36-none-any.whl size=1410989 sha256=000dfcfb33df4b9c857473bcc83967bd36f4d015d9bef4b0b0aec1f0d2b8126f
  Stored in directory: /tmp/pip-ephem-wheel-cache-hn1ec7j2/wheels/33/eb/3b/4bf5dd835e865e472d4fc0754f35ac0edb08fe852e8f21655f
Successfully built transformers


In [5]:
from preprocess import preprocess
from helper import *
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Set all parameters :
SEED = 42
NUM_TRAIN_TWEETS = 100000
BATCH_LEN = 20
TRAIN_EPOCHS = 1
TOKEN_LEN = 60  # the maximum is 512
RATIO_TRAIN = 0.9

In [7]:
# Set all the seeds
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
np.random.seed(SEED)

In [8]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [9]:
path_to_tweets = '/content/drive/MyDrive/Colab Notebooks/ml-project2/data/twitter-datasets/'#'../data/twitter-datasets/'
pos_path = path_to_tweets + 'train_pos_full.txt'
neg_path = path_to_tweets + 'train_neg_full.txt'
test_path = path_to_tweets + 'test_data.txt'

In [10]:
df_train = load_train(pos_path, neg_path, NUM_TRAIN_TWEETS)

## Preprocess the data

In [11]:
# Preprocess train data
out_pre_train = path_to_tweets + 'pre_train.csv'
#df_train = preprocess(df_train, out_pre_train)

In [12]:
# Preprocess test data
#out_pre_test = path_to_tweets + 'pre_test.csv.zip'
#l = preprocess(out_pre_test, out_pre_train)

## Tokenize the data

In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [14]:
def compute_mask(input_id):
    copy = input_id.clone().detach()
    copy[input_id != 0] = 1
    return copy
    
def tokenize(tweets_df, tokenizer, out_csv_path):
    tweets_df['input_ids'] = tweets_df['tweet'].apply(lambda tweet: torch.LongTensor(tokenizer.encode(tweet))[:TOKEN_LEN])
    tweets_df['attention_mask'] = tweets_df.apply(lambda row: compute_mask(row.input_ids), axis=1)
    tweets_df = tweets_df[['label', 'input_ids', 'attention_mask']]
    tweets_df.to_csv(out_csv_path, index=False)
    return tweets_df

In [15]:
out_token_train = path_to_tweets + 'token_train.csv'
df_train = tokenize(df_train, tokenizer, out_token_train)

Token indices sequence length is longer than the specified maximum sequence length for this model (782 > 512). Running this sequence through the model will result in indexing errors


In [16]:
from torch.nn.utils.rnn import pad_sequence

def add_padding(tweets_df):
    tweets_df['input_ids'] = pad_sequence(tweets_df.input_ids.tolist(), batch_first=True)
    tweets_df['attention_mask'] = pad_sequence(tweets_df.attention_mask.tolist(), batch_first=True)

    return tweets_df

In [17]:
#df_train = add_padding(df_train)

## Create the model

In [18]:
import gc

def get_batch(tweets_df, batchsize, index):
    start_index = batchsize * index
    batch = tweets_df.iloc[start_index:start_index + batchsize].copy()
    return add_padding(batch)

def remove_batch(batch):
    del batch
    gc.collect()

def get_number_of_batch(tweets_df, batchsize):
    return int(len(tweets_df) / batchsize)

In [19]:
def accuracy(model, tweets_df, batchsize):
    correct_count = 0
    model.eval()
    batch_num = get_number_of_batch(tweets_df, batchsize)
    for i in tqdm(range(batch_num - 1)):
        batch = get_batch(tweets_df, batchsize, i)
        prediction = model(to_device_batch(batch.input_ids), attention_mask=to_device_batch(batch.attention_mask))[0]
        prediction = prediction.argmax(axis=1)
        label = to_device_batch(batch.label)
        
        remove_batch(batch)
        correct_count += (prediction == label).float().mean()
    return correct_count / batch_num

def to_device_batch(df):
    return torch.tensor(df.to_list()).to(device)


def fit_model(model, train, validation, batchsize, epochs, optimizer, scheduler):
    for epoch in tqdm(range(epochs)):
        total_loss = 0
        print('epoch', epoch)
        model.train()
        for i in tqdm(range(get_number_of_batch(train, batchsize) - 1), desc="Transfer progress"):
            optimizer.zero_grad()
            batch = get_batch(train, batchsize, i)
            loss, pred = model(to_device_batch(batch.input_ids), attention_mask=to_device_batch(batch.attention_mask), labels=to_device_batch(batch.label))[:2]
            remove_batch(batch)
            total_loss += loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            scheduler.step()
        print('loss', total_loss)
        print('accuracy', accuracy(model, validation, batchsize))

In [20]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased').to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [21]:
train, validation = train_test_split(df_train, train_size=RATIO_TRAIN, random_state=SEED)
total_steps = len(train) * TRAIN_EPOCHS

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps = total_steps)

fit_model(model, train, validation, BATCH_LEN, TRAIN_EPOCHS, optimizer, scheduler)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

epoch 0


HBox(children=(FloatProgress(value=0.0, description='Transfer progress', max=8999.0, style=ProgressStyle(descr…


loss tensor(2837.0391, device='cuda:0', grad_fn=<AddBackward0>)


HBox(children=(FloatProgress(value=0.0, max=999.0), HTML(value='')))


accuracy tensor(0.8811, device='cuda:0')



## Create submission

In [22]:
def tokenize_test(tweets_df, tokenizer, out_csv_path):
    tweets_df['input_ids'] = tweets_df['tweet'].apply(lambda tweet: torch.LongTensor(tokenizer.encode(tweet))[:100])
    tweets_df['attention_mask'] = tweets_df.apply(lambda row: compute_mask(row.input_ids), axis=1)
    tweets_df = tweets_df[['Id', 'input_ids', 'attention_mask']]
    tweets_df.to_csv(out_csv_path, index=False)
    return tweets_df

In [23]:
out_csv_path = path_to_tweets + 'token_test.csv'

In [24]:
out_pre_test = path_to_tweets + 'pre_test.csv'

df_test = load_test(test_path)
#df_test = preprocess(df_test, out_pre_test)
df_test = tokenize_test(df_test, tokenizer, out_csv_path)
#df_test = add_padding(df_test)

In [25]:
def change_zero(value):
    if value == 0:
        value = -1
    return value

def get_prediction(model, tweets_df, batchsize, out_path):
    predictions = []
    model.eval()
    batch_num = get_number_of_batch(tweets_df, batchsize)
    for i in tqdm(range(batch_num)):
        batch = get_batch(tweets_df, batchsize, i)
        prediction = model(to_device_batch(batch.input_ids), attention_mask=to_device_batch(batch.attention_mask))[0]
        del batch
        prediction = prediction.argmax(axis=1).tolist()
        predictions += prediction

    tweets_df['Prediction'] = predictions
    tweets_df['Prediction'] = tweets_df['Prediction'].apply(lambda prediction: change_zero(prediction))
    tweets_df = tweets_df[['Id', 'Prediction']]
    tweets_df.to_csv(out_path, index=False)
    return tweets_df[['Id', 'Prediction']]

In [26]:
out_path = path_to_tweets + 'sub.txt'

print(df_test.head())
prediction = get_prediction(model, df_test, BATCH_LEN, out_path)

    Id  ...                                     attention_mask
Id      ...                                                   
1    1  ...  [tensor(1), tensor(1), tensor(1), tensor(1), t...
2    2  ...  [tensor(1), tensor(1), tensor(1), tensor(1), t...
3    3  ...  [tensor(1), tensor(1), tensor(1), tensor(1), t...
4    4  ...  [tensor(1), tensor(1), tensor(1), tensor(1), t...
5    5  ...  [tensor(1), tensor(1), tensor(1), tensor(1), t...

[5 rows x 3 columns]


HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))


