## Load Data, Preprocessing

The dataset we have to feed consists of 'Tweets' according to the Kaggle guide,
which means that there should be lots of colloquial expressions, hashtags, links, etc.

Thus, we have to "clean" this data (that is, the tweet text) before we feed it to our model.

In [17]:
import pandas as pd

# Read both competition splits from the local Data/ folder.
train = pd.read_csv('Data/train.csv')
test = pd.read_csv('Data/test.csv')

# Remember how many rows belong to train so the stacked frame can be cut apart again.
train_len = train.shape[0]

# Stack train on top of test so preprocessing and tokenizing run once over both.
# ignore_index is left at its default, so each frame keeps its own row labels.
all_data = pd.concat((train, test))

In [18]:
import re
import string

# Cleaning Functions
def remove_tag(text):
    """Strip @-mentions from ``text``.

    Each ``@`` and the run of non-whitespace characters that follows it is
    deleted; the whitespace around the mention is left in place.
    """
    return re.sub(r'@\S+', '', text)

def remove_URL(text):
    """Delete links from ``text``.

    A link is either ``http://`` / ``https://`` or ``www.`` followed by any
    run of non-whitespace characters.
    """
    link_pattern = re.compile(r'https?://\S+|www\.\S+')
    return link_pattern.sub('', text)

def remove_html(text):
    """Remove bracketed spans from ``text``.

    Two kinds of span are deleted together with their delimiters:
    ``<...>`` (HTML-style tags) and ``(...)`` (parenthesised asides).
    A span must contain at least one character to match.
    """
    return re.sub(r'<[^>]+>|\([^)]+\)', '', text)

def remove_punct(text):
    """Delete every character listed in ``string.punctuation`` from ``text``.

    That set is the ASCII punctuation characters, including ``#`` and ``@``,
    so hashtags keep their word but lose the leading ``#``.
    """
    # The third argument of maketrans lists the characters to drop.
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)



In [19]:
# from spellchecker import SpellChecker

# spell = SpellChecker()
# def correct_spellings(text):
#     corrected_text = []
#     misspelled_words = spell.unknown(text.split())
#     for word in text.split():
#         if word in misspelled_words:
#             corrected_text.append(spell.correction(word))
#         else:
#             corrected_text.append(word)
#     return " ".join(corrected_text)

In [20]:
import nltk

# Fetch every NLTK resource this notebook needs, not just the stop words:
# word_tokenize (used in the cleaning pipeline) relies on the Punkt sentence
# tokenizer data, which is a separate download. Older NLTK releases look for
# 'punkt', newer ones for 'punkt_tab', so both are requested. nltk.download
# skips packages that are already up to date.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

from nltk.corpus import stopwords
# Set membership makes the per-token stop-word test in the pipeline cheap.
stop = set(stopwords.words('english'))

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chanmuzi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
# Build the `cleaned` column from the raw tweet text. The steps run in this
# order: drop @-mentions, drop URLs, drop <...>/(...) spans, drop punctuation,
# lowercase, then split each sentence into a list of words.
cleaned_text = all_data['text']
for cleaning_step in (remove_tag, remove_URL, remove_html, remove_punct, str.lower, word_tokenize):
    cleaned_text = cleaned_text.apply(cleaning_step)

# Exclude stop words and glue the remaining tokens back into one sentence.
all_data['cleaned'] = cleaned_text.apply(
    lambda tokens: ' '.join(token for token in tokens if token not in stop)
)

## Dataset, DataLoader

In [26]:
# all_data is train stacked on top of test, so the first `train_len` rows
# are the training tweets and everything after them is the test set.
train_data = all_data.iloc[:train_len]
test_data = all_data.iloc[train_len:]

In [51]:
from torch.utils.data import Dataset
import torch

class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Args:
        df: DataFrame holding the text column and, when ``is_grad`` is true,
            a ``'target'`` column.
        is_grad: When true, each item also carries the row's ``'target'``
            value under the ``'labels'`` key (training/validation data).
            When false, only the encoded inputs are returned (test data).
        tokenizer: Object exposing ``encode_plus`` (a ``BertTokenizer`` in
            this notebook).
        max_length: Length every sequence is padded/truncated to.
            Defaults to 84, the value that used to be hard-coded.
        text_column: Name of the column to tokenize. Defaults to ``'text'``.

    Each item is a dict with ``'input_ids'`` and ``'attention_mask'``
    (batch dimension removed) plus ``'labels'`` of shape ``(1,)`` when
    ``is_grad`` is true.
    """

    def __init__(self, df, is_grad, tokenizer, max_length=84, text_column='text'):
        self.df = df
        self.is_grad = is_grad
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text_column = text_column

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Samplers hand out positions 0..len-1, so look the row up by
        # position. Label-based `.loc` only worked when the frame's index
        # happened to be exactly 0..len-1 without duplicates.
        row = self.df.iloc[idx]

        encoded_dict = self.tokenizer.encode_plus(
            row[self.text_column],
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
            return_attention_mask=True,
        )

        # return_tensors='pt' gives tensors with a leading batch dimension of
        # 1. Drop it in BOTH modes so the default collate function stacks
        # items to (batch, max_length); previously the is_grad=False branch
        # kept the extra dimension and produced (batch, 1, max_length).
        item = {
            'input_ids': encoded_dict['input_ids'].squeeze(0),
            'attention_mask': encoded_dict['attention_mask'].squeeze(0),
        }

        if self.is_grad:
            # NOTE(review): the label keeps whatever dtype the 'target' column
            # yields (the batch preview in this notebook shows floats) and is
            # shaped (1,) - confirm this matches what the loss expects.
            item['labels'] = torch.tensor(row['target']).unsqueeze(dim=0)

        return item

In [52]:
from transformers import BertTokenizer
# Checkpoint whose vocabulary the tokenizer is loaded from.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

# is_grad=True: every item also carries its `target` value as `labels`.
train_dataset = TweetDataset(df=train_data, is_grad=True, tokenizer=tokenizer)

In [57]:
from torch.utils.data import random_split

# Hold out 20% of the labelled tweets for validation.
train_size = int(0.8 * len(train_dataset))
valid_size = len(train_dataset) - train_size

# Seed the split so the same tweets land in train/valid on every run;
# without a generator the partition changes each time the cell executes.
split_generator = torch.Generator().manual_seed(42)

# NOTE: this rebinds `train_dataset` to the 80% subset, so re-running this
# cell on its own splits the subset again. Re-create the full dataset first.
train_dataset,valid_dataset = random_split(
    train_dataset,
    [train_size,valid_size],
    generator=split_generator,
)

print(f'{len(train_dataset)} train samples')
print(f'{len(valid_dataset)} valid samples')

6090 train samples
1523 valid samples


In [60]:
from torch.utils.data import DataLoader

# Options shared by both loaders.
_loader_options = dict(batch_size=32, pin_memory=True)

# Only the training loader reshuffles; validation keeps a fixed order.
train_dataloader = DataLoader(train_dataset, shuffle=True, **_loader_options)
valid_dataloader = DataLoader(valid_dataset, shuffle=False, **_loader_options)

In [69]:
# Peek at a single batch to sanity-check keys and tensor shapes.
# Iterate the loader through the public iteration protocol instead of the
# private `_get_iterator()` helper, which is an implementation detail.
for i in train_dataloader:
    print(i)
    break

{'input_ids': tensor([[  101,  2067,  1999,  ...,     0,     0,     0],
        [  101,  4826,  2003,  ...,     0,     0,     0],
        [  101,  1996,  2539,  ...,     0,     0,     0],
        ...,
        [  101,  1031, 10651,  ...,     0,     0,     0],
        [  101,  8299,  1024,  ...,     0,     0,     0],
        [  101,  2892,  1011,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [0.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [0.],
        [0.],
        [0.],
        [1.],
        [1.],
        [0.],
        [0.]

In [71]:
# Keep the batch left in `i` by the preview loop available under the name `temp`.
temp = i

## Model

In [None]:
# Settings for the model section, gathered in one mapping.
# How they are consumed is not visible in this part of the notebook.
configs = dict(
    model_name='bert-base-uncased',  # pretrained checkpoint identifier
    num_labels=2,
    batch_size=32,
    epochs=10,
)