In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np


import nltk 
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from tqdm import tqdm

import re

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Read Data

In [2]:
df = pd.read_csv('../input/sentiment140/training.1600000.processed.noemoticon.csv',
                 encoding = 'latin',header=None)
df.columns = ['sentiment', 'id', 'date', 'query', 'user_id', 'text']
df.head()

Unnamed: 0,sentiment,id,date,query,user_id,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
df = df.drop(['id', 'date', 'query', 'user_id'], axis=1)

In [4]:
df.sentiment = df.sentiment.apply(lambda x: 0 if x == 0 else 1)
df.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


## Text preprocessing

Создадим словарь сокращений разговорной речи

In [5]:
abbreviation_dict = {
    'lol': 'laugh out loud',
    'brb': 'be right back',
    'omg': 'oh my god',
    'thx': 'thanks',
    'hbd': 'happy birthday',
    'ily': 'i love you',
    'pls': 'please',
    'ppl': 'people',
    'asap': 'as soon as possible',
    'otw': 'om the way',
    'icymi': 'in case you missed it',
    'tmi': 'too much information',
    'idk': "i don't know",
    'imo': 'in my opinion',
    'imho': 'in my humble opinion',
}

In [6]:
def expand_abbreviations(text):
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in abbreviation_dict.keys()) + r')\b')
    expanded_text = pattern.sub(lambda x: abbreviation_dict[x.group()], text)
    return expanded_text

Напишем функцию полной предобработки текста

In [7]:
from nltk.stem.porter import PorterStemmer

In [8]:
stop_words = set(stopwords.words('english'))
text_cleaning_re = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
stemmer = PorterStemmer()

In [9]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(text_cleaning_re, ' ', text).strip()
    text = expand_abbreviations(text)
    text = " ".join([stemmer.stem(word) for word in text.split() if word not in stop_words])
    return text

In [10]:
df['text_prepared'] = df.text.apply(lambda x: preprocess_text(x))

In [11]:
df.text_prepared[19]

'oh dear drink forgotten tabl drink'

In [12]:
X = np.array(df['text_prepared'])
y = np.array(df['sentiment'])

## Modeling

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y, 
                                                    test_size=0.2)
print("Train Data size:", len(X_train))
print("Test Data size", len(X_test))

Train Data size: 1280000
Test Data size 320000


### LogReg

In [50]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

In [55]:
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_text_vec = vectorizer.transform(X_test)

In [56]:
model_logreg = LogisticRegression(solver='lbfgs', max_iter=1000)
model_logreg.fit(X_train_vec, y_train)

In [59]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [61]:
y_pred = model.predict(X_text_vec)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.77300625
Confusion Matrix:
 [[119832  40168]
 [ 32470 127530]]


### LogReg with TF-IDF

In [63]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [69]:
vectorizer_tfidf = TfidfVectorizer()
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_text_tfidf = vectorizer_tfidf.transform(X_test)

In [70]:
model_tfidf = LogisticRegression(solver='lbfgs', max_iter=1000)
model_tfidf.fit(X_train_tfidf, y_train)

In [71]:
y_pred = model_tfidf.predict(X_text_tfidf)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.774915625
Confusion Matrix:
 [[120882  39118]
 [ 32909 127091]]


### LSTM

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from collections import Counter

In [10]:
class MyTokenizer():
    def __init__(self, max_words):
        self.max_words = max_words
    
    def fit(self, texts):
        words = Counter()
        for text in texts:
            for word in text.split():
                words[word] += 1
        top_words = sorted(words.items(), key=lambda item: item[1], reverse=True)[:self.max_words]
        
        self.vocab = {word[0]: i+1 for i, word in enumerate(top_words)}
        self.vocab['<PAD>'] = 0
        self.vocab['<UNK>'] = self.max_words+1
        self.decode_vocab = {v: k for k, v in self.vocab.items()}

    def token_to_id(self, token):
        if token in self.vocab:
            return self.vocab[token]
        else:
            return self.vocab['<UNK>']
    
    def encode(self, text):
        arr = []
        for word in text.split():
            if word in self.vocab:
                arr.append(self.vocab[word])
            else:
                arr.append(self.vocab['<UNK>'])
        return arr
    
    def decode(self, sequence):
        arr = []
        for token in sequence:
            if token in self.decode_vocab:
                arr.append(self.decode_vocab[token])
            else:
                arr.append('<UNK>')
        return ' '.join(arr)
        

In [16]:
tokenizer = MyTokenizer(10000)

In [17]:
tokenizer.fit(X_train)

In [18]:
tokenizer.decode(tokenizer.encode('oh dear drink forgotten tabl drink'))

'oh dear drink forgotten tabl drink'

In [19]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=25):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoded_text = self.tokenizer.encode(text)
        if len(encoded_text) > self.max_len:
            encoded_text = encoded_text[:self.max_len]
        else:
            encoded_text += [self.tokenizer.token_to_id("<PAD>")] * (self.max_len - len(encoded_text))
        return torch.tensor(encoded_text), torch.tensor(label)

In [20]:
train_dataset = TextDataset(X_train, y_train, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

test_dataset = TextDataset(X_test, y_test, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [20]:
class TextLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, text, text_lengths):
        embedded = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True, enforce_sorted=False)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, batch_first=True)
        hidden = hidden[-1,:,:]
        return self.fc(hidden)

In [26]:
INPUT_DIM = len(tokenizer.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
PAD_IDX = tokenizer.token_to_id("<pad>")

model_lstm = TextLSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, PAD_IDX)

In [32]:
def train_model(model, dataloader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    for batch in tqdm(dataloader):
        texts, labels = batch
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        predictions = model(texts, torch.sum(texts != PAD_IDX, dim=1).cpu())
        loss = criterion(predictions.squeeze(1), labels.float())
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

In [33]:
def evaluate_model(model, dataloader, criterion, device):
    model.eval()
    epoch_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in tqdm(dataloader):
            texts, labels = batch
            texts, labels = texts.to(device), labels.to(device)
            predictions = model(texts, torch.sum(texts != PAD_IDX, dim=1).cpu())
            loss = criterion(predictions.squeeze(1), labels.float())
            epoch_loss += loss.item()
            predicted = torch.round(torch.sigmoid(predictions.squeeze(1)))
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    return epoch_loss / len(dataloader), correct / total

In [29]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [73]:
optimizer = optim.Adam(model_lstm.parameters())
criterion = nn.BCEWithLogitsLoss()
model_lstm = model_lstm.to(device)
criterion = criterion.to(device)

In [74]:
NUM_EPOCHS = 5
for epoch in range(NUM_EPOCHS):
    train_loss = train_model(model_lstm, train_dataloader, optimizer, criterion, device)
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}, Training Loss: {train_loss:.4f}')

    test_loss, test_accuracy = evaluate_model(model_lstm, test_dataloader, criterion, device)
    print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

100%|██████████| 80000/80000 [07:19<00:00, 181.94it/s]


Epoch 1/5, Training Loss: 0.4916


100%|██████████| 20000/20000 [00:49<00:00, 403.34it/s]


Test Loss: 0.4541, Test Accuracy: 0.7840


100%|██████████| 80000/80000 [07:18<00:00, 182.42it/s]


Epoch 2/5, Training Loss: 0.4428


100%|██████████| 20000/20000 [00:49<00:00, 405.34it/s]


Test Loss: 0.4494, Test Accuracy: 0.7866


100%|██████████| 80000/80000 [07:19<00:00, 182.15it/s]


Epoch 3/5, Training Loss: 0.4311


100%|██████████| 20000/20000 [00:49<00:00, 405.92it/s]


Test Loss: 0.4509, Test Accuracy: 0.7880


100%|██████████| 80000/80000 [07:19<00:00, 182.08it/s]


Epoch 4/5, Training Loss: 0.4255


100%|██████████| 20000/20000 [00:49<00:00, 402.32it/s]


Test Loss: 0.4494, Test Accuracy: 0.7872


100%|██████████| 80000/80000 [07:20<00:00, 181.76it/s]


Epoch 5/5, Training Loss: 0.4215


100%|██████████| 20000/20000 [00:49<00:00, 405.20it/s]

Test Loss: 0.4515, Test Accuracy: 0.7865





## Other Preproc

In [11]:
from nltk.tokenize import word_tokenize

In [12]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(text_cleaning_re, ' ', text).strip()
    text = expand_abbreviations(text)
    text = " ".join([word for word in word_tokenize(text)])
    return text

In [13]:
df['text_prepared'] = df.text.apply(lambda x: preprocess_text(x))

In [14]:
df.text_prepared[19]

'oh dear were you drinking out of the forgotten table drinks'

In [15]:
X = np.array(df['text_prepared'])
y = np.array(df['sentiment'])

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y, 
                                                    test_size=0.2)

In [17]:
tokenizer = MyTokenizer(10000)
tokenizer.fit(X_train)

In [23]:
tokenizer.decode(tokenizer.encode('oh dear were you drinking out of the forgotten table drinks'))

'oh dear were you drinking out of the forgotten table drinks'

In [24]:
train_dataset = TextDataset(X_train, y_train, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

test_dataset = TextDataset(X_test, y_test, tokenizer)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [27]:
model_lstm_2 = TextLSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, PAD_IDX)

In [30]:
optimizer_2 = optim.Adam(model_lstm_2.parameters())
criterion = nn.BCEWithLogitsLoss()
model_lstm_2 = model_lstm_2.to(device)
criterion = criterion.to(device)

In [36]:
NUM_EPOCHS = 5
for epoch in range(NUM_EPOCHS):
    train_loss = train_model(model_lstm_2, train_dataloader, optimizer_2, criterion, device)
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}, Training Loss: {train_loss:.4f}')

    test_loss, test_accuracy = evaluate_model(model_lstm_2, test_dataloader, criterion, device)
    print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

100%|██████████| 80000/80000 [07:59<00:00, 166.91it/s]


Epoch 1/5, Training Loss: 0.4295


100%|██████████| 20000/20000 [00:53<00:00, 371.57it/s]


Test Loss: 0.3938, Test Accuracy: 0.8207


100%|██████████| 80000/80000 [07:59<00:00, 166.88it/s]


Epoch 2/5, Training Loss: 0.3796


100%|██████████| 20000/20000 [00:53<00:00, 377.06it/s]


Test Loss: 0.3864, Test Accuracy: 0.8253


100%|██████████| 80000/80000 [07:57<00:00, 167.44it/s]


Epoch 3/5, Training Loss: 0.3680


100%|██████████| 20000/20000 [00:53<00:00, 376.08it/s]


Test Loss: 0.3886, Test Accuracy: 0.8239


100%|██████████| 80000/80000 [08:02<00:00, 165.95it/s]


Epoch 4/5, Training Loss: 0.3608


100%|██████████| 20000/20000 [00:53<00:00, 374.87it/s]


Test Loss: 0.3851, Test Accuracy: 0.8263


100%|██████████| 80000/80000 [08:00<00:00, 166.56it/s]


Epoch 5/5, Training Loss: 0.3561


100%|██████████| 20000/20000 [00:53<00:00, 376.02it/s]

Test Loss: 0.3848, Test Accuracy: 0.8268





## BERT

In [18]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [19]:
tokenizer('oh dear were you drinking out of the forgotten table drinks')

{'input_ids': [101, 9294, 7059, 1127, 1128, 5464, 1149, 1104, 1103, 6278, 1952, 8898, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [20]:
class BertDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=25):
        super(BertDataset, self).__init__()
        
        self.texts = texts
        self.tokenizer = tokenizer
        self.target = labels
        self.max_length = max_len
        
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, index):
        
        text1 = self.texts[index]
        
        inputs = self.tokenizer.encode_plus(
            text1 ,
            None,
            pad_to_max_length=True,
            add_special_tokens=True,
            return_attention_mask=True,
            max_length=self.max_length,
        )
        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'target': torch.tensor(self.target[index], dtype=torch.long)
            }

bert_dataset_train = BertDataset(X_train, y_train, tokenizer, max_len=50)
bert_dataset_test = BertDataset(X_test, y_test, tokenizer, max_len=50)

bert_dataloader_train = DataLoader(dataset=bert_dataset_train,batch_size=16)
bert_dataloader_test = DataLoader(dataset=bert_dataset_test,batch_size=16)



In [21]:
from transformers import AutoModelForSequenceClassification

In [115]:
class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        self.bert_model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")
#         self.distilbert = bert_model.distilbert
#         self.pre_classifier = bert_model.pre_classifier
#         self.dropout = bert_model.dropout
        self.bert_model.classifier = nn.Linear(768, 1)
        
    def forward(self,ids,mask,token_type_ids):
        out = self.bert_model(ids, attention_mask=mask, return_dict=False)        
        return out
    
model_bert = BERT()
model_bert = model_bert.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [116]:
model_bert

BERT(
  (bert_model): DistilBertForSequenceClassification(
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0-5): 6 x TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (ffn):

In [117]:
for param in model_bert.bert_model.distilbert.parameters():
    param.requires_grad = False

In [110]:
def train_model_bert(model, dataloader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    for batch in tqdm(dataloader):
        ids = batch['ids']
        token_type_ids = batch['token_type_ids']
        mask = batch['mask']
        labels = batch['target']
        ids, token_type_ids, mask, labels = ids.to(device), token_type_ids.to(device), mask.to(device), labels.to(device)
        
        optimizer.zero_grad()
        
        predictions = model(ids=ids,
                            mask=mask,
                            token_type_ids=token_type_ids)
        
        loss = criterion(predictions[0].squeeze(1), labels.float())
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(dataloader)

In [111]:
def evaluate_model_bert(model, dataloader, criterion, device):
    model.eval()
    epoch_loss = 0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in tqdm(dataloader):
            ids = batch['ids']
            token_type_ids = batch['token_type_ids']
            mask = batch['mask']
            labels = batch['target']
            ids, token_type_ids, mask, labels = ids.to(device), token_type_ids.to(device), mask.to(device), labels.to(device)
            predictions = model(ids=ids,
                                mask=mask,
                                token_type_ids=token_type_ids)
            loss = criterion(predictions[0].squeeze(1), labels.float())
            epoch_loss += loss.item()
            predicted = torch.round(torch.sigmoid(predictions[0].squeeze(1)))
            correct += (predicted == labels).sum().item()
            total += labels.size(0)
    return epoch_loss / len(dataloader), correct / total

In [118]:
criterion = nn.BCEWithLogitsLoss()
optimizer_bert= optim.Adam(model_bert.parameters())

In [None]:
NUM_EPOCHS = 3
for epoch in range(NUM_EPOCHS):
    train_loss = train_model_bert(model_bert, bert_dataloader_train, optimizer_bert, criterion, device)
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}, Training Loss: {train_loss:.4f}')

    test_loss, test_accuracy = evaluate_model_bert(model_bert, bert_dataloader_test, criterion, device)
    print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

100%|██████████| 80000/80000 [33:00<00:00, 40.40it/s]


Epoch 1/3, Training Loss: 0.6518


100%|██████████| 20000/20000 [08:04<00:00, 41.24it/s]


Test Loss: 0.6291, Test Accuracy: 0.6419


100%|██████████| 80000/80000 [32:59<00:00, 40.42it/s]


Epoch 2/3, Training Loss: 0.6383


100%|██████████| 20000/20000 [08:04<00:00, 41.24it/s]


Test Loss: 0.6238, Test Accuracy: 0.6409


100%|██████████| 80000/80000 [32:59<00:00, 40.42it/s]


Epoch 3/3, Training Loss: 0.6316


 85%|████████▌ | 17055/20000 [06:53<01:11, 41.04it/s]