In [1]:
import re
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import src.helpers as hlp
from nltk.tokenize import TweetTokenizer
tweet_tokenizer = TweetTokenizer()
from gensim.models import word2vec
from gensim.models import fasttext
import sklearn.decomposition as sk
from sklearn.model_selection import train_test_split
from sklearn import svm
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import torch
import torch.nn as nn
import gensim.downloader as api
from sklearn.metrics import classification_report
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
  "class": algorithms.Blowfish,


In [2]:
VECT_SIZE = 200

In [3]:
t_pos = pd.read_table("data/train_pos.txt", header=None, names=['tweet'], dtype=str,on_bad_lines='skip')
t_pos['label'] = 1
t_neg = pd.read_table("data/train_neg.txt", header=None, names=['tweet'], dtype=str,on_bad_lines='skip')
t_neg['label'] = -1
df = pd.concat((t_pos,t_neg))

In [4]:
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_stopwords(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_punct(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.add_space(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_white_space(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_words_digits(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.to_lower(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_specific_words(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_repeating_char(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_single_char(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.lemmatize(x))
df['tweet'] = df['tweet'].apply(lambda s: tweet_tokenizer.tokenize(s))


In [5]:
X_train, X_test, y_train, y_test = train_test_split (df['tweet'], df['label'] , test_size=0.2)

In [6]:
ft_model = fasttext.FastText(sentences=X_train, vector_size=VECT_SIZE, window=5, min_count=1, workers=4)

In [7]:
def vectorize(model_train, dataset):
    words = set(model_train.wv.index_to_key)
    X_train_vect = np.array([np.array([model_train.wv[i] for i in ls if i in words])for ls in dataset])
    X_train_vect_avg = []
    for v in X_train_vect:
        if v.size:
            X_train_vect_avg.append(np.append(v.mean(axis=0),1))
        else:
            X_train_vect_avg.append(np.zeros(VECT_SIZE+1, dtype=float))
    return X_train_vect_avg


In [8]:
ft_X_train_vect = vectorize(ft_model, X_train)
ft_X_test_vect = vectorize(ft_model, X_test)

  X_train_vect = np.array([np.array([model_train.wv[i] for i in ls if i in words])for ls in dataset])


In [9]:
ft_BNB = BernoulliNB()
ft_BNB.fit(ft_X_train_vect, y_train)
y_pred = ft_BNB.predict(ft_X_test_vect)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       0.67      0.52      0.59     19685
           1       0.61      0.75      0.67     19709

    accuracy                           0.63     39394
   macro avg       0.64      0.63      0.63     39394
weighted avg       0.64      0.63      0.63     39394



In [10]:
ft_LSVC = LinearSVC(verbose=1, max_iter=100)
ft_LSVC.fit(ft_X_train_vect, y_train)
y_pred = ft_LSVC.predict(ft_X_test_vect)
print(classification_report(y_test, y_pred))

[LibLinear]              precision    recall  f1-score   support

          -1       0.76      0.65      0.70     19685
           1       0.70      0.80      0.74     19709

    accuracy                           0.73     39394
   macro avg       0.73      0.73      0.72     39394
weighted avg       0.73      0.73      0.72     39394





In [11]:
LRmodel = LogisticRegression(max_iter = 10000, n_jobs=-1)
LRmodel.fit(ft_X_train_vect, y_train)
y_pred3 = LRmodel.predict(ft_X_test_vect)
print(classification_report(y_test, y_pred3))

In [None]:
class TrainDataset(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
    
    def __getitem__(self, index):
        return self.y_data[index], self.X_data[index] 

    def __len__(self):
        return len(self.X_data)

In [None]:
class TestDataset(Dataset):
    def __init__(self, X_data):
        self.X_data = X_data
    
    def __getitem__(self, index):
        return self.X_data[index]

    def __len__(self):
        return len(self.X_data)

In [None]:
train_data = TrainDataset(torch.FloatTensor(ft_X_train_vect), torch.LongTensor(y_train.values))
test_data = TrainDataset(torch.FloatTensor(ft_X_test_vect), torch.LongTensor(y_test.values))

In [None]:
train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer):
        train_loss, correct = 0,0
        for y_batch, X_batch in tqdm(dataloader):

                pred = model(X_batch)
                loss = loss_fn(pred, y_batch)

                train_loss += loss.item()
                correct += (pred.argmax(1) == y_batch).type(torch.float).sum().item()

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        train_loss /= len(dataloader)
        correct /= len(dataloader.dataset)
        print(f"Train Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {train_loss:>8f} \n")
        return train_loss, correct

In [None]:
def test_loop(dataloader, model, loss_fn):
    test_loss, correct = 0,0
    with torch.no_grad():
        for y, X in dataloader:

            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= len(dataloader)
    correct /= len(dataloader.dataset)
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return test_loss, correct



In [None]:
def train(model, train_loader, valid_loader, num_epochs, learning_rate):
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    losses, train_accs, test_accs = [], [], []
    for epoch in range(num_epochs):
        print(f"Epoch : {epoch+1}\n")
        train_loss, train_acc = train_loop(train_loader, model, loss_fn, optimizer)
        test_loss, test_acc = test_loop(test_loader, model, loss_fn)
        losses.append(float(train_loss))
        train_accs.append(train_acc)
        test_accs.append(test_acc)
    
    plt.title("Training")
    plt.plot(losses, label="Train")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.show()

    plt.title("Training Curve")
    plt.plot(train_accs, label="Train")
    plt.plot(test_accs, label="Validation")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend(loc='best')
    plt.show()



In [None]:
model = nn.Sequential(nn.Linear(VECT_SIZE+1, 100),
                        nn.ReLU(),
                        nn.Dropout(p=0.5),
                        nn.Linear(100, 2),
                        nn.Softmax(1))

In [None]:
train(model, train_loader, test_loader, num_epochs=100, learning_rate=1e-2)

Epoch : 1



100%|██████████| 2463/2463 [00:06<00:00, 405.18it/s]


Train Error: 
 Accuracy: 72.7%, Avg loss: 0.522777 

Test Error: 
 Accuracy: 74.2%, Avg loss: 0.506856 

Epoch : 2



100%|██████████| 2463/2463 [00:06<00:00, 370.55it/s]


Train Error: 
 Accuracy: 73.9%, Avg loss: 0.507226 



KeyboardInterrupt: 