In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.linear_model import LogisticRegression
from torchvision import datasets
from torchvision.transforms import ToTensor
import torchtext
import numpy as np
import matplotlib.pyplot as plt
import pickle
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import src.helpers as hlp
from sklearn.metrics import classification_report
import contractions

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\danie\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
VECT_DIM = 200

In [3]:
embeddings = torchtext.vocab.GloVe(name="twitter.27B", dim=VECT_DIM, max_vectors=1000000)

In [4]:
t_pos = pd.read_table("data_full/train_pos_full.txt", header=None, names=['tweet'], dtype=str,on_bad_lines='skip')
t_pos['label'] = 1
t_neg = pd.read_table("data_full/train_neg_full.txt", header=None, names=['tweet'], dtype=str,on_bad_lines='skip')
t_neg['label'] = 0
df = pd.concat((t_pos,t_neg))

In [5]:
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_stopwords(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_punct(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.add_space(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_white_space(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_words_digits(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.to_lower(x))
df['tweet'] = df['tweet'].apply(lambda x: contractions.fix(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_specific_words(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_repeating_char(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.remove_single_char(x))
df['tweet'] = df['tweet'].apply(lambda x: hlp.lemmatize(x))

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
bow=CountVectorizer( min_df=1,max_features=100000)
bow.fit(df['tweet'])
tweets_processed =bow.transform(df['tweet'])

In [7]:
X_cnt_train, X_cnt_test, y_cnt_train, y_cnt_test = train_test_split(tweets_processed, df['label'].values, test_size=0.20, random_state=10)

In [8]:
def vectorize(tweets):
    vectorized_tweets_np = np.ones((len(tweets),VECT_DIM+1))
    not_in_embed = 0
    nb_words = 0
    for i, tweet in enumerate(tqdm(tweets)):
        tweet_len = len(tweet.split(' '))
        tweet_embedding=np.zeros((tweet_len,VECT_DIM))
        for j, word in enumerate(tweet.split(' ')):
            nb_words += 1
            tweet_embedding[j] = embeddings[word]
            if torch.equal(embeddings[word],torch.zeros(VECT_DIM)):
                not_in_embed+=1
        vectorized_tweets_np[i, 1:] = tweet_embedding.sum(0)
    print(not_in_embed/nb_words)
    return vectorized_tweets_np

In [9]:
X = vectorize(df.tweet)
y = df['label'].values

100%|██████████| 2458295/2458295 [10:21<00:00, 3957.67it/s]

0.027958802372555207





In [10]:
X.shape

(2458295, 201)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=10)

In [12]:
from sklearn.naive_bayes import BernoulliNB
BNB = BernoulliNB()
BNB.fit(X_train, y_train)

In [13]:
y_pred3 = BNB.predict(X_test)
print(classification_report(y_test, y_pred3))

              precision    recall  f1-score   support

           0       0.68      0.64      0.66    248255
           1       0.65      0.70      0.68    243404

    accuracy                           0.67    491659
   macro avg       0.67      0.67      0.67    491659
weighted avg       0.67      0.67      0.67    491659



In [14]:
class TrainDataset(Dataset):
    def __init__(self, X_data, y_data):
        self.X_data = X_data
        self.y_data = y_data
    
    def __getitem__(self, index):
        return self.y_data[index], self.X_data[index] 

    def __len__(self):
        return len(self.X_data)

In [15]:
class TestDataset(Dataset):
    def __init__(self, X_data):
        self.X_data = X_data
    
    def __getitem__(self, index):
        return self.X_data[index]

    def __len__(self):
        return len(self.X_data)

In [16]:
y_test = np.where(y_test == -1, 0, y_test)
y_train = np.where(y_train == -1, 0, y_train)

In [17]:
train_data = TrainDataset(torch.FloatTensor(X_train), torch.LongTensor(y_train))
test_data = TrainDataset(torch.FloatTensor(X_test), torch.LongTensor(y_test))

In [18]:
train_loader = DataLoader(dataset=train_data, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=1)

In [19]:
def train_loop(dataloader, model, loss_fn, optimizer):
        train_loss, correct = 0,0
        for y_batch, X_batch in tqdm(dataloader):

                pred = model(X_batch)
                loss = loss_fn(pred, y_batch)
                train_loss += loss.item()
                correct += (pred.argmax(1) == y_batch).type(torch.float).sum().item()

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        train_loss /= len(dataloader)
        correct /= len(dataloader.dataset)
        print(f"Train Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {train_loss:>8f} \n")
        return train_loss, correct

In [20]:
def test_loop(dataloader, model, loss_fn):
    test_loss, correct = 0,0
    with torch.no_grad():
        for y, X in dataloader:

            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= len(dataloader)
    correct /= len(dataloader.dataset)
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
    return test_loss, correct



In [21]:
def train(model, train_loader, valid_loader, num_epochs, learning_rate):
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    losses, train_accs, test_accs = [], [], []
    for epoch in range(num_epochs):
        print(f"Epoch : {epoch+1}\n")
        train_loss, train_acc = train_loop(train_loader, model, loss_fn, optimizer)
        test_loss, test_acc = test_loop(test_loader, model, loss_fn)
        losses.append(float(train_loss))
        train_accs.append(train_acc)
        test_accs.append(test_acc)
    
    plt.title("Training")
    plt.plot(losses, label="Train")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.show()

    plt.title("Training Curve")
    plt.plot(train_accs, label="Train")
    plt.plot(test_accs, label="Validation")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend(loc='best')
    plt.show()



In [22]:
model = nn.Sequential(nn.Linear(VECT_DIM+1, 200),
                        nn.Sigmoid(),
                        nn.Dropout(p=0.5),
                        nn.Linear(200, 2),
                        nn.Softmax(1))

In [23]:
train(model, train_loader, test_loader, num_epochs=10, learning_rate=1e-3)

Epoch : 1



100%|██████████| 30729/30729 [01:42<00:00, 298.55it/s]


Train Error: 
 Accuracy: 78.4%, Avg loss: 0.516160 

Test Error: 
 Accuracy: 79.1%, Avg loss: 0.509745 

Epoch : 2



100%|██████████| 30729/30729 [01:29<00:00, 341.46it/s]


Train Error: 
 Accuracy: 79.5%, Avg loss: 0.506380 

Test Error: 
 Accuracy: 79.6%, Avg loss: 0.505678 

Epoch : 3



100%|██████████| 30729/30729 [01:31<00:00, 336.22it/s]


Train Error: 
 Accuracy: 79.8%, Avg loss: 0.503389 

Test Error: 
 Accuracy: 79.7%, Avg loss: 0.504778 

Epoch : 4



100%|██████████| 30729/30729 [01:30<00:00, 340.75it/s]


Train Error: 
 Accuracy: 80.0%, Avg loss: 0.501560 

Test Error: 
 Accuracy: 79.8%, Avg loss: 0.503646 

Epoch : 5



100%|██████████| 30729/30729 [01:37<00:00, 315.82it/s]


Train Error: 
 Accuracy: 80.2%, Avg loss: 0.500427 

Test Error: 
 Accuracy: 80.0%, Avg loss: 0.502328 

Epoch : 6



100%|██████████| 30729/30729 [01:21<00:00, 378.31it/s]


Train Error: 
 Accuracy: 80.3%, Avg loss: 0.499470 

Test Error: 
 Accuracy: 79.9%, Avg loss: 0.502763 

Epoch : 7



100%|██████████| 30729/30729 [01:18<00:00, 392.70it/s]


Train Error: 
 Accuracy: 80.4%, Avg loss: 0.498740 

Test Error: 
 Accuracy: 80.0%, Avg loss: 0.501754 

Epoch : 8



100%|██████████| 30729/30729 [01:40<00:00, 304.96it/s]


Train Error: 
 Accuracy: 80.4%, Avg loss: 0.498208 

Test Error: 
 Accuracy: 80.0%, Avg loss: 0.501665 

Epoch : 9



100%|██████████| 30729/30729 [01:24<00:00, 362.57it/s]


Train Error: 
 Accuracy: 80.5%, Avg loss: 0.497686 

Test Error: 
 Accuracy: 80.0%, Avg loss: 0.502069 

Epoch : 10



100%|██████████| 30729/30729 [02:04<00:00, 247.55it/s]


Train Error: 
 Accuracy: 80.5%, Avg loss: 0.497409 



KeyboardInterrupt: 

In [24]:
LRmodel = LogisticRegression(max_iter = 10000, n_jobs=-1)
LRmodel.fit(X_train, y_train)
y_pred3 = LRmodel.predict(X_test)
print(classification_report(y_test, y_pred3))

In [None]:
from sklearn.svm import LinearSVC

LSVC = LinearSVC(verbose=1, max_iter=10000)
LSVC.fit(X_train, y_train)
y_pred3 = LSVC.predict(X_test)
print(classification_report(y_test, y_pred3))

[LibLinear]              precision    recall  f1-score   support

           0       0.77      0.75      0.76     19810
           1       0.76      0.77      0.76     19584

    accuracy                           0.76     39394
   macro avg       0.76      0.76      0.76     39394
weighted avg       0.76      0.76      0.76     39394



