In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score

# from imblearn.over_sampling import SMOTE

from gensim import downloader as api
from tqdm import tqdm
from nltk import word_tokenize

import torch
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.sampler import SubsetRandomSampler

# from transformers import AutoModelForSeq2SeqLM
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import pipeline
from util.helpers import evaluate_score

In [2]:
data_path = "data/"
df = pd.read_csv(data_path + "cleaned_Tweets.csv", index_col=0)

In [3]:
vectorizer = TfidfVectorizer(min_df=0.0001, max_df=0.9999, ngram_range=(1,3), max_features=10000)
X = vectorizer.fit_transform(df.text)

scaler = StandardScaler(with_mean=False)
scaler.fit(X)

xTrain, xTest, yTrain, yTest = train_test_split(X, df.airline_sentiment, train_size=0.8, stratify=df.airline_sentiment)

In [4]:
clf = OneVsRestClassifier(Perceptron(n_jobs=-1), n_jobs=-1)
clf.fit(scaler.transform(xTrain), yTrain)

yPred = clf.predict(xTest)

evaluate_score(yTest, yPred)

Accuracy:	 0.7776639344262295
Precision:	 [0.82211302 0.61380597 0.77030812]
Recall: 	 [0.91171662 0.53064516 0.58139535]
F1 scores:	 [0.86459948 0.56920415 0.6626506 ]
Average
	Precision: 0.769635532570646
	Recall: 0.7776639344262295
	F1: 0.7694263528052627


In [5]:
clf = LinearSVC()
clf.fit(xTrain, yTrain)

yPred = clf.predict(xTest)

evaluate_score(yTest, yPred)

Accuracy:	 0.7971311475409836
Precision:	 [0.84631148 0.64663024 0.76580796]
Recall: 	 [0.90027248 0.57258065 0.69133192]
F1 scores:	 [0.87245841 0.60735672 0.72666667]
Average
	Precision: 0.791024409312084
	Recall: 0.7971311475409836
	F1: 0.7927717485366458


In [6]:
wv = api.load('word2vec-google-news-300')

def make_avg_embeds(data):
    
    embeds = []

    for rev in tqdm(data.text):
        vec = []
        tokens = word_tokenize(rev)
        
        for tok in tokens:
            try:
                wordVec = wv[tok]
                vec.append(wordVec.tolist())
            except:
                pass
            
        
        vec = np.asarray(vec)
        vec = vec.mean(axis=0)
        
        embeds.append(pd.Series(vec))

    embeds = pd.concat(embeds, axis=1).transpose()
    embeds = embeds.fillna(0)
    return embeds

In [7]:
embeds = make_avg_embeds(df)
xTrain, xTest, yTrain, yTest = train_test_split(embeds, df.airline_sentiment, train_size=0.8, stratify=df.airline_sentiment)

  vec = vec.mean(axis=0)
  ret = ret.dtype.type(ret / rcount)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 14640/14640 [00:05<00:00, 2569.35it/s]


In [8]:
clf = OneVsRestClassifier(Perceptron(n_jobs=-1), n_jobs=-1)
# clf.fit(scaler.transform(xTrain), yTrain)
clf.fit(xTrain, yTrain)

yPred = clf.predict(xTest)

precisions = precision_score(yTest, yPred, average=None)
recalls = recall_score(yTest, yPred, average=None)
f1s = f1_score(yTest, yPred, average=None)

evaluate_score(yTest, yPred)

Accuracy:	 0.7544398907103825
Precision:	 [0.74595939 0.78680203 0.79874214]
Recall: 	 [0.98092643 0.25       0.53699789]
F1 scores:	 [0.84745763 0.37943696 0.64222503]
Average
	Precision: 0.7631344825282937
	Recall: 0.7544398907103825
	F1: 0.7152008550904367


In [9]:
clf = LinearSVC()
clf.fit(xTrain, yTrain)

yPred = clf.predict(xTest)

evaluate_score(yTest, yPred)

Accuracy:	 0.7899590163934426
Precision:	 [0.8088785  0.69849246 0.77948718]
Recall: 	 [0.94332425 0.4483871  0.64270613]
F1 scores:	 [0.8709434  0.54616896 0.70451912]
Average
	Precision: 0.7807564271193197
	Recall: 0.7899590163934426
	F1: 0.7752880566765624


In [10]:
torch.multiprocessing.set_sharing_strategy("file_system")
from pysentimiento import create_analyzer
analyzer = create_analyzer(task="sentiment", lang="en")

df_raw = pd.read_csv(data_path + "Tweets.csv")
predictions = analyzer.predict(df_raw.text)

  0%|          | 0/458 [00:00<?, ?ba/s]

The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: text. If text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 14640
  Batch size = 32


In [11]:
preds = []

for pred in predictions:
    pred = pred.output
    if pred == "NEU":
        preds.append(0)
    elif pred == "POS":
        preds.append(1)
    elif pred == "NEG":
        preds.append(-1)

yTrue = []

for val in df_raw.airline_sentiment:
    if val == "neutral":
        yTrue.append(0)
    elif val == "positive":
        yTrue.append(1)
    elif val == "negative":
        yTrue.append(-1)

evaluate_score(yTrue, preds)

Accuracy:	 0.7864071038251366
Precision:	 [0.91724813 0.54951034 0.71556351]
Recall: 	 [0.8164088  0.65182317 0.84638172]
F1 scores:	 [0.86389577 0.59630996 0.77549438]
Average
	Precision: 0.806851944470274
	Recall: 0.7864071038251366
	F1: 0.7929845087635168


In [42]:
wordvec = pd.concat([df.airline_sentiment, pd.DataFrame(embeds, columns=range(300))], axis=1)
wordvec.columns = range(301)

In [43]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [44]:
class FNN(nn.Module):
    
    def __init__(self, in_size):
        super(FNN, self).__init__()
        
        h1 = 50
        h2 = 10
        
        self.fc1 = nn.Linear(in_size, h1)
        self.fc2 = nn.Linear(h1, h2)
        self.fc3 = nn.Linear(h2, 3)
        
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        # x = F.dropout(x, p=0.5)
        
        x = F.relu(self.fc2(x))
        # x = F.dropout(x, p=0.2)
        x = self.fc3(x)
        # x = F.softmax(x, dim=1)
        return x

class WordVecData(Dataset):
    
    def __init__(self, data) -> None:
        super(WordVecData, self).__init__()
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        
        feature = self.data[idx, 1:]
        label = int(self.data[idx, 0]) + 1
        
        return feature, label

In [45]:
train, test = train_test_split(wordvec, test_size=0.2, stratify=wordvec.iloc[:,0])

train = torch.tensor(train.to_numpy(), device=device, dtype=torch.float, requires_grad=True)
test = torch.tensor(test.to_numpy(), device=device, dtype=torch.float, requires_grad=False)

num_train = len(train)
indices = list(range(num_train))
np.random.shuffle(indices)
cap = int(np.floor(0.2 * num_train))

train_idx, valid_idx = indices[cap:], indices[:cap]

train_sampler = SubsetRandomSampler(train_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

train_data = WordVecData(train)
test_data = WordVecData(test)

train_loader = DataLoader(train_data, batch_size=16, sampler=train_sampler)
valid_loader = DataLoader(train_data, batch_size=16, sampler=valid_sampler)
test_loader = DataLoader(test_data, batch_size=1)

In [None]:
model = FNN(300)
# print(model_fnn)

# model.load_state_dict(torch.load("model.pt"))

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=5e-2)
# optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

model.to(device)
criterion.to(device)

n_epochs = 50

valid_loss_min = np.Inf

for epoch in range(n_epochs):
    train_loss = 0.0
    valid_loss = 0.0
    
    model.train()

    for data, target in train_loader:

        optimizer.zero_grad()
        output = model(data)

        loss = criterion(output, target.to(device))
        loss.backward()
        
        optimizer.step()
        
        train_loss += loss.item()*data.size(0)
    
    model.eval()

    for data, target in valid_loader:
        
        output = model(data)
        loss = criterion(output, target.to(device))
        valid_loss += loss.item()*data.size(0)
    
    train_loss = train_loss/len(train_loader.dataset)
    valid_loss = valid_loss/len(valid_loader.dataset)
    
    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch+1, 
        train_loss,
        valid_loss
        ))
    
    # save model if validation loss has decreased
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model.state_dict(), 'model.pt')
        valid_loss_min = valid_loss

In [47]:
yTest = [int(vec[0]) for vec in test]

model = FNN(300)
model.load_state_dict(torch.load("models/model_sent_78.pt"))
model.to(device)
model.eval()
# model.to(device)

preds = []

with torch.no_grad():
    for data, target in tqdm(test_loader):
        output = model(data)
        preds.append(int(output[0].argmax()) - 1)
        # print(output[0], " | ", int(output[0].argmax()), ":", int(target))


evaluate_score(yTest, preds)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 2928/2928 [00:01<00:00, 2197.00it/s]

Accuracy:	 0.7858606557377049
Precision:	 [0.81770335 0.66129032 0.7549505 ]
Recall: 	 [0.93133515 0.46290323 0.6448203 ]
F1 scores:	 [0.87082803 0.54459203 0.69555302]
Average
	Precision: 0.7744457753044501
	Recall: 0.7858606557377049
	F1: 0.7734334237781955





In [48]:
df_tok = pd.read_csv(data_path + "Tweets.csv")

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased", Truncation=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

loading configuration file config.json from cache at C:\Users\Jonny/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\043235d6088ecd3dd5fb5ca3592b6913fd516027\config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.23.1",
  "vocab_size": 30522
}

loading file vocab.txt from cache at C:\Users\Jonny/.cache\huggingface\hub\models--distilbert-base-uncased\snapshots\043235d6088ecd3dd5fb5ca3592b6913fd516027\vocab.txt
loading file tokenizer.json from cache at C:\Users\Jonny/.cache\huggingface\hub\models--dis

In [60]:
data_og = pd.concat([df_tok.text, df.airline_sentiment], axis=1)

data = []
for t in data_og.iterrows():
    temp = {}
    temp["text"] = t[1].text
    temp["label"] = t[1].airline_sentiment + 1
    temp.update(tokenizer(t[1].text))
    data.append(temp)

train, test = train_test_split(data, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)

In [None]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    evaluation_strategy = "steps",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=train,
    eval_dataset=val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
best_run = trainer.hyperparameter_search(n_trials=10, direction="maximize")
best_run

In [None]:
for n, v in best_run.hyperparameters.items():
    setattr(trainer.args, n, v)

trainer.train()

In [None]:
preds = trainer.predict(test)

yTest = [t["label"] for t in test]
yPred = [p.argmax() for p in preds[0]]
evaluate_score(yTest, yPred)

In [None]:
trainer.model.save_pretrained("models/tuned_distilbert_sentiment")

In [57]:
tuned_model = AutoModelForSequenceClassification.from_pretrained("models/tuned_distilbert_sentiment", num_labels=3)
clf = pipeline("text-classification", model=tuned_model, tokenizer=AutoTokenizer.from_pretrained("distilbert-base-uncased", Truncation=True))

loading configuration file tuned_distilbert_sentiment\config.json
Model config DistilBertConfig {
  "_name_or_path": "tuned_distilbert_sentiment",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.23.1",
  "vocab_size": 30522
}

loading weights file tuned_distilbert_sentiment\pytorch_model.bin
All model checkpoint weights were used when initializing DistilBe

In [61]:
test_set = [{k: v for k,v in t.items() if k == "text"} for t in test]
preds = clf(test_set)

In [62]:
label_map = {"LABEL_0":0,"LABEL_1":1,"LABEL_2":2}
yPred = [label_map[v] for t in preds for k,v in t.items() if k == "label" ]
yTest = [t["label"] for t in test]

evaluate_score(yTest, yPred)

Accuracy:	 0.8562158469945356
Precision:	 [0.86430678 0.82987552 0.84708738]
Recall: 	 [0.96487377 0.62695925 0.7457265 ]
F1 scores:	 [0.91182573 0.71428571 0.79318182]
Average
	Precision: 0.8540520613964016
	Recall: 0.8562158469945356
	F1: 0.8498189377228219
