In [None]:
import pandas as pd
import string
import nltk
import spacy
import re
import torch.nn.functional as F
import seaborn as sns
from keras.utils import pad_sequences
from sklearn import preprocessing
from torch import nn
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from tqdm import tqdm

# Fetch the NLTK resources the preprocessing helpers below depend on.
nltk.download('stopwords')   # read by remove_stopwords via stopwords.words('english')
nltk.download('wordnet')     # backs the WordNetLemmatizer used by lemmatize
nltk.download('omw-1.4')
# nltk.pos_tag (called from pos_tag) needs the averaged-perceptron tagger model.
# It was never downloaded here, so a fresh environment failed with LookupError.
# Newer NLTK releases look the model up under the "_eng" id, older ones under
# the plain id, so request both; an id that is not needed is simply unused.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nlp = spacy.load('en_core_web_sm')

In [None]:
# Load the tweets CSV; later cells read its 'text' and 'airline_sentiment' columns.
df = pd.read_csv('data/Tweets.csv')
# Preview the first five rows.
df.head(5)

In [None]:
# Number of distinct tweet texts within each sentiment class.
df.groupby('airline_sentiment')['text'].nunique()

In [None]:
# Distinct-tweet count per sentiment class, largest class first.
# The sentiment labels stay as the index: resetting it with drop=True replaced
# them with 0..n-1, so the chart built from df_nunique lost its class names.
df_iter = df.groupby('airline_sentiment')['text'].nunique().sort_values(ascending=False)
group = df_iter.index.tolist()   # sentiment labels, in descending-count order
values = df_iter.tolist()        # matching distinct-text counts
df_nunique = pd.DataFrame({'group': group, 'values': values})

In [None]:
# Bar chart of df_nunique: one bar per 'group' entry, bar height taken from 'values'.
# df_nunique holds a single row per group, so estimator=sum just reproduces that
# row's value; ci=None turns the error bars off.
# The trailing ';' suppresses the textual repr of the returned Axes in the cell output.
sns.barplot(
    x="group", 
    y="values", 
    data=df_nunique, 
    estimator=sum, 
    ci=None, 
    color='#69b3a2'
);

In [None]:
# Single WordNetLemmatizer instance shared by lemmatize() below.
lemmatizer = WordNetLemmatizer()

# since these are tweets, remove the @s to the airline and other users
def remove_statics(text):
    """Strip Twitter-style ``@mention`` handles from ``text``.

    A handle is ``@`` followed by one or more ASCII letters, digits or
    underscores. Only the handle itself is removed; surrounding whitespace
    and punctuation are left untouched.

    Args:
        text: The tweet text. Non-string values are coerced with ``str()``,
            matching the other cleaning helpers in this notebook.

    Returns:
        The text with every handle deleted.
    """
    # Raw-string pattern; the old character class listed ``a`` twice and
    # wrapped the handle in a capture group that was never used.
    return re.sub(r'@[A-Za-z0-9_]+', '', str(text))

def remove_punctuations(text):
    """Lower-case ``text`` after deleting every ``string.punctuation`` character.

    The argument is coerced with ``str()`` first, so non-string values are
    cleaned via their string form.
    """
    banned = set(string.punctuation)
    kept_chars = [char for char in str(text) if char not in banned]
    return ''.join(kept_chars).lower()

def remove_non_alnum(text):
    """Keep only the whitespace-separated tokens that are fully alphanumeric.

    The argument is coerced with ``str()``, split on whitespace, and the
    tokens passing ``str.isalnum`` are re-joined with single spaces.
    """
    tokens = str(text).split()
    return ' '.join(filter(str.isalnum, tokens))

def remove_stopwords(text):
    """Drop NLTK English stop words from ``text``.

    The text is split on whitespace and the surviving tokens are re-joined
    with single spaces. Matching is exact, so it is case-sensitive.

    Args:
        text: The text to filter. Non-string values are coerced with
            ``str()``, matching the other cleaning helpers in this notebook.

    Returns:
        The filtered text.
    """
    # Loading the corpus and building the set on every call is wasteful when
    # this runs once per row; memoise the set on the function object instead.
    stop_words = getattr(remove_stopwords, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        remove_stopwords._stop_words = stop_words
    return ' '.join(word for word in str(text).split() if word not in stop_words)

def lemmatize(text):
    """Lemmatize each whitespace-separated word of ``text``.

    Every word goes through the notebook-level ``lemmatizer`` and the results
    are re-joined with single spaces.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())

def pos_tag(tokenized_text):
    """Tag the whitespace-separated tokens of a string with ``nltk.pos_tag``.

    Despite the parameter name, the argument is a plain string; it is split
    on whitespace here before tagging. Returns whatever ``nltk.pos_tag``
    produces for that token list.
    """
    tokens = tokenized_text.split()
    tagged = nltk.pos_tag(tokens)
    return tagged

In [None]:
# Build the text the model consumes: drop @mentions, then lemmatize word by word.
# remove_stopwords, remove_punctuations and remove_non_alnum are defined above
# but intentionally not applied in this pipeline.
df['preprocessed_text'] = df['text'].apply(remove_statics).apply(lemmatize)
# Part-of-speech tags are computed from the raw tweet text, not the cleaned text.
df['text_pos'] = df['text'].apply(pos_tag)
df.head(5)

In [None]:
# Fit a LabelEncoder on the distinct sentiment strings present in the data.
labels = df['airline_sentiment'].unique()
le = preprocessing.LabelEncoder()
# NOTE(review): fit() appears to return the encoder itself (its classes_ attribute
# is read on the next line), so `encoded_labels` is another handle on `le`,
# not an array of encoded values.
encoded_labels = le.fit(labels)
print(encoded_labels.classes_)

In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [None]:
class Tweets(Dataset):
    """Map-style dataset over a tweets DataFrame.

    Each item is a ``(label_id, text)`` pair: the row's ``airline_sentiment``
    translated through ``label_map`` and the row's ``preprocessed_text``.
    Rows are addressed by position (``iloc``), not by index label.
    """

    def __init__(self, df):
        self.df = df
        # Sentiment string -> integer class id.
        self.label_map = dict(neutral=0, positive=1, negative=2)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        text, sentiment = row['preprocessed_text'], row['airline_sentiment']
        return self.label_map[sentiment], text

In [None]:
# Vocabulary cap handed to the Keras Tokenizer (num_words); the model cell also
# uses it as the embedding table size.
MAX_FEATURES = 5000
# Length every tokenised tweet is padded/truncated to in collate_fn.
MAX_LEN = 100

# Build the word index from the cleaned tweet text of the whole DataFrame.
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(df['preprocessed_text'].tolist())

def collate_fn(batch):
    """Collate ``(label_id, text)`` samples into a pair of int64 tensors.

    The texts are converted to id sequences with the notebook-level
    ``tokenizer`` and padded to ``MAX_LEN`` with ``pad_sequences``.

    Returns:
        ``(data, target)``: the padded id matrix and the label ids, both
        ``torch.int64``.
    """
    texts = [sample[1] for sample in batch]
    label_ids = [sample[0] for sample in batch]
    padded_ids = pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_LEN)
    return (
        torch.tensor(padded_ids, dtype=torch.int64),
        torch.tensor(label_ids, dtype=torch.int64),
    )

In [None]:
from torch.utils.data import WeightedRandomSampler , random_split

# Roughly 80/20 train/validation split over the rows of df.
train_size = int(0.8 * len(df) - 1)
test_size = len(df) - train_size

# Seed the split so the same rows land in train/validation on every run.
SPLIT_SEED = 42
train_dataset, test_dataset = random_split(
    Tweets(df),
    [train_size, test_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# WeightedRandomSampler expects ONE weight per sample of the dataset it draws
# from. The previous three-element weight list made it return only indices
# 0, 1 and 2, so every epoch trained on the same three tweets.
# Each training row is weighted by the inverse frequency of its sentiment
# class, so the three classes are drawn about equally often.
train_labels = df['airline_sentiment'].iloc[train_dataset.indices]
class_weights = 1.0 / train_labels.value_counts()
sample_weights = train_labels.map(class_weights).tolist()
sampler = WeightedRandomSampler(sample_weights, num_samples=train_size, replacement=True)

train = DataLoader(train_dataset, collate_fn=collate_fn, batch_size=64, sampler=sampler)
# Accuracy does not depend on batch order, so validation data is not shuffled.
valid = DataLoader(test_dataset, collate_fn=collate_fn, batch_size=64, shuffle=False)

In [None]:
EMBED_SIZE = 32


class TextClassification(nn.Module):
    """Bidirectional-LSTM text classifier that returns one logit per class.

    Pipeline: embedding -> bi-LSTM -> concatenated final hidden states ->
    dropout -> dense + ReLU -> class logits.

    The previous head applied ``Linear(n_classes, 1)`` at every time step and
    a softmax over the sequence axis, so the output had shape
    ``(batch, seq_len, 1)``. The loss and the argmax therefore ran over the
    time steps instead of the sentiment classes. This version returns raw
    ``(batch, n_classes)`` logits, which is what ``CrossEntropyLoss`` expects
    (it applies log-softmax itself).

    Args:
        n_classes: Number of output classes. Defaults to
            ``len(le.classes_)`` from the notebook's label encoder.
        vocab_size: Rows in the embedding table. Defaults to the
            notebook-level ``MAX_FEATURES``.
        embed_size: Width of each token embedding. Defaults to
            ``EMBED_SIZE``; previously ``MAX_LEN``, the sequence length, was
            used as the embedding width.
        hidden_size: LSTM hidden units per direction.
        dense_size: Width of the hidden dense layer in the head.
        dropout: Dropout probability applied to the LSTM features.
    """

    def __init__(self, n_classes=None, vocab_size=None, embed_size=EMBED_SIZE,
                 hidden_size=256, dense_size=64, dropout=0.25):
        super().__init__()
        # Resolve notebook-level defaults lazily so TextClassification() with
        # no arguments keeps working exactly as before.
        if n_classes is None:
            n_classes = len(le.classes_)
        if vocab_size is None:
            vocab_size = MAX_FEATURES
        # pad_sequences pads with id 0 by default; keep that embedding at zero.
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.lstm = nn.LSTM(embed_size, hidden_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(p=dropout)
        # Both directions are concatenated, hence 2 * hidden_size inputs.
        self.linear1 = nn.Linear(2 * hidden_size, dense_size)
        self.linearf = nn.Linear(dense_size, n_classes)

    def forward(self, x):
        """Map a ``(batch, seq_len)`` tensor of token ids to ``(batch, n_classes)`` logits."""
        embedded = self.embedding(x)
        _, (hidden, _) = self.lstm(embedded)
        # hidden[-2] and hidden[-1] are the final states of the forward and
        # backward directions; together they summarise the whole sequence.
        features = torch.cat((hidden[-2], hidden[-1]), dim=1)
        features = self.dropout(features)
        features = F.relu(self.linear1(features))
        # Raw logits: no softmax here, and no debug print of x[1], which
        # raised IndexError on a batch holding a single sample.
        return self.linearf(features)

In [None]:
# Instantiate the classifier with its default settings and print the module summary.
model = TextClassification()
print(model)

In [None]:
import numpy as np
import time

def train_loop(train):
    """Run one optimisation pass over ``train`` and return the training accuracy.

    Uses the notebook-level ``model``, ``optimizer`` and ``criterion``. The
    caller is expected to have put the model in training mode.

    Args:
        train: Iterable of ``(x_batch, y_batch)`` pairs, e.g. a DataLoader.

    Returns:
        Fraction of samples whose arg-max prediction equals the label, or
        ``0.0`` when ``train`` yields no batches.
    """
    total_acc, total_count = 0, 0
    # Wrap the loader itself (not enumerate(...)) so tqdm can read its length
    # and show real progress; the batch index was never used.
    for x_batch, y_batch in tqdm(train):
        optimizer.zero_grad()
        # Drop only a trailing singleton dimension. A bare squeeze() also
        # removed the batch axis when a batch held one sample, which broke
        # the loss computation.
        predicted_label = model(x_batch).squeeze(-1)
        loss = criterion(predicted_label, y_batch)
        loss.backward()
        # Clip the global gradient norm before the optimiser step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        total_acc += (predicted_label.argmax(-1) == y_batch).sum().item()
        total_count += y_batch.size(0)
    # An empty loader would otherwise raise ZeroDivisionError.
    return total_acc / total_count if total_count else 0.0


In [None]:
def evaluate(valid):
    """Return the accuracy of the notebook-level ``model`` over ``valid``.

    Gradients are disabled for the whole pass. The caller is expected to
    have put the model in evaluation mode.

    Args:
        valid: Iterable of ``(x_batch, y_batch)`` pairs, e.g. a DataLoader.

    Returns:
        Fraction of samples whose arg-max prediction equals the label, or
        ``0.0`` when ``valid`` yields no batches.
    """
    total_acc, total_count = 0, 0
    with torch.no_grad():
        # Wrap the loader itself so tqdm can read its length; the batch
        # index was never used.
        for x_batch, y_batch in tqdm(valid):
            # Drop only a trailing singleton dimension; a bare squeeze()
            # also removed the batch axis for a one-sample batch.
            predicted_label = model(x_batch).squeeze(-1)
            total_acc += (predicted_label.argmax(-1) == y_batch).sum().item()
            total_count += y_batch.size(0)
    # An empty loader would otherwise raise ZeroDivisionError.
    return total_acc / total_count if total_count else 0.0

In [None]:
# Training configuration.
EPOCHS = 15
LR = 0.0001
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

for epoch in range(EPOCHS):
    epoch_start_time = time.time()
    # Switch modes explicitly so dropout is active only while training.
    model.train()
    train_acc = train_loop(train)
    model.eval()
    val_acc = evaluate(valid)
    # The start time was recorded but never reported; show the epoch duration.
    epoch_seconds = time.time() - epoch_start_time
    print('Epoch ' , epoch + 1, '|' , 'train_acc:' , "{:.2f}".format(train_acc) , '|' ,'val_acc: ', "{:.2f}".format(val_acc), '|', 'time:', "{:.1f}s".format(epoch_seconds))

In [None]:
# Persist the trained model object to disk.
# NOTE(review): passing the module itself makes torch.save pickle the whole
# object, so loading needs the TextClassification class to be importable.
# The '.h5' suffix does not make this an HDF5 file. Confirm what downstream
# loaders expect before changing either.
torch.save(model , 'model_lstm.h5')