In [None]:
import pandas as pd
import string
import nltk
import spacy
import re
import torch.nn.functional as F
import seaborn as sns
from keras.utils import pad_sequences
from sklearn import preprocessing
from torch import nn
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
from tqdm import tqdm

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nlp = spacy.load('en_core_web_sm')

In [None]:
df = pd.read_csv('data/Tweets.csv')
df.head(5)

In [None]:
df.groupby('airline_sentiment')['text'].nunique()

In [None]:
df_iter = df.groupby('airline_sentiment')['text'].nunique().sort_values(ascending=False).reset_index(drop=True)
group = []
values = []
for k,v in df_iter.items():
    group.append(k)
    values.append(v)
df_nunique = pd.DataFrame({'group' : group , 'values':values})

In [None]:
sns.barplot(
    x="group", 
    y="values", 
    data=df_nunique, 
    estimator=sum, 
    ci=None, 
    color='#69b3a2'
);

In [None]:
lemmatizer = WordNetLemmatizer()

# since these are tweets, remove the @s to the airline and other users
def remove_statics(text):
    text = re.sub('@([A-Za-z0-9a_]+)', '' , text)
    return text

def remove_punctuations(text):
    text = str(text)
    return text.translate(str.maketrans('' , '' , string.punctuation)).lower()

def remove_non_alnum(text):
    text = str(text)
    text_list = [ch for ch in text.split() if ch.isalnum()]
    return ' '.join(text_list)

def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    text = text.split()
    return ' '.join([e for e in text if e not in stop_words])

def lemmatize(text):
    lemmatized_text = []
    for word in text.split():
        lemmatized_text.append(lemmatizer.lemmatize(word))
    return ' '.join(lemmatized_text)

def pos_tag(tokenized_text):
    tokenized_text = tokenized_text.split()
    return nltk.pos_tag(tokenized_text)

In [None]:
df['preprocessed_text'] = df['text'].apply(remove_stopwords)
df['preprocessed_text'] = df['preprocessed_text'].apply(remove_statics)
df['preprocessed_text'] = df['preprocessed_text'].apply(lemmatize)
df['preprocessed_text'] = df['preprocessed_text'].apply((remove_punctuations))
df['preprocessed_text'] = df['preprocessed_text'].apply(remove_non_alnum)
df['text_pos'] = df['text'].apply(pos_tag)
df.head(5)

In [None]:
labels = df['airline_sentiment'].unique()
le = preprocessing.LabelEncoder()
encoded_labels = le.fit(labels)
print(encoded_labels.classes_)

In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [None]:
class Tweets(Dataset):
    def __init__(self, df):
        self.df = df
        self.label_map = {'neutral' : 0 , 'positive': 1 , 'negative':2}
    
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        text = self.df.iloc[idx]['preprocessed_text']
        label = self.df.iloc[idx]['airline_sentiment']
        return self.label_map[label] , text

In [None]:
MAX_FEATURES = 5000
MAX_LEN = 50

tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(df['preprocessed_text'].tolist())

def collate_fn(batch):
    data = [item[1] for item in batch]
    target = [item[0] for item in batch]
    data = tokenizer.texts_to_sequences(data)
    data = pad_sequences(data, maxlen=MAX_LEN)
    data = torch.tensor(data, dtype=torch.long)
    target = torch.tensor(target, dtype=torch.long)
    return data , target

In [52]:
train_size = int(0.8 * len(df) - 1)
test_size = len(df) - train_size

train_dataset, test_dataset = torch.utils.data.random_split(Tweets(df), [train_size, test_size])

train = DataLoader(train_dataset,collate_fn=collate_fn , batch_size=128, shuffle=True)
valid = DataLoader(test_dataset,collate_fn=collate_fn, batch_size=128, shuffle=True)

In [53]:
filter_sizes = [2 , 3 , 4]
class TextClassification(nn.Module):
    
    def __init__(self):
        super(TextClassification, self).__init__()
        n_classes = len(le.classes_)

        self.embedding = nn.Embedding(MAX_FEATURES, 100)
        
        self.conv1 = nn.Conv1d(MAX_LEN , 256 , kernel_size = filter_sizes[0], stride=1)
        self.pool_1 = nn.AdaptiveMaxPool1d(2048)
        
        # self.conv2 = nn.Conv1d(MAX_LEN , 256 , kernel_size = filter_sizes[1] , stride= 1)
        # self.padding2 = nn.ConstantPad1d(padding=1 , value=0)
        # self.pool_2 = nn.MaxPool1d(kernel_size=filter_sizes[1], stride=1)

        # self.conv3 = nn.Conv1d(MAX_LEN , 256 , kernel_size = filter_sizes[0] , stride=1)
        # self.pool_3 = nn.MaxPool1d(kernel_size=filter_sizes[0] , stride=1)
        
        self.linear1 = nn.Linear(2048 , n_classes)
        self.norm1 = nn.BatchNorm1d(256)
        self.linearf = nn.Linear(n_classes, 1)

    def forward(self, x):
        embed_result = self.embedding(x)

        conv1 = self.conv1(embed_result)
        conv1 = F.relu(conv1)
        pool1 = self.pool_1(conv1)

        # conv2 = self.conv2(embed_result)
        # conv2 = F.relu(conv2)
        # pool2 = self.pool_2(self.padding2(conv2))

        # conv3 = self.conv3(embed_result)
        # conv3 = F.relu(conv3)
        # pool3 = self.pool_3(conv3)
        
        # union = torch.concat((pool1 , pool2 , pool3) , 1)

        x = self.linear1(pool1)
        x = F.relu(x)
        x = self.norm1(x)
        x = self.linearf(x)
        
        return F.softmax(x , dim=1)

In [102]:
class TextClassifier(nn.ModuleList):

   def __init__(self):
      super(TextClassifier, self).__init__()

      # Parameters regarding text preprocessing
      self.seq_len = MAX_LEN
      self.num_words = MAX_FEATURES
      self.embedding_size = 100
      
      # Dropout definition
      self.dropout = nn.Dropout(0.25)
      
      # CNN parameters definition
      # Kernel sizes
      self.kernel_1 = 2
      self.kernel_2 = 3
      self.kernel_3 = 4
      self.kernel_4 = 5
      
      # Output size for each convolution
      self.out_size = 32
      # Number of strides for each convolution
      self.stride = 1
      
      # Embedding layer definition
      self.embedding = nn.Embedding(self.num_words + 1, self.embedding_size, padding_idx=0)
      
      # Convolution layers definition
      self.conv_1 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_1, self.stride)
      self.conv_2 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_2, self.stride)
      self.conv_3 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_3, self.stride)
      self.conv_4 = nn.Conv1d(self.seq_len, self.out_size, self.kernel_4, self.stride)
      
      # Max pooling layers definition
      self.pool_1 = nn.MaxPool1d(self.kernel_1, self.stride)
      self.pool_2 = nn.MaxPool1d(self.kernel_2, self.stride)
      self.pool_3 = nn.MaxPool1d(self.kernel_3, self.stride)
      self.pool_4 = nn.MaxPool1d(self.kernel_4, self.stride)
      
      # Fully connected layer definition
      self.fc = nn.Linear(380, 1)



   def forward(self, x):
         # Sequence of tokes is filterd through an embedding layer
         x = self.embedding(x)
         
         # Convolution layer 1 is applied
         x1 = self.conv_1(x)
         x1 = torch.relu(x1)
         x1 = self.pool_1(x1)
         
         # Convolution layer 2 is applied
         x2 = self.conv_2(x)
         x2 = torch.relu((x2))
         x2 = self.pool_2(x2)
   
         # Convolution layer 3 is applied
         x3 = self.conv_3(x)
         x3 = torch.relu(x3)
         x3 = self.pool_3(x3)
         
         # Convolution layer 4 is applied
         x4 = self.conv_4(x)
         x4 = torch.relu(x4)
         x4 = self.pool_4(x4)
         
         # The output of each convolutional layer is concatenated into a unique vector
         union = torch.cat((x1, x2, x3, x4), 2)
         # union = union.reshape(union.size(0), -1)

         # The "flattened" vector is passed through a fully connected layer
         out = self.fc(union)
         # Dropout is applied		
         out = self.dropout(out)
         # Activation function is applied
         out = torch.sigmoid(out)
         
         return out

In [103]:
model = TextClassifier()
print(model)

TextClassifier(
  (dropout): Dropout(p=0.25, inplace=False)
  (embedding): Embedding(5001, 100, padding_idx=0)
  (conv_1): Conv1d(1024, 32, kernel_size=(2,), stride=(1,))
  (conv_2): Conv1d(1024, 32, kernel_size=(3,), stride=(1,))
  (conv_3): Conv1d(1024, 32, kernel_size=(4,), stride=(1,))
  (conv_4): Conv1d(1024, 32, kernel_size=(5,), stride=(1,))
  (pool_1): MaxPool1d(kernel_size=2, stride=1, padding=0, dilation=1, ceil_mode=False)
  (pool_2): MaxPool1d(kernel_size=3, stride=1, padding=0, dilation=1, ceil_mode=False)
  (pool_3): MaxPool1d(kernel_size=4, stride=1, padding=0, dilation=1, ceil_mode=False)
  (pool_4): MaxPool1d(kernel_size=5, stride=1, padding=0, dilation=1, ceil_mode=False)
  (fc): Linear(in_features=380, out_features=1, bias=True)
)


In [104]:
import numpy as np
import time
def train_loop(train):
    total_acc, total_count = 0, 0
    for idx, (x_batch, y_batch) in tqdm(enumerate(train)):
        optimizer.zero_grad()
        predicted_label = model(x_batch)
        predicted_label = predicted_label.squeeze()
        loss = criterion(predicted_label, y_batch)
        loss.backward()
        optimizer.step()
        total_acc += (predicted_label.argmax(-1) == y_batch).sum().item()
        total_count += y_batch.size(0)
    return total_acc/total_count


In [105]:
def evaluate(valid):
    total_acc, total_count = 0, 0
    with torch.no_grad():
        for idx, (x_batch, y_batch) in tqdm(enumerate(valid)):
            predicted_label = model(x_batch)
            predicted_label = predicted_label.squeeze()
            total_acc += (predicted_label.argmax(-1) == y_batch).sum().item()
            total_count += y_batch.size(0)
    return total_acc/total_count

In [106]:
EPOCHS = 5
LR = 0.0001
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
for epoch in range(EPOCHS):
    epoch_start_time = time.time()
    model.train()
    train_acc = train_loop(train)
    model.eval()    
    val_acc = evaluate(valid)
    print('Epoch ' , epoch + 1, '|' , 'train_acc:' , "{:.2f}".format(train_acc) , '|' ,'val_acc: ', "{:.2f}".format(val_acc))

92it [01:44,  1.14s/it]
23it [00:18,  1.22it/s]


Epoch  1 | train_acc: 0.17 | val_acc:  0.51


92it [01:48,  1.17s/it]
23it [00:19,  1.21it/s]


Epoch  2 | train_acc: 0.39 | val_acc:  0.42


92it [01:46,  1.16s/it]
23it [00:19,  1.20it/s]


Epoch  3 | train_acc: 0.33 | val_acc:  0.28


92it [01:47,  1.17s/it]
23it [00:19,  1.17it/s]


Epoch  4 | train_acc: 0.28 | val_acc:  0.24


92it [01:48,  1.18s/it]
23it [00:19,  1.17it/s]

Epoch  5 | train_acc: 0.25 | val_acc:  0.23





In [None]:
torch.save(model , 'model_cnn.h5')