In [1]:
#
# Import basic libraries
#

import pandas as pd
import numpy as np
import nltk 
import re

# Make sure the stopwords exist

_ = nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/alan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#
# Routines
#

# Remove punctuations comes intermediate after a word

def rm_postfix_punctuation(word:str):
    # Create a copy for editing
    word_tmp = word
    
    # Find locations of non-letter chars
    for match in (list(re.finditer(r'[^\w\s]', word))[::-1]): 
        # Remove the last char if the last char is not a letter
        if match.end() == len(word_tmp): 
            word_tmp = word_tmp[:len(word_tmp)-1] 
            
    return word_tmp

# Preprocessing for one tweet

def preprocess_one_tweet(the_tweet:np.ndarray):
    # Define a stemmer 
    stemmer = nltk.stem.snowball.SnowballStemmer("english")
    
    # Split words from a tweet and store it in a list after preprocessing
    tokens = [stemmer.stem(rm_postfix_punctuation(word)) \
             for word in the_tweet.lower().split() if \
             not (word.startswith("@") or re.search(r'://', word) or
                 word in nltk.corpus.stopwords.words("english")) ]
    
    # Remove empty "words"
    tokens = [word for word in tokens if len(word) > 0]
    
    return tokens

# Create corpus and word count for a list of tweets

def create_corpus(list_of_tweets:np.ndarray, size:int = 10000):
    # Create an empty dict for word counting
    word_count = dict()    
    
    for tweet in list_of_tweets:
        # Get the tokens from the routine
        tokens_tweet = preprocess_one_tweet(tweet) 
        
        # Add one to the counter for each token
        for word in tokens_tweet:
            word_count[word] = word_count.get(word, 0) + 1
            
    # Sort the word count for the corpus
    word_count = dict(sorted(word_count.items(), 
                             key=lambda x:x[1], reverse=True))
    
    # Create the corpus accordingly
    corpus = dict([(x[1], x[0]+1) for x in enumerate(word_count.keys())][:(size-2)])
    
    # Create a special token for not included words
    corpus[":empty:"] = 0
    corpus[":not_in_list:"] = size-1
    
    # Sort the words for better presentation
    corpus = dict(sorted(corpus.items(), 
                             key=lambda x:x[1], reverse=False))
            
    return corpus, word_count

# Vectorize a sentance

def vectorize_tweet(corpus:dict, tweet:str, max_tokens:int = 100):
    # Get the tokens from the routine
    tokens_tweet = preprocess_one_tweet(tweet) 
    
    # Shorten the tweet accordingly
    if len(tokens_tweet) > max_tokens:
        tokens_tweet = tokens_tweet[:max_tokens]
    
    # Create empty list for the word vec
    word_vec = list()
    
    # Getting the number of word according to corpus
    for token in tokens_tweet:
        word_vec.append(corpus.get(token, corpus[":not_in_list:"]))
    
    # If the tweet was too short, append 0s 
    if len(word_vec) < max_tokens:
        word_vec = word_vec + [0] * (max_tokens - len(word_vec))
        
    return word_vec

# Process the input x and output y

def generate_x_and_y(data:np.ndarray, corpus:dict, max_tokens:int = 100):
    # dict mapping labels to position in the one-hot vector
    labels = dict([(x[1],x[0]) for x in enumerate(list(set(data[:,0])))])
        
    x = []
    y = []
    
    # for loop run through the data set
    for idx in range(data.shape[0]):
        # Create one-hot vector for desired output 
        y_new = [0] * len(labels)
        y_new[labels[data[idx,0]]] = 1
        
        # Get the vectorized tweet as the input
        x_new = vectorize_tweet(corpus, data[idx,1], max_tokens)
        
        y.append(y_new)
        x.append(x_new)
        
    return np.array(x), np.array(y)

In [3]:
#
# Obtain the dataset downloaded from Kaggle
#

original_dataset = pd.read_csv("datasets/Sentiment140_tweets.csv", header=None)

In [4]:
#
# Showing first few lines of the dataset
#

original_dataset.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [5]:
#
# Handling the dataset and split it into different groups
#

# Split the datasets

shuffled_rank = np.arange(original_dataset.values.data.shape[0])
np.random.shuffle(shuffled_rank)
shuffled_rank = shuffled_rank[:original_dataset.values.data.shape[0]//5]

# Obtain the idx for different data-subsets

idx4train = shuffled_rank[:int(len(shuffled_rank)*0.2)]
idx4test = shuffled_rank[int(len(shuffled_rank)*0.2):]

# Assign the datasets

data_train = original_dataset.values[idx4train, 0:6:5]
data_test = original_dataset.values[idx4test, 0:6:5]

# Clean-up the original 

original_dataset = None

In [6]:
#
# Show an example of one tweet
#

print(data_train[0])

print(preprocess_one_tweet(data_train[0,1]))

[0
 '@JoyceCamp thank god, yes i am. i just dented the hell out of the hood, and it left a turd on the headlights ']
['thank', 'god', 'yes', 'am', 'dent', 'hell', 'hood', 'left', 'turd', 'headlight']


In [7]:
#
# Generate and check the corpus generated
#

corpus_train, word_count = create_corpus(data_train[:,1], 1000)

print(list(corpus_train.items())[:10])

[(':empty:', 0), ('go', 1), ("i'm", 2), ('get', 3), ('day', 4), ('good', 5), ('work', 6), ('like', 7), ('love', 8), ('today', 9)]


In [8]:
#
# Convert the data to vectors
#

x_train, y_train = generate_x_and_y(data_train, corpus_train, 20)
x_test, y_test = generate_x_and_y(data_test, corpus_train, 20)

In [9]:
#
# Free up the memory for my laptop
#

data_train = None
data_test = None

In [10]:
#
# Starting Design a Keras Model
#

import tensorflow.keras as keras

# Input for variable-length sequences of integers
inputs = keras.Input(shape=(None,), dtype="int32")
# Embed each integer in a 128-dimensional vector
x = keras.layers.Embedding(len(corpus_train), 128)(inputs)
# Add 2 bidirectional LSTMs
x = keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True))(x)
x = keras.layers.Bidirectional(keras.layers.LSTM(64))(x)
# Add a classifier
outputs = keras.layers.Dense(2, activation="softmax")(x)
model = keras.Model(inputs, outputs)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 128)         128000    
_________________________________________________________________
bidirectional (Bidirectional (None, None, 128)         98816     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 2)                 258       
Total params: 325,890
Trainable params: 325,890
Non-trainable params: 0
_________________________________________________________________


In [11]:
#
# Compile the model
#

model.compile(optimizer='adam',loss="binary_crossentropy",metrics=["accuracy"])

In [12]:
#
# Training the model
#

model.fit(x=x_train, y=y_train, batch_size=32, epochs=2, validation_data=(x_test, y_test))

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7fe7bc71adc0>