In [None]:
import os
import zipfile
import pandas as pd
import string
from nltk import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from unidecode import unidecode
from nltk.tokenize import word_tokenize
from sklearn.utils import resample

# Contraction -> expansion lookup used when expanding contractions in tweets.
# Keys are written with a backtick (`) where an apostrophe would normally be.
# Matching against these keys is case-sensitive, which is why both "i`m" and
# "I`m" are listed.
#
# Several expansions are ambiguous and are stored as "a / b"; that text,
# including the slash, is what gets substituted into the tweet.
#
# NOTE(review): the original literal listed six keys twice (haven`t, it`s,
# let`s, we`ll, we`re, you`re). Python keeps the LAST value for a repeated key
# at the position of its FIRST occurrence, so the duplicates are collapsed
# here to exactly that effective mapping. Only "it`s" had two different values
# ("it is" first, "it has / it is" later); the later one was the one in effect
# and is kept -- confirm which was intended.
cList = {
    "i`m": "i am",
    "you`re": "you are",
    "it`s": "it has / it is",
    "we`re": "we are",
    "we`ll": "we will",
    "That`s": "that is",
    "haven`t": "have not",
    "let`s": "let us",
    "ain`t": "am not / are not / is not / has not / have not",
    "aren`t": "are not / am not",
    "can`t": "cannot",
    "can`t`ve": "cannot have",
    "`cause": "because",
    "could`ve": "could have",
    "couldn`t": "could not",
    "couldn`t`ve": "could not have",
    "didn`t": "did not",
    "doesn`t": "does not",
    "don`t": "do not",
    "hadn`t": "had not",
    "hadn`t`ve": "had not have",
    "hasn`t": "has not",
    "he`d": "he had / he would",
    "he`d`ve": "he would have",
    "he`ll": "he shall / he will",
    "he`ll`ve": "he shall have / he will have",
    "he`s": "he has / he is",
    "how`d": "how did",
    "how`d`y": "how do you",
    "how`ll": "how will",
    "how`s": "how has / how is / how does",
    "I`d": "I had / I would",
    "I`d`ve": "I would have",
    "I`ll": "I shall / I will",
    "I`ll`ve": "I shall have / I will have",
    "I`m": "I am",
    "I`ve": "I have",
    "isn`t": "is not",
    "it`d": "it had / it would",
    "it`d`ve": "it would have",
    "it`ll": "it shall / it will",
    "it`ll`ve": "it shall have / it will have",
    "ma`am": "madam",
    "mayn`t": "may not",
    "might`ve": "might have",
    "mightn`t": "might not",
    "mightn`t`ve": "might not have",
    "must`ve": "must have",
    "mustn`t": "must not",
    "mustn`t`ve": "must not have",
    "needn`t": "need not",
    "needn`t`ve": "need not have",
    "o`clock": "of the clock",
    "oughtn`t": "ought not",
    "oughtn`t`ve": "ought not have",
    "shan`t": "shall not",
    "sha`n`t": "shall not",
    "shan`t`ve": "shall not have",
    "she`d": "she had / she would",
    "she`d`ve": "she would have",
    "she`ll": "she shall / she will",
    "she`ll`ve": "she shall have / she will have",
    "she`s": "she has / she is",
    "should`ve": "should have",
    "shouldn`t": "should not",
    "shouldn`t`ve": "should not have",
    "so`ve": "so have",
    "so`s": "so as / so is",
    "that`d": "that would / that had",
    "that`d`ve": "that would have",
    "that`s": "that has / that is",
    "there`d": "there had / there would",
    "there`d`ve": "there would have",
    "there`s": "there has / there is",
    "they`d": "they had / they would",
    "they`d`ve": "they would have",
    "they`ll": "they shall / they will",
    "they`ll`ve": "they shall have / they will have",
    "they`re": "they are",
    "they`ve": "they have",
    "to`ve": "to have",
    "wasn`t": "was not",
    "we`d": "we had / we would",
    "we`d`ve": "we would have",
    "we`ll`ve": "we will have",
    "we`ve": "we have",
    "weren`t": "were not",
    "what`ll": "what shall / what will",
    "what`ll`ve": "what shall have / what will have",
    "what`re": "what are",
    "what`s": "what has / what is",
    "what`ve": "what have",
    "when`s": "when has / when is",
    "when`ve": "when have",
    "where`d": "where did",
    "where`s": "where has / where is",
    "where`ve": "where have",
    "who`ll": "who shall / who will",
    "who`ll`ve": "who shall have / who will have",
    "who`s": "who has / who is",
    "who`ve": "who have",
    "why`s": "why has / why is",
    "why`ve": "why have",
    "will`ve": "will have",
    "won`t": "will not",
    "won`t`ve": "will not have",
    "would`ve": "would have",
    "wouldn`t": "would not",
    "wouldn`t`ve": "would not have",
    "y`all": "you all",
    "y`all`d": "you all would",
    "y`all`d`ve": "you all would have",
    "y`all`re": "you all are",
    "y`all`ve": "you all have",
    "you`d": "you had / you would",
    "you`d`ve": "you would have",
    "you`ll": "you shall / you will",
    "you`ll`ve": "you shall have / you will have",
    "you`ve": "you have",
}

# Extra punctuation-like tokens appended to the stop-word list below.
extra_punctuations = ['', '.', '``', '...', '\'s', '--', '-', 'n\'t', '_', '–', '&']

# Full stop-word list: NLTK's English stop words, every character of
# string.punctuation, the extra tokens above, and a hand-picked word list
# ('say' appears twice in that list; it is kept as written).
# In the code visible here it is referenced only from a commented-out line
# inside text_cleaning.
stopword_list = (
    stopwords.words('english')
    + list(string.punctuation)
    + extra_punctuations
    + [
        'u', 'the', 'us', 'say',
        'that', 'he', 'me', 'she',
        'get', 'rt', 'it', 'mt',
        'via', 'not', 'and',
        'let', 'so', 'say',
        'dont', 'use', 'you',
        'null',
    ]
)
import regex as re

# Compiled alternation over every contraction key in cList.
#  * Keys are tried longest-first: regex alternation takes the first
#    alternative that matches, so in plain dict order "can`t" would win over
#    "can`t`ve" and leave a dangling "`ve" behind.
#  * Keys are escaped so they are always matched literally.
#  * The lookarounds stop a key from matching inside a longer word
#    (e.g. "it`s" inside "wit`s"). They are zero-width, so match.group(0) is
#    still exactly the cList key that expandContractions looks up.
c_re = re.compile(
    r'(?<!\w)(%s)(?!\w)' % '|'.join(
        re.escape(key) for key in sorted(cList.keys(), key=len, reverse=True)
    )
)


def expandContractions(text, c_re=c_re):
    """Expand contractions in ``text``.

    Every substring matched by ``c_re`` is replaced with the value stored
    under that exact substring in ``cList``.

    Args:
        text: the string to process.
        c_re: compiled pattern whose matches are keys of ``cList``
            (defaults to the module-level pattern).

    Returns:
        The text with each match substituted by its expansion.
    """
    return c_re.sub(lambda match: cList[match.group(0)], text)


def remove_emoji(string):
    """Remove emoji and pictographic symbols from ``string``.

    The character class below includes a very wide catch-all range
    (U+24C2 - U+1F251). Besides symbols, that range also contains ordinary
    letters of many scripts (CJK ideographs, kana, Hangul, ...), which used to
    be deleted together with the emoji. To avoid destroying real text, any
    character inside a matched run that Python classifies as a letter or a
    digit (``str.isalnum``) is kept; everything else in the run is dropped.

    Args:
        string: text to clean.

    Returns:
        The text without the matched symbol characters.
    """
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"  # wide catch-all, see docstring
                               "]+", flags=re.UNICODE)

    def _keep_letters_and_digits(match):
        # Preserve real text that merely falls inside the wide range.
        return ''.join(ch for ch in match.group(0) if ch.isalnum())

    return emoji_pattern.sub(_keep_letters_and_digits, string)


def remove_punctuations(data):
    """Delete every character that is neither a word character nor whitespace.

    Word characters include the underscore, so ``_`` is kept.
    """
    return re.sub(r'[^\w\s]', r'', data)


def removeSpecialChars(data):
    '''
    Removes special characters which are specifically found in tweets.

    Steps, in order:
      1. delete "www." links and http:// / https:// URLs,
      2. delete @mentions,
      3. delete dotted runs such as "org.apache.",
      4. collapse every whitespace run into a single space,
      5. delete every "." that is not the last character,
      6. turn "#word" into "word".

    HTML markup is not decoded (the original BeautifulSoup step was already
    disabled).

    Args:
        data: the tweet text.

    Returns:
        The cleaned text. Leading/trailing spaces are not stripped.
    '''
    # "https?" makes the "s" optional. The previous "http?" made the "p"
    # optional instead, so https:// links were never removed.
    data = re.sub(r'(www\.\S+)|(https?://\S+)', '', data)

    # Convert @username to empty strings
    data = re.sub(r'@\S+', '', data)

    # Remove org.apache. like texts (two or more "word." groups in a row)
    data = re.sub(r'(\w+\.){2,}', '', data)

    # Remove additional white spaces
    data = re.sub(r'\s+', ' ', data)

    # Remove every dot except one at the very end of the text
    data = re.sub(r'\.(?!$)', '', data)

    # Replace #word with word
    data = re.sub(r'#(\S+)', r'\1', data)

    return data


def remove_nonenglish_charac(string):
    """Replace every character that is not an ASCII letter (a-z, A-Z) with a space.

    Digits, punctuation, whitespace and accented letters are all replaced
    one-for-one, so the length of the text is unchanged.
    """
    non_letters = re.compile('[^a-zA-Z]')
    return non_letters.sub(' ', string)


def text_cleaning(data):
    """Clean a single tweet.

    Steps, in order:
      1. transliterate to ASCII with ``unidecode``,
      2. expand contractions (``expandContractions``),
      3. strip URLs, @mentions, dotted runs, inner dots and ``#`` signs
         (``removeSpecialChars``),
      4. strip emoji (``remove_emoji``),
      5. drop words of one or two characters,
      6. collapse runs of spaces.

    Stop-word removal, stemming and lemmatisation are NOT performed (they were
    already commented out in the original implementation).

    NOTE(review): step 5 was silently inactive before. Its pattern was written
    in a non-raw string, so the word-boundary escape became a backspace
    character and never matched. The filter now runs, which changes the
    cleaned text (short words such as "I", "is", "my" are removed). It runs
    after step 3 so that URLs and @mentions are removed intact first.

    Args:
        data: the tweet text (must be a ``str``).

    Returns:
        The cleaned text; leading/trailing spaces are not stripped.

    Raises:
        Exception: whatever ``unidecode`` raises for an invalid input; the
            offending value is printed first.
    """
    try:
        data = unidecode(data)
    except Exception:
        # Show the offending value, then re-raise with the original traceback.
        print("#### the data is : ", data)
        raise
    data = expandContractions(data)
    data = removeSpecialChars(data)
    data = remove_emoji(data)
    # Raw string: \b must reach the regex engine as a word boundary.
    data = re.sub(r'\b\w{1,2}\b', '', data)
    data = re.sub(' +', ' ', data)
    return data


def step_1_get_dataframe(filename, filetype="json"):
    """Load a dataset, de-duplicate it, drop incomplete rows and shuffle it.

    Args:
        filename: path of the file to read.
        filetype: ``"csv"`` or ``"json"`` (default ``"json"``).

    Returns:
        A shuffled DataFrame (fixed seed 42) without fully duplicated rows and
        without rows containing missing values. The original index labels are
        kept. The first rows and the column index are printed as a side effect.

    Raises:
        ValueError: if ``filetype`` is neither ``"csv"`` nor ``"json"``.
    """
    if filetype == "csv":
        train_df = pd.read_csv(filename)
    elif filetype == "json":
        train_df = pd.read_json(filename)
    else:
        # Previously this fell through to an UnboundLocalError further down.
        raise ValueError(f"Unsupported filetype {filetype!r}; expected 'csv' or 'json'")

    # drop_duplicates returns a new frame; the result used to be discarded,
    # so duplicates were never actually removed.
    train_df = train_df.drop_duplicates(keep='first')
    train_df = train_df.dropna()
    train_df_shuffled = train_df.sample(frac=1, random_state=42)  # shuffle with random_state=42 for reproducibility
    print(train_df_shuffled.head())
    print(train_df_shuffled.columns)
    return train_df_shuffled


def remove_columns(df, cols: list, ):
    """Drop the given columns from ``df`` in place.

    Args:
        df: DataFrame to modify (it is mutated, not copied).
        cols: labels of the columns to remove.

    Returns:
        The same ``df`` object, for call chaining. The first rows and the
        remaining columns are printed as a side effect.
    """
    df.drop(cols, axis=1, inplace=True)
    # Call head(): printing the bare attribute showed the bound-method repr.
    print(df.head())
    print(df.columns)
    return df


def rename_column(df,original, new, ):
    """Rename one column of ``df`` in place.

    Args:
        df: DataFrame to modify (it is mutated, not copied).
        original: current column label.
        new: replacement column label.

    Returns:
        The same ``df`` object. The resulting columns are printed.
    """
    mapping = {original: new}
    df.rename(columns=mapping, inplace=True)
    print(df.columns)
    return df


def label_count_rebalancing(df, labels, random_state=42):
    """Down-sample every class to the size of the smallest one.

    Args:
        df: DataFrame with a ``labels`` column.
        labels: the label values to keep; rows with other labels are dropped.
        random_state: seed used both for the per-class sampling and for the
            final shuffle. Previously the per-class sampling was unseeded, so
            the selected rows changed from run to run even though the final
            shuffle was seeded.

    Returns:
        A shuffled DataFrame holding the same number of rows for each
        requested label. Class counts before and after are printed.

    Raises:
        ValueError: if ``labels`` is empty.
    """
    print("### Old count: ")
    print(df.labels.value_counts())

    # One sub-frame per requested label.
    per_label = [df[df['labels'] == label] for label in labels]

    # Size of the rarest class decides how many rows every class keeps.
    lowest = min(len(part) for part in per_label)

    # Sample without replacement, seeded for reproducibility.
    balanced = [
        part.sample(n=lowest, replace=False, random_state=random_state)
        for part in per_label
    ]

    train_df = pd.concat(balanced)
    print("### New counts ")
    print(train_df.labels.value_counts())
    return train_df.sample(frac=1, random_state=random_state)


def unzip_data(filename):
    """
    Unzips filename into the current working directory, then deletes the archive.

    If the file does not exist, a message is printed and nothing else happens.

    Args:
      filename (str): a filepath to a target zip folder to be unzipped.
    """
    if not os.path.exists(filename):
        print("File not found")
        return

    # The context manager closes the archive even if extraction fails, and
    # the archive is removed only after a successful extraction.
    with zipfile.ZipFile(filename, "r") as zip_ref:
        zip_ref.extractall()
    os.remove(filename)


def apply_text_cleaning(df, column_name):
    """Run ``text_cleaning`` on every value of one column.

    The column is overwritten in ``df`` itself and the same ``df`` object is
    returned.

    Args:
        df: DataFrame holding the text column.
        column_name: name of the column to clean.
    """
    cleaned = df[column_name].apply(text_cleaning)
    df[column_name] = cleaned
    return df


In [None]:
import tensorflow as tf
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from keras import layers
import keras
import numpy as np
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.layers import TextVectorization
import tensorflow_hub as hub
from sklearn.model_selection import train_test_split

In [None]:
# Load the tweets, drop incomplete rows and shuffle them.
df = step_1_get_dataframe("Tweets.csv", filetype="csv")

# Keep only the tweet text and its sentiment.
df = remove_columns(df, cols=['textID', 'selected_text'])

# The rest of the notebook expects the target column to be called "labels".
df = rename_column(df, original="sentiment", new="labels")

# Class re-balancing is currently disabled.
# df = label_count_rebalancing(labels=['neutral', 'positive', 'negative'], df=df)

# apply_text_cleaning mutates and returns its argument, so train_df and df
# refer to the same DataFrame.
train_df = apply_text_cleaning(df, column_name="text")

FileNotFoundError: ignored

In [None]:
# Index labels of every tweet longer than 10 whitespace-separated words.
rem_ind = [
    ind
    for ind, tweet in train_df['text'].items()
    if len(tweet.split()) > 10
]

In [None]:
# DataFrame.drop returns a new frame. The result used to be displayed but never
# assigned, so the long tweets stayed in train_df and reached the model.
# errors="ignore" keeps the cell re-runnable once the rows are already gone.
train_df = train_df.drop(index=rem_ind, errors="ignore")
train_df

Unnamed: 0,text,labels
1589,Enjoy! Family trumps everything,positive
6562,Clive it has / it is my birthday pat me,neutral
2603,congrats hey,positive
4004,is texting,neutral
27232,Tell him where.,neutral
...,...,...
11285,i wish paramore would come to ireland,neutral
21576,feels like warm things,neutral
5391,My best friend is in vegas without me,neutral
861,- fire and urban at rock challenge,neutral


In [None]:
# Features: the cleaned tweet text as a NumPy array.
X = train_df['text'].values

# Targets: string labels -> integer ids -> one-hot rows.
label_ids = LabelEncoder().fit_transform(train_df['labels'].values)
y = to_categorical(label_ids)

In [None]:
# Peek at the first five tweets and their one-hot encoded labels.
X[:5], y[:5]

(array([' Enjoy! Family trumps everything',
        ' --of them kinda turns me off of it all And then I buy more of them and dig a deeper hole, etc ;;',
        'Clive it has / it is my birthday pat me ', ' congrats hey',
        'is texting'], dtype=object),
 array([[0., 0., 1.],
        [1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [0., 1., 0.]], dtype=float32))

In [None]:
# Word-level tokenizer; words not seen during fitting map to the "<oov>" token.
# NOTE(review): the vocabulary is fitted on all of X, i.e. before the
# train/validation split performed in a later cell, so validation-only words
# are part of the vocabulary -- confirm this is intended.
tokenizer = tf.keras.preprocessing.text.Tokenizer(oov_token="<oov>")
tokenizer.fit_on_texts(X)

In [None]:
# word -> integer id mapping learned by the tokenizer.
word_index = tokenizer.word_index
# Number of entries in the mapping; later cells size the embedding matrix and
# the Embedding layer as vocab_size + 1.
vocab_size = len(word_index)
vocab_size

27075

In [None]:
# Hold out 25% of the samples for validation; the seed is fixed so the split is reproducible.
train_text, val_text, train_labels, val_labels = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# Convert each tweet into a list of integer word ids using the fitted tokenizer.
train_seq = tokenizer.texts_to_sequences(train_text)
val_seq = tokenizer.texts_to_sequences(val_text)

In [None]:
# Every sequence is padded / truncated at the end to exactly 16 ids; the same
# length is passed as input_length to the Embedding layers further down.
pad_options = dict(maxlen=16, padding="post", truncating="post")
train_pad = tf.keras.preprocessing.sequence.pad_sequences(train_seq, **pad_options)
val_pad = tf.keras.preprocessing.sequence.pad_sequences(val_seq, **pad_options)

In [None]:
# downloading twitter glove embeddign words.
!wget https://nlp.stanford.edu/data/glove.6B.zip

--2023-09-28 04:08:52--  https://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-09-28 04:08:52--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2023-09-28 04:11:31 (5.19 MB/s) - ‘glove.6B.zip’ saved [862182613/862182613]



In [None]:
from zipfile import ZipFile

# Open the downloaded GloVe archive and extract all of its members into the
# current working directory; the archive itself is left in place.
with ZipFile("glove.6B.zip", 'r') as glove_archive:
    glove_archive.extractall()

In [None]:
# Define path to file containing the embeddings
GLOVE_FILE = 'glove.6B.100d.txt'

# Initialize an empty embeddings index dictionary
GLOVE_EMBEDDINGS = {}

# Read file and fill GLOVE_EMBEDDINGS with its contents.
# Each line is "<word> <v1> <v2> ...". The encoding is given explicitly: the
# file contains non-ASCII tokens, and relying on the platform default encoding
# can raise UnicodeDecodeError or mis-decode words on non-UTF-8 systems.
with open(GLOVE_FILE, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        GLOVE_EMBEDDINGS[word] = coefs

# Sanity check: look up one known word and show its vector.
test_word = 'everything'

test_vector = GLOVE_EMBEDDINGS[test_word]

print(f"Vector representation of word {test_word} looks like this:\n\n{test_vector}")

Vector representation of word everything looks like this:

[ 0.013026   0.33335    0.62812   -0.089324  -0.13434    0.26948
 -0.17045    0.039592   0.19847   -0.054669   0.59498    0.31745
  0.28691    0.42015   -0.23682   -0.3583    -0.45421    0.87357
 -0.26887    0.38228   -0.013516   0.18781    0.11409   -0.91682
  0.17019    0.14309   -0.52976   -0.9702    -0.21943   -0.4512
 -0.25972    0.55875   -0.07939    0.098255  -0.15482    0.11926
  0.034281  -0.079117  -0.18668   -0.64334    0.026627  -0.15963
  0.12529   -0.44979   -0.99793    0.13604   -0.28778   -0.059987
  0.019177  -1.2517     0.5363     0.37451    0.12018    0.93167
 -0.16836   -1.9662     0.19831    0.70928    1.3088     0.22569
 -0.028412   1.2327    -0.44345   -0.34264    0.6256     0.43331
  0.95851    0.030527  -0.10855   -0.17556    0.37907   -0.12585
  0.32332    0.10456    0.80106    0.32022   -0.073298   0.020626
 -0.53155    0.47124    0.73076    0.0048294 -0.60191    0.36261
 -1.2432    -0.070735  -0.2273

In [None]:
# Must match the dimensionality of the vectors in GLOVE_FILE.
EMBEDDING_DIM = 100

# One row per token id (vocab_size + 1 rows), all zeros to start with.
EMBEDDINGS_MATRIX = np.zeros((vocab_size+1, EMBEDDING_DIM))

# Copy the GloVe vector of every vocabulary word that GloVe knows into the row
# of its token id; words without a GloVe vector keep their all-zero row.
for word, i in word_index.items():
    if word in GLOVE_EMBEDDINGS:
        EMBEDDINGS_MATRIX[i] = GLOVE_EMBEDDINGS[word]

In [None]:
# Baseline classifier: GloVe-initialised (trainable) embedding -> two stacked
# LSTMs -> small dense head -> 3-way softmax.
model = tf.keras.models.Sequential()
model.add(
    tf.keras.layers.Embedding(
        input_dim=vocab_size+1,
        output_dim=EMBEDDING_DIM,
        input_length=16,
        weights=[EMBEDDINGS_MATRIX],
    )
)
# The first LSTM returns the full sequence so the second LSTM can consume it.
model.add(tf.keras.layers.LSTM(64, return_sequences=True, dropout=0.3))
model.add(tf.keras.layers.LSTM(32, dropout=0.5))
model.add(tf.keras.layers.Dense(16, activation="relu"))
model.add(tf.keras.layers.Dense(3, activation="softmax"))

# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 16, 100)           2707600   
                                                                 
 lstm_2 (LSTM)               (None, 16, 64)            42240     
                                                                 
 lstm_3 (LSTM)               (None, 32)                12416     
                                                                 
 dense_2 (Dense)             (None, 16)                528       
                                                                 
 dense_3 (Dense)             (None, 3)                 51        
                                                                 
Total params: 2762835 (10.54 MB)
Trainable params: 2762835 (10.54 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train the baseline model for 2 epochs, validating on the held-out split after each epoch.
history = model.fit(train_pad,train_labels,epochs=2,validation_data=(val_pad,val_labels))

Epoch 1/2
Epoch 2/2


In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
# Nothing in the cells visible here reads from or writes to the mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Second, more regularised model. This cell rebinds both `model` and `history`,
# replacing the baseline model and its training history.
# It relies on vocab_size, EMBEDDING_DIM, EMBEDDINGS_MATRIX, train_pad,
# train_labels, val_pad and val_labels from earlier cells.
model = tf.keras.models.Sequential(
    [
        tf.keras.layers.Embedding(
            input_dim=vocab_size + 1,
            output_dim=EMBEDDING_DIM,
            input_length=16,
            weights=[EMBEDDINGS_MATRIX],
            trainable=False  # Freeze the embedding weights so they are not updated during training
        ),
        # Smaller LSTM stack than the baseline (32/16 units instead of 64/32).
        tf.keras.layers.LSTM(32, return_sequences=True, dropout=0.2),
        tf.keras.layers.LSTM(16, dropout=0.2),
        # L2 weight penalty on the dense layer.
        tf.keras.layers.Dense(8, activation="relu", kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        tf.keras.layers.Dropout(0.5),  # Adding dropout to dense layer
        tf.keras.layers.Dense(3, activation="softmax")
    ]
)

# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(
    loss="categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy']
)

model.summary()

# Define early stopping callback
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,  # Number of epochs with no improvement after which training will be stopped
    restore_best_weights=True  # Roll back to the weights of the best epoch
)

# Up to 50 epochs; early stopping may end training sooner, so the length of
# history.history['loss'] is not known in advance.
history = model.fit(
    train_pad,
    train_labels,
    epochs=50,
    validation_data=(val_pad, val_labels),
    callbacks=[early_stopping]
)


NameError: ignored

In [None]:
# Evaluate the current `model` on the validation split (loss and accuracy, per the compile metrics).
model.evaluate(val_pad,val_labels)

In [None]:
#-----------------------------------------------------------
# Retrieve the per-epoch training and validation loss
# recorded by the last model.fit call
#-----------------------------------------------------------
import matplotlib.pyplot as plt
loss = history.history['loss']
val_loss = history.history['val_loss']

# One x value per recorded epoch. The number of epochs actually run depends on
# the fit call (2 for the baseline, up to 50 with early stopping), so a
# hard-coded range(10) makes plt.plot fail with a length mismatch.
epochs = [*range(len(loss))]

#------------------------------------------------
# Plot training and validation loss per epoch
#------------------------------------------------
plt.plot(epochs, loss, 'r')
plt.plot(epochs, val_loss, 'b')
plt.title('Training and validation loss')
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.legend(["Loss", "Validation Loss"])
plt.show()

In [None]:
from scipy.stats import linregress

# Regress the validation loss against its own epoch index, so the x values
# always have the same length as val_loss regardless of how many epochs ran.
slope, *_ = linregress(list(range(len(val_loss))), val_loss)
print(f"The slope of your validation loss curve is {slope:.5f}") # should be less than 0.0005