In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from multiprocessing import cpu_count
from joblib import Parallel, delayed
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.dummy import DummyClassifier

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import nltk
from nltk.sentiment import SentimentAnalyzer, SentimentIntensityAnalyzer

from nlpretext import Preprocessor
from nlpretext.basic.preprocess import normalize_whitespace, remove_punct, remove_eol_characters, remove_stopwords, \
    lower_text, remove_accents, remove_multiple_spaces_and_strip_text, replace_numbers, replace_emails, replace_urls
from nlpretext.social.preprocess import remove_mentions, remove_hashtag, remove_emoji

# Other libraries worth knowing for this kind of work:
# spacy

# Seed NumPy's global random generator so NumPy-driven randomness is repeatable.
# NOTE(review): only NumPy is seeded here; TensorFlow/Keras presumably keeps its
# own random state, so model training may still vary between runs - confirm.
np.random.seed(42)

In [None]:
# Toy corpus used to illustrate the Keras tokenisation utilities.
text = [
    'today is a cold day',
    'yesterday was sunny',
    'today is really cold',
    'yesterday was sunny but cold',
]

# data representation: learn the word -> integer index mapping from the corpus
tokenizer = Tokenizer(num_words=20, oov_token='<OOV>')
tokenizer.fit_on_texts(text)
print(tokenizer.word_index, end='\n\n')

# vectorize text: every sentence becomes a list of word indices
txt_vectors = tokenizer.texts_to_sequences(text)
print(txt_vectors, end='\n\n')

# bring all sequences to length 10, padding / truncating at the end ('post')
padded_text_vectors = pad_sequences(
    txt_vectors,
    maxlen=10,
    padding='post',
    truncating='post',
)
print(padded_text_vectors)

In [None]:
# Same toy corpus, this time through the Keras TextVectorization layer
# (integer output, vocabulary capped at 100 tokens).
text_vectorization = TextVectorization(max_tokens=100, output_mode="int")
text_vectorization.adapt(text)

# Inspect the vocabulary the layer built from the corpus.
vocab = text_vectorization.get_vocabulary()
print(vocab)

# Encode a sentence that was not part of the corpus.
encoded_sentence = text_vectorization("today was a rainy day")
print(encoded_sentence)


In [None]:
# text pre-processing
def clean_text(text):
    """Clean one piece of raw text with an nlpretext ``Preprocessor`` pipeline.

    The steps run in the order they are piped: lower-casing, URL and e-mail
    replacement, removal of mentions / hashtags / emoji / end-of-line
    characters, English stop-word removal, punctuation removal, number
    replacement, accent removal and finally whitespace normalisation.

    Args:
        text: the raw text to clean.

    Returns:
        Whatever ``Preprocessor.run`` returns for ``text`` (the cleaned text).
    """
    # A new Preprocessor is assembled on every call, so the function carries
    # no shared state when it is dispatched to worker processes.
    preprocessor = Preprocessor()
    preprocessor.pipe(lower_text)
    # URLs and e-mail addresses have to be replaced BEFORE the steps that strip
    # '@', '#' and punctuation: once those characters are gone, the patterns
    # these two steps look for can no longer match.
    preprocessor.pipe(replace_urls)
    preprocessor.pipe(replace_emails)
    preprocessor.pipe(remove_mentions)
    preprocessor.pipe(remove_hashtag)
    preprocessor.pipe(remove_emoji)
    preprocessor.pipe(remove_eol_characters)
    preprocessor.pipe(remove_stopwords, args={'lang': 'en'})
    preprocessor.pipe(remove_punct)
    # NOTE(review): numbers are still replaced after punctuation removal, as in
    # the previous version; decimals such as "3.5" may therefore be seen as two
    # numbers - confirm whether that matters for this dataset.
    preprocessor.pipe(replace_numbers)
    preprocessor.pipe(remove_accents)
    preprocessor.pipe(remove_multiple_spaces_and_strip_text)
    preprocessor.pipe(normalize_whitespace)

    return preprocessor.run(text)

In [None]:
# load yelp review data
df = pd.read_csv('./data/yelp.csv')
print(df.shape)
print(df.columns)
print(df.head())
# review annotations for 5 classes
print(df.stars.value_counts())
# always explore the dataset: number of distinct review texts, followed by
# every row whose text occurs more than once
print(len(df.text.unique()))
print(df.loc[df.text.duplicated(keep=False), ['text', 'stars']])

# dataframe without the duplicated values: keep the first occurrence of each
# text, only the two columns used downstream, and a fresh 0..n-1 index
df = df.loc[~df.text.duplicated(keep='first'), ['text', 'stars']].reset_index(drop=True)
print(df.shape)
# DataFrame.info() writes its summary itself and returns None, so it must not
# be wrapped in print() (that would add a stray "None" line to the output)
df.info()

In [None]:
%%time
# Leave two cores free for the rest of the system, but never ask for fewer
# than one worker: cpu_count() - 2 is 0 on a two-core machine, and joblib
# rejects n_jobs=0.
n_workers = max(1, cpu_count() - 2)

# Clean every review in parallel; iterating the column directly avoids
# building a full row object per review just to read one field.
df['clean_text'] = Parallel(n_jobs=n_workers, backend='multiprocessing')(
    delayed(clean_text)(raw_text) for raw_text in df['text']
)

# Length of each cleaned review in whitespace-separated tokens.
df["text_len"] = df.clean_text.apply(lambda x: len(x.split()))
print(df.describe())

# Keep only reviews with at least 10 tokens left after cleaning.
df = df.loc[df.text_len >= 10, :].reset_index(drop=True)
print(df.describe())

In [None]:
# split the data: one stratified shuffle split on the star rating,
# 30% of the rows going to the test set
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

# n_splits=1, so the splitter yields exactly one (train, test) pair of index arrays
train_index, test_index = next(sss.split(df, df['stars']))
# NOTE(review): .loc is a label lookup; it selects the intended rows only as long
# as df carries a 0..n-1 index (e.g. right after reset_index) - confirm
df_train = df.loc[train_index]
df_test = df.loc[test_index]

# sizes of both parts first, then the relative class frequencies of each
for part in (df_train, df_test):
    print(part.shape)
for part in (df_train, df_test):
    print(part.stars.value_counts() / len(part))

In [None]:
# Pull the model inputs (cleaned text) and targets (stars) out of both splits
# as NumPy arrays.
train_x, train_y = df_train.clean_text.to_numpy(), df_train.stars.to_numpy()
test_x, test_y = df_test.clean_text.to_numpy(), df_test.stars.to_numpy()

# Class counts before encoding.
print(Counter(train_y))
print(Counter(test_y))

# Learn the label encoding from the training targets only, then apply the same
# mapping to both splits.
lbl_encoder = LabelEncoder()
lbl_encoder.fit(train_y)
train_y = lbl_encoder.transform(train_y)
test_y = lbl_encoder.transform(test_y)

# Class counts after encoding.
print(Counter(train_y))
print(Counter(test_y))


In [None]:
# Fresh vectorisation layer for the review data (integer output, at most 700
# tokens); its vocabulary is built from the training texts only.
text_vectorization = TextVectorization(max_tokens=700, output_mode="int")
text_vectorization.adapt(train_x)

# Encode both splits with the adapted layer and report the resulting shapes.
encoded_train_x = text_vectorization(train_x)
encoded_test_x = text_vectorization(test_x)
for encoded in (encoded_train_x, encoded_test_x):
    print(encoded.shape)

In [None]:
# Tokeniser / padding configuration shared by the model cells.
vocab_size = 10000
max_length = 450

# Fit the tokenizer on the training texts only.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='[UNK]')
tokenizer.fit_on_texts(train_x)

# Number of distinct words the tokenizer has indexed.
word_index = tokenizer.word_index
print(len(word_index))

# Turn both splits into sequences of word indices.
train_x_seq = tokenizer.texts_to_sequences(train_x)
test_x_seq = tokenizer.texts_to_sequences(test_x)

# Pad / truncate at the end so every sequence has exactly max_length entries.
pad_options = dict(maxlen=max_length, padding="post", truncating="post")
train_x_pad = pad_sequences(train_x_seq, **pad_options)
test_x_pad = pad_sequences(test_x_seq, **pad_options)

print(train_x_pad.shape)
print(train_y.shape)

In [None]:
# Baseline model: one hidden dense layer applied directly to the padded
# sequences of word indices.
# The input width is taken from max_length (the length the sequences were
# padded to) instead of a hard-coded 450, so the two cannot drift apart.
inputs = keras.Input(shape=(max_length,))
x = layers.Dense(128, activation="relu")(inputs)
x = layers.Dropout(0.5)(x)
# One softmax output per class (5 star ratings).
outputs = layers.Dense(5, activation="softmax")(x)

model = keras.Model(inputs, outputs)

# Sparse loss: the targets are integer class ids, not one-hot vectors.
model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# 30% of the training data is held out for validation during fitting.
model.fit(train_x_pad, train_y, validation_split=0.3, epochs=5, batch_size=64)

In [None]:
# Convolutional model: embedding -> 1D convolution over pairs of neighbouring
# tokens -> global max pooling -> dense classifier.
inputs = keras.Input(shape=(max_length,))
embed_layer = keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=300,
        input_length=max_length
    )(inputs)
x = keras.layers.Conv1D(filters=128, kernel_size=2, activation='relu')(embed_layer)
x = keras.layers.GlobalMaxPooling1D()(x)
x = keras.layers.Dense(64, activation='relu')(x)

# One softmax output per class (5 star ratings).
outputs = keras.layers.Dense(units=5, activation='softmax')(x)

model = keras.Model(inputs, outputs)

# metrics is passed as a list, matching the first model cell; a bare string is
# not the documented form and is not accepted by every Keras version.
model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# 30% of the training data is held out for validation during fitting.
model.fit(train_x_pad, train_y, validation_split=0.3, epochs=5, batch_size=64)

In [None]:
# Recurrent model: embedding -> bidirectional LSTM -> dense classifier.
# The input length is left open (None).
inputs = keras.Input(shape=(None,))
embed_layer = keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=300,
        # index 0 is treated as padding and masked for the layers that follow
        mask_zero=True
    )(inputs)
x = keras.layers.Bidirectional(keras.layers.LSTM(64))(embed_layer)
x = keras.layers.Dense(64, activation='relu')(x)
# One softmax output per class (5 star ratings).
outputs = keras.layers.Dense(units=5, activation='softmax')(x)

model = keras.Model(inputs, outputs)

# metrics is passed as a list, matching the first model cell; a bare string is
# not the documented form and is not accepted by every Keras version.
model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# 30% of the training data is held out for validation during fitting.
model.fit(train_x_pad, train_y, validation_split=0.3, epochs=5, batch_size=64)

In [None]:
# Score `model` on the held-out test split.
# NOTE(review): `model` is whichever model object was assigned most recently -
# presumably the last model cell that was run; confirm before comparing numbers.
test_evals = model.evaluate(test_x_pad, test_y)
print(test_evals)

# Class probabilities for every test review; the predicted label is the
# index of the highest probability in each row.
predictions = model.predict(test_x_pad)
predicted_lbls = predictions.argmax(axis=1)

# Per-class precision / recall / F1 against the true test labels.
cr = classification_report(test_y, predicted_lbls)
print(cr)