In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from multiprocessing import cpu_count
from joblib import Parallel, delayed
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.dummy import DummyClassifier

from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import nltk
from nltk.sentiment import SentimentAnalyzer, SentimentIntensityAnalyzer

from nlpretext import Preprocessor
from nlpretext.basic.preprocess import normalize_whitespace, remove_punct, remove_eol_characters, remove_stopwords, \
    lower_text, remove_accents, remove_multiple_spaces_and_strip_text, replace_numbers, replace_emails, replace_urls
from nlpretext.social.preprocess import remove_mentions, remove_hashtag, remove_emoji

# some useful libraries
# spacy

# Seed NumPy's global random number generator so NumPy-driven randomness is repeatable between runs.
np.random.seed(42)

In [3]:
# Data representation: compare raw term counts with TF-IDF weights on a toy corpus.
text = [
    'today is a cold day',
    'yesterday was sunny',
    'today is really cold',
    'yesterday was sunny but cold',
]

# Unigram counts, wrapped in a frame whose columns are the learned terms.
count_vect = CountVectorizer(ngram_range=(1, 1))
doc_term_matrix = count_vect.fit_transform(text)
df_count = pd.DataFrame(
    doc_term_matrix.toarray(),
    columns=count_vect.get_feature_names_out(),
)

# TF-IDF weights for the same sentences, using the vectorizer's default settings.
tfidf_vect = TfidfVectorizer()
doc_term_matrix = tfidf_vect.fit_transform(text)
df_tfidf = pd.DataFrame(
    doc_term_matrix.toarray(),
    columns=tfidf_vect.get_feature_names_out(),
)
print(df_tfidf)

# Terms learned by each vectorizer, in column order.
print(count_vect.get_feature_names_out())
print(tfidf_vect.get_feature_names_out())

# Term -> column index mapping of the count vectorizer.
print(count_vect.vocabulary_)

        but      cold       day        is    really     sunny     today  \
0  0.000000  0.392053  0.614226  0.484263  0.000000  0.000000  0.484263   
1  0.000000  0.000000  0.000000  0.000000  0.000000  0.577350  0.000000   
2  0.000000  0.392053  0.000000  0.484263  0.614226  0.000000  0.484263   
3  0.552816  0.352855  0.000000  0.000000  0.000000  0.435847  0.000000   

        was  yesterday  
0  0.000000   0.000000  
1  0.577350   0.577350  
2  0.000000   0.000000  
3  0.435847   0.435847  
['but' 'cold' 'day' 'is' 'really' 'sunny' 'today' 'was' 'yesterday']
['but' 'cold' 'day' 'is' 'really' 'sunny' 'today' 'was' 'yesterday']
{'today': 6, 'is': 3, 'cold': 1, 'day': 2, 'yesterday': 8, 'was': 7, 'sunny': 5, 'really': 4, 'but': 0}


In [4]:
# text pre-processing
def clean_text(text):
    """Clean a piece of text with an nlpretext ``Preprocessor`` pipeline.

    The operations in ``steps`` are registered on a fresh ``Preprocessor``
    in the listed order and then run on ``text``.

    URL and e-mail replacement is placed ahead of mention removal, hashtag
    removal and punctuation stripping: once characters such as ``@``, ``#``,
    ``:``, ``/`` and ``.`` have been removed, a URL or an address is no
    longer recognisable, so a replacement step running after them has
    nothing left to match.

    NOTE(review): the placeholders inserted by ``replace_urls`` and
    ``replace_emails`` now pass through the later steps (stop-word and
    punctuation removal) - confirm the resulting token form is what the
    downstream vectorizers should see.

    Args:
        text: The raw text to clean.

    Returns:
        Whatever ``Preprocessor.run`` returns for ``text`` after all steps.
    """
    # (operation, keyword arguments) pairs; None means "call with defaults".
    steps = [
        (lower_text, None),
        # Must run while URLs / addresses are still intact (see docstring).
        (replace_urls, None),
        (replace_emails, None),
        (remove_mentions, None),
        (remove_hashtag, None),
        (remove_emoji, None),
        (remove_eol_characters, None),
        (remove_stopwords, {'lang': 'en'}),
        (remove_punct, None),
        (replace_numbers, None),
        (remove_accents, None),
        (remove_multiple_spaces_and_strip_text, None),
        (normalize_whitespace, None),
    ]

    preprocessor = Preprocessor()
    for operation, kwargs in steps:
        if kwargs is None:
            preprocessor.pipe(operation)
        else:
            preprocessor.pipe(operation, args=kwargs)

    return preprocessor.run(text)

# Sample sentences containing end punctuation, an ellipsis and an emoticon.
text = [
    'today is a cold day.',
    'was it sunny?',
    'today is.... really cold',
    'yesterday was sunny but cold :-(',
]

# Clean only the first sentence and show the result.
cleaned_text = clean_text(text[0])
print(cleaned_text)

today cold day


In [5]:
# Load the Yelp review data and take a first look at the raw table.
df = pd.read_csv('./data/yelp.csv')
print(df.shape)
print(df.columns)
print(df.head())

# The star rating is the target: reviews are annotated with one of 5 classes.
print(df.stars.value_counts())

# Always explore the dataset: count the distinct review texts, then list every
# row whose text occurs more than once.
print(len(df.text.unique()))
print(df.loc[df.text.duplicated(keep=False), ['text', 'stars']])

# Keep the first occurrence of each review text and only the two columns used
# from here on; reset the index so it is a clean 0..n-1 range again.
df = df.loc[~df.text.duplicated(keep='first'), ['text', 'stars']].reset_index(drop=True)
print(df.shape)
# info() prints its report itself and returns None, so it is not wrapped in
# print() (that would append a stray "None" line to the output).
df.info()

(10000, 10)
Index(['business_id', 'date', 'review_id', 'stars', 'text', 'type', 'user_id',
       'cool', 'useful', 'funny'],
      dtype='object')
              business_id        date               review_id  stars  \
0  9yKzy9PApeiPPOUJEtnvkg  2011-01-26  fWKvX83p0-ka4JS3dc6E5A      5   
1  ZRJwVLyzEJq1VAihDhYiow  2011-07-27  IjZ33sJrzXqU-0X6U8NwyA      5   
2  6oRAC4uyJCsJl1X0WZpVSA  2012-06-14  IESLBzqUCLdSzSqm0eCSxQ      4   
3  _1QQZuf4zZOyFCvXc0o6Vg  2010-05-27  G-WvGaISbqqaMHlNnByodA      5   
4  6ozycU1RpktNG2-1BroVtw  2012-01-05  1uJFq2r5QfJG_6ExMRCaGw      5   

                                                text    type  \
0  My wife took me here on my birthday for breakf...  review   
1  I have no idea why some people give bad review...  review   
2  love the gyro plate. Rice is so good and I als...  review   
3  Rosie, Dakota, and I LOVE Chaparral Dog Park!!...  review   
4  General Manager Scott Petello is a good egg!!!...  review   

                  user_id  cool  u

In [6]:
%%time
# Clean every review one after another in this process and store the result in a new column.
df['clean_text'] = df['text'].apply(clean_text)
print(df.head())

                                                text  stars  \
0  My wife took me here on my birthday for breakf...      5   
1  I have no idea why some people give bad review...      5   
2  love the gyro plate. Rice is so good and I als...      4   
3  Rosie, Dakota, and I LOVE Chaparral Dog Park!!...      5   
4  General Manager Scott Petello is a good egg!!!...      5   

                                          clean_text  
0  wife birthday breakfast excellent weather perf...  
1  idea people give bad reviews place show gripin...  
2      love gyro plate rice good dig candy selection  
3  rosie dakota love chaparral dog park s conveni...  
4  general manager scott petello good egg detail ...  
CPU times: total: 34.6 s
Wall time: 34.6 s


In [7]:
%%time
# Clean the reviews in parallel worker processes, leaving two cores free.
# cpu_count() - 2 would be 0 on a two-core machine (joblib rejects n_jobs=0) and
# -1 on a single-core machine (which joblib reads as "use all cores"), so the
# worker count is clamped to at least 1.
# The text column is iterated directly: only the text is needed, so building a
# full row object per review with iterrows() is unnecessary work.
df['clean_text'] = Parallel(n_jobs=max(1, cpu_count() - 2), backend='multiprocessing')(
    delayed(clean_text)(raw_text) for raw_text in df['text']
)
print(df.head())

In [None]:
# Display-width toggles, left disabled:
# pd.set_option("max_colwidth", None)
# pd.reset_option("max_colwidth")

# First review before and after cleaning, separated by an empty line.
print(df['text'].iloc[0], '', df['clean_text'].iloc[0], sep='\n')

In [None]:
# Split the data: hold out 30% of the reviews as a test set, stratified on the star rating.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)

# n_splits=1 yields exactly one (train, test) pair, so take it directly instead of looping.
train_index, test_index = next(sss.split(df, df['stars']))

# split() yields positional row numbers, so rows are selected with .iloc.
# Label-based .loc would only give the same rows while the index happens to be a clean 0..n-1 range.
df_train = df.iloc[train_index]
df_test = df.iloc[test_index]

print(df_train.shape)
print(df_test.shape)
# Class proportions in each split; stratification should keep them close to each other.
print(df_train.stars.value_counts()/len(df_train))
print(df_test.stars.value_counts()/len(df_test))

In [None]:
# Bag-of-words: unigram counts with the vocabulary capped at 10000 terms, fitted on the training split.
count_vect = CountVectorizer(ngram_range=(1, 1), max_features=10000)
train_count = count_vect.fit_transform(df_train['clean_text'].to_numpy())
print(len(count_vect.vocabulary_))

# Invert the term -> column mapping so that a column number can be looked up as a term.
vocab_dict = dict(zip(count_vect.vocabulary_.values(), count_vect.vocabulary_.keys()))
print(vocab_dict[3])

In [None]:
# TF-IDF: unigram weights with the vocabulary capped at 10000 terms, fitted on the training split.
tfidf_vect = TfidfVectorizer(ngram_range=(1, 1), max_features=10000)
train_tfidf = tfidf_vect.fit_transform(df_train['clean_text'].to_numpy())
print(len(tfidf_vect.vocabulary_))

# Invert the term -> column mapping so that a column number can be looked up as a term.
vocab_dict = dict(zip(tfidf_vect.vocabulary_.values(), tfidf_vect.vocabulary_.keys()))
print(vocab_dict[3])

In [None]:
# Cleaned text and star ratings of both splits as plain arrays.
train_x, test_x = df_train.clean_text.to_numpy(), df_test.clean_text.to_numpy()
train_y, test_y = df_train.stars.to_numpy(), df_test.stars.to_numpy()

# Class counts before encoding.
print(Counter(train_y))
print(Counter(test_y))

# Encode the ratings as integer labels; the encoder is fitted on the training labels only.
lbl_encoder = LabelEncoder()
train_y = lbl_encoder.fit_transform(train_y)
test_y = lbl_encoder.transform(test_y)

# Class counts after encoding.
print(Counter(train_y))
print(Counter(test_y))

# The pipeline: text vectorization followed by the classifier.
pipeline = Pipeline(
    steps=[
        ("vect", tfidf_vect),
        ("clf", SVC(class_weight='balanced', random_state=42)),
        # Alternative classifier, left disabled:
        # ("clf", RandomForestClassifier(class_weight='balanced', random_state=42)),
    ]
)

# Hyper-parameter grid; the "<step>__<param>" keys address the pipeline steps above.
parameters = dict(
    vect__max_features=(10000, 15000),
    vect__ngram_range=((1, 1), (1, 2)),
    clf__C=(1, 10),
    clf__kernel=("linear", "rbf"),
    # Grid entries for the disabled alternative classifier:
    # clf__n_estimators=[100, 200],
    # clf__max_depth=[3, 5],
    # clf__bootstrap=[True, False],
)

# 3-fold stratified cross-validation with two metrics; the winner is chosen and refitted on macro F1.
cv = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
scoring = dict(bal_accuracy="balanced_accuracy", F1_macro="f1_macro")
grid_search = GridSearchCV(
    pipeline,
    parameters,
    n_jobs=-1,
    verbose=1,
    cv=cv,
    scoring=scoring,
    refit="F1_macro",
    return_train_score=True,
)
grid_search.fit(df_train.clean_text.to_numpy(), train_y)

In [None]:
# One per line: the best score, the parameters that achieved it, and the best estimator.
print(
    grid_search.best_score_,
    grid_search.best_params_,
    grid_search.best_estimator_,
    sep='\n',
)
# Results of every cross-validation run for the different parameter combinations.
cross_val_results = grid_search.cv_results_

In [None]:
# Predict the held-out reviews with the best estimator found by the grid search.
test_predictions = grid_search.best_estimator_.predict(df_test['clean_text'].to_numpy())

# Classification report and confusion matrix against the encoded test labels.
cr_test = classification_report(y_true=test_y, y_pred=test_predictions)
cm_test = confusion_matrix(y_true=test_y, y_pred=test_predictions)

print(cr_test, cm_test, sep='\n')

In [None]:
# Baseline: a classifier that always predicts the most frequent training label.
dummy_clf = DummyClassifier(strategy='most_frequent').fit(train_x, train_y)
y_pred = dummy_clf.predict(test_x)
print(y_pred)