# Sentiment analysis

In [1]:
import numpy as np
import pandas as pd
from cytoolz import *
from tqdm.auto import tqdm

tqdm.pandas()

In [2]:
df = pd.read_parquet('s3://ling583/sentiment.parquet', storage_options={'anon': True})

-----

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
train, test = train_test_split(
    df, test_size=0.2, stratify=df["sentiment"], random_state=619
)

In [5]:
train.head()

Unnamed: 0,title,text,date_stayed,date,service,cleanliness,overall,value,location,sleep_quality,rooms,locality,name,sentiment
30367,“The Place to Stay in Manhattan! Simply Superb!”,I have been to New york before and stayed in t...,December 2008,2009-06-18,4.0,5.0,5.0,5.0,5.0,,5.0,New York City,Wingate by Wyndham Manhattan Midtown,good
33593,"“Howard Johnson, Phoenix”",The hotel is located one mile from Sky Harbor ...,January 2012,2012-01-17,4.0,3.0,3.0,3.0,5.0,2.0,3.0,Phoenix,Howard Johnson Phoenix Airport/Downtown Area,bad
34560,“Pier 5 hotel was unlike any other hotel we ha...,Most hotels and hotel rooms look the same but ...,September 2009,2009-10-09,4.0,5.0,4.0,4.0,5.0,,5.0,Baltimore,Pier 5 Hotel,good
44168,"“New York's best kept secret...well, not so se...","Excellent rooms, wonderful service......value ...",July 2011,2011-09-04,5.0,5.0,5.0,5.0,,5.0,,New York City,On The Ave Hotel,good
49971,“Cheerful Location-Best Value.”,We recently stayed for 5 nights in Hotel 140. ...,March 2012,2012-03-23,4.0,4.0,4.0,5.0,5.0,4.0,4.0,Boston,Hotel 140,good


In [6]:
train["sentiment"].value_counts()

good    29287
bad     10713
Name: sentiment, dtype: int64

#### Tokenize the review text

In [7]:
import spacy

nlp = spacy.load(
    "en_core_web_sm",
    exclude=["tagger", "parser", "ner", "lemmatizer", "attribute_ruler"],
)


def tokenize(text):
    doc = nlp.tokenizer(text)
    return [t.norm_ for t in doc if t.is_alpha]

In [8]:
import multiprocessing as mp

In [9]:
with mp.Pool() as p:
    train["tokens"] = list(p.imap(tokenize, tqdm(train["text"]), chunksize=100))
    test["tokens"] = list(p.imap(tokenize, tqdm(test["text"]), chunksize=100))

  0%|          | 0/40000 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["tokens"] = list(p.imap(tokenize, tqdm(train["text"]), chunksize=100))


  0%|          | 0/10000 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["tokens"] = list(p.imap(tokenize, tqdm(test["text"]), chunksize=100))


In [10]:
train.head()

Unnamed: 0,title,text,date_stayed,date,service,cleanliness,overall,value,location,sleep_quality,rooms,locality,name,sentiment,tokens
30367,“The Place to Stay in Manhattan! Simply Superb!”,I have been to New york before and stayed in t...,December 2008,2009-06-18,4.0,5.0,5.0,5.0,5.0,,5.0,New York City,Wingate by Wyndham Manhattan Midtown,good,"[i, have, been, to, new, york, before, and, st..."
33593,"“Howard Johnson, Phoenix”",The hotel is located one mile from Sky Harbor ...,January 2012,2012-01-17,4.0,3.0,3.0,3.0,5.0,2.0,3.0,Phoenix,Howard Johnson Phoenix Airport/Downtown Area,bad,"[the, hotel, is, located, one, mile, from, sky..."
34560,“Pier 5 hotel was unlike any other hotel we ha...,Most hotels and hotel rooms look the same but ...,September 2009,2009-10-09,4.0,5.0,4.0,4.0,5.0,,5.0,Baltimore,Pier 5 Hotel,good,"[most, hotels, and, hotel, rooms, look, the, s..."
44168,"“New York's best kept secret...well, not so se...","Excellent rooms, wonderful service......value ...",July 2011,2011-09-04,5.0,5.0,5.0,5.0,,5.0,,New York City,On The Ave Hotel,good,"[excellent, rooms, wonderful, service, value, ..."
49971,“Cheerful Location-Best Value.”,We recently stayed for 5 nights in Hotel 140. ...,March 2012,2012-03-23,4.0,4.0,4.0,5.0,5.0,4.0,4.0,Boston,Hotel 140,good,"[we, recently, stayed, for, nights, in, hotel,..."


### TfidfTransformer 

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import make_pipeline

In [12]:
sgd = make_pipeline(CountVectorizer(analyzer=identity), TfidfTransformer(), SGDClassifier())
sgd.fit(train["tokens"], train["sentiment"])
predicted = sgd.predict(test["tokens"])
print(classification_report(test["sentiment"], predicted))

              precision    recall  f1-score   support

         bad       0.87      0.74      0.80      2678
        good       0.91      0.96      0.93      7322

    accuracy                           0.90     10000
   macro avg       0.89      0.85      0.87     10000
weighted avg       0.90      0.90      0.90     10000



Initially, the baseline model have a really good F1 scores for accuracy (.90), marco average (.87), and weighted average (.90). This model seems to be really good already. However, we can do hyperparameter to see if we can get higher F1 scores. 

#### Hyperparameter search

In [16]:
import mlflow
from dask_ml.model_selection import RandomizedSearchCV
from logger import log_search
from scipy.stats.distributions import loguniform, randint, uniform

In [14]:
from warnings import simplefilter

simplefilter(action="ignore", category=FutureWarning)

In [17]:
from dask.distributed import Client

client = Client("tcp://127.0.0.1:38723")
client

0,1
Client  Scheduler: tcp://127.0.0.1:38723  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.62 GB


In [18]:
mlflow.set_experiment("lab-8")
sgd = make_pipeline(
    CountVectorizer(analyzer=identity), TfidfTransformer(), SGDClassifier()
)

INFO: 'lab-8' does not exist. Creating a new experiment


In [19]:
%%time

search = RandomizedSearchCV(
    sgd,
    {
        "countvectorizer__min_df": randint(1, 20),
        "countvectorizer__max_df": uniform(0.5, 0.5),
        "tfidftransformer__use_idf": [True, False],
        "sgdclassifier__alpha": loguniform(1e-6, 1e-2),
    },
    n_iter=50,
    scoring="f1_macro",
)
search.fit(train["tokens"], train["sentiment"])
log_search(search)

CPU times: user 10.6 s, sys: 1.32 s, total: 11.9 s
Wall time: 3min 27s


### Final Classifier

In [21]:
sgd = make_pipeline(
    CountVectorizer(analyzer=identity, min_df=2, max_df=0.87),
    TfidfTransformer(use_idf=True),
    SGDClassifier(alpha=6.14e-05),
)
sgd.fit(train["tokens"], train["sentiment"])
predicted = sgd.predict(test["tokens"])
print(classification_report(test["sentiment"], predicted))

              precision    recall  f1-score   support

         bad       0.87      0.74      0.80      2678
        good       0.91      0.96      0.93      7322

    accuracy                           0.90     10000
   macro avg       0.89      0.85      0.87     10000
weighted avg       0.90      0.90      0.90     10000



After optimizing and using the best parameters that it offers, F1 scores from the optimized model are the same as the original baseline model. Therefore, we will go ahead and use our baseline model to be our final classifier for setiment. 