# Sentiment Analysis Model

change current directory and import packages

In [1]:
%load_ext lab_black
import os
import numpy as np

In [2]:
if not os.path.exists("/sentiment_analysis"):
    os.chdir("..")

import packages

In [3]:
from sentiment_analysis.utils.train_test_split import TrainTestSplit
from sentiment_analysis.models.model import StreamlinedModel
from sentiment_analysis.features.word_frequencies import WordFrequencyVectorizer
from sentiment_analysis.data.review_processor import ReviewProcessor
import lightgbm as lgb

Perform train test split on the reviews data

In [4]:
X_train, y_train, X_test, y_test = TrainTestSplit().get_split_data()

Build lightGBM model and train using the training data

In [8]:
lightgbm = StreamlinedModel(
    transformer_description="word frequency vector",
    transformer=WordFrequencyVectorizer,
    model_description="LightGBM model",
    model=lgb.LGBMClassifier,
    model_params={
        "application": "binary",
        "objective": "binary",
        "metric": "auc",
        "is_unbalance": "false",
        "boosting": "gbdt",
        "num_leaves": 31,
        "feature_fraction": 0.06,
        "bagging_fraction": 0.67,
        "bagging_freq": 1,
        "learning_rate": 0.05,
        "verbose_eval": 0,
        "n_estimators": 2000,
        "n_jobs": 6,
    },
)

train the streamlined model

In [None]:
lightgbm.train(X_train, y_train)

get the predictions

In [None]:
y_pred = lightgbm.predict(X_test)

In [7]:
# X_pred_train = lightgbm.predict(X_train)

In [8]:
sum(y_train)

2680

X_pred_train and X_pred are all 0s, why? We can have a look with the custom transformer

In [9]:
# wf = WordFrequencyVectorizer()
# # X_train_transformed = wf.transform(X_train)

In [10]:
# X_test_transformed = wf.transform(X_test)

In [11]:
parameters =
lgbm = lgb.LGBMClassifier(**parameters)

Try built in CountVectorizer

In [12]:
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

In [23]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
def tokenize_sentence(s):
    """ build customer tokenizer by lower case, lemmatize and remove stopwords """
    # lower case
    s = s.lower()
    # split string into words (tokens)
    tokens = nltk.tokenize.word_tokenize(s)
    # remove short words, they're probably not useful
    tokens = [t for t in tokens if len(t) > 2]
    # put words into base form
    tokens = [wordnet_lemmatizer.lemmatize(t) for t in tokens]
    # remove stopwords
    tokens = [t for t in tokens if t not in stopwords.words("english")]
    return " ".join(tokens)

In [24]:
X_train = [tokenize_sentence(i) for i in X_train]

In [25]:
vectorizer = TfidfVectorizer(max_features = 5000)
X = vectorizer.fit_transform(X_train)
# X_test = vectorizer.transform(X_test)

In [26]:
lgbm.fit(X.toarray(), y_train)

LGBMClassifier(application='binary', bagging_fraction=0.67, bagging_freq=1,
               boosting='gbdt', feature_fraction=0.06, is_unbalance='false',
               learning_rate=0.05, metric='auc', n_estimators=2000, n_jobs=6,
               objective='binary', verbose_eval=0)

In [27]:
lr = LogisticRegression()
lr.fit(X.toarray(), y_train)

LogisticRegression()

In [28]:
y_pred = lgbm.predict(X.toarray())

In [29]:
lr.score(X, y_train)

0.5

We still are only able to get 0.5 AUC, which equivalent to random guessing. Possible reason would be the high level of sparsity existed in the dataset. Most of the reviews are less than 20 words, while the dimensionality of the features after count vectorizer would be at least 20000. Therefore, most of the values in a single vector will be 0s, potentially making lightGBM very hard to fit.

In [20]:
roc_auc_score(y_train, y_pred)

0.5

In [21]:
y_train

array([1, 1, 1, ..., 0, 0, 0])