# Sentiment Analysis

In [1]:
import os
import re
from joblib import load, dump

import numpy as np
import pandas as pd

# Visualization
import seaborn as sns

from google.oauth2 import service_account
from google.cloud import bigquery

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

from tensorflow import keras
from keras.preprocessing.text import tokenizer_from_json
from keras.preprocessing.sequence import pad_sequences

from util.model_helper import Model, ModelLoader
from util.ensemble_helper import ModelConcatenate, VotingEnsemble, \
    StackingEnsemble

## Config

In [2]:
sklearn_loader = load
keras_loader = keras.models.load_model

In [3]:
BASE_DIR = os.path.join('Sentiment Analysis')

In [4]:
# Fetch saved model if needed
# UNCOMMENT IF MODEL NOT EXISTS
#!wget -O ./'Sentiment Analysis'/ignore/Baseline_RF_Sentiment_4.0.E/saved_model1.joblib https://storage.googleapis.com/dionricky-static/baseline_rf_4.0.E.joblib

In [5]:
dt_config = {
    "name": "Decission Tree",
    "shortname": "DT",
    "path": os.path.join(BASE_DIR, "Baseline",
                        "Baseline_DT_Sentiment_4.0.E", "saved_model.joblib"),
    "version": "4.0.E",
    "loader": sklearn_loader
}

rf_config = {
    "name": "Random Forest",
    "shortname": "RF",
    "path": os.path.join(BASE_DIR, "ignore",
                        "Baseline_RF_Sentiment_4.0.E", "saved_model.joblib"),
    "version": "4.0.E",
    "loader": sklearn_loader
}

lr_config = {
    "name": "Logistic Regression",
    "shortname": "LR",
    "path": os.path.join(BASE_DIR, "Baseline",
                        "Baseline_LR_Sentiment_4.0.E", "saved_model.joblib"),
    "version": "4.0.E",
    "loader": sklearn_loader
}

nb_config = {
    "name": "Naive Bayes",
    "shortname": "NB",
    "path": os.path.join(BASE_DIR, "Baseline",
                        "Baseline_NB_Sentiment_4.0.E", "saved_model.joblib"),
    "version": "4.0.E",
    "loader": sklearn_loader
}

nc_config = {
    "name": "Nearest Centroid",
    "shortname": "NC",
    "path": os.path.join(BASE_DIR, "Baseline",
                        "Baseline_NC_Sentiment_4.0.E", "saved_model.joblib"),
    "version": "4.0.E",
    "loader": sklearn_loader
}

svm_config = {
    "name": "Support Vector Machine",
    "shortname": "SVM",
    "path": os.path.join(BASE_DIR, "Baseline",
                        "Baseline_SVM_Sentiment_4.0.E", "saved_model.joblib"),
    "version": "4.0.E",
    "loader": sklearn_loader
}

knn_config = {
    "name": "K Nearest Neighbor",
    "shortname": "KNN",
    "path": os.path.join(BASE_DIR, "Baseline",
                        "Baseline_KNN_Sentiment_4.0.E", "saved_model.joblib"),
    "version": "4.0.E",
    "loader": sklearn_loader
}

lstm_config = {
    "name": "Long Short-term Memory",
    "shortname": "LSTM",
    "path": os.path.join(BASE_DIR, "LSTM", "LSTM_Sentiment_4.0.E"),
    "version": "4.0.E",
    "loader": keras_loader
}

cnn_config = {
    "name": "Convolutional Neural Network",
    "shortname": "CNN",
    "path": os.path.join(BASE_DIR, "CNN", "CNN_Sentiment_5.0"),
    "version": "5.0.E",
    "loader": keras_loader
}

## Loading Tokenizer

In [6]:
tokenizer_path = os.path.join(BASE_DIR, 'Tokenizer', 'tokens_30k.json')
tokenizer_file = open(tokenizer_path, 'r')

TOKENIZER = tokenizer_from_json(tokenizer_file.read())

tokenizer_file.close()

## Utils

In [7]:
def extract_emojis(sentence):
    return [word for word in sentence.split() if str(word.encode('unicode-escape'))[2] == '\\' ]

In [8]:
def preprocessing_text(text):
    if type(text) == list \
        or type(text) == np.ndarray \
        or type(text) == pd.Series:
        return [preprocessing_text(t) for t in text]

    emojis = ' '.join(extract_emojis(text))
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = text.lower()

    return text + emojis

In [9]:
def keras_preprocess(texts, vector_size=120):
    texts = texts if type(texts) != str else [texts]
    texts = preprocessing_text(texts)
    seq = TOKENIZER.texts_to_sequences(texts)
    return pad_sequences(seq, maxlen=vector_size, padding='post')

def sklearn_preprocess(texts):
    texts = texts if type(texts) != str else [texts]
    texts = preprocessing_text(texts)
    return texts

## Loading Models

In [10]:
# Load Models
DT = ModelLoader(**dt_config)
DT = Model(
        model=DT.load(),
        pred_func= lambda m, x: m.predict_proba(sklearn_preprocess(x)),
        **DT.get_config())

RF = ModelLoader(**rf_config)
RF = Model(
        model=RF.load(),
        pred_func= lambda m, x: m.predict_proba(sklearn_preprocess(x)),
        **RF.get_config())

LR = ModelLoader(**lr_config)
LR = Model(
        model=LR.load(),
        pred_func= lambda m, x: m.predict_proba(sklearn_preprocess(x)),
        **LR.get_config())

NB = ModelLoader(**nb_config)
NB = Model(
        model=NB.load(),
        pred_func= lambda m, x: m.predict_proba(sklearn_preprocess(x)),
        **NB.get_config())

NC = ModelLoader(**nc_config)
NC = Model(
        model=NC.load(),
        pred_func=lambda m, x: m.predict(sklearn_preprocess(x)),
        calc_proba_func=lambda y: np.array([1-y, y]).transpose(),
        **NC.get_config())

SVM = ModelLoader(**svm_config)
SVM = Model(
        model=SVM.load(),
        pred_func=lambda m, x: m.predict(sklearn_preprocess(x)),
        calc_proba_func=lambda y: np.array([1-y, y]).transpose(),
        **SVM.get_config())

KNN = ModelLoader(**knn_config)
KNN = Model(
        model=KNN.load(),
        pred_func=lambda m, x: m.predict_proba(sklearn_preprocess(x)),
        **KNN.get_config())

LSTM = ModelLoader(**lstm_config)
LSTM = Model(
        model=LSTM.load(),
        pred_func= lambda m, x: m.predict( keras_preprocess(x), workers=2 ),
        post_process=lambda y: y.reshape(y.shape[0]),
        calc_proba_func=lambda y: np.array([1-y, y]).transpose(),
        **LSTM.get_config())

CNN = ModelLoader(**cnn_config)
CNN = Model(
        model=CNN.load(),
        pred_func=lambda m, x: m.predict( keras_preprocess(x), workers=2 ),
        post_process=lambda y: y.reshape(y.shape[0]),
        calc_proba_func=lambda y: np.array([1-y, y]).transpose(),
        **CNN.get_config())

## Constructing Ensemble Model

In [11]:
concat = ModelConcatenate([DT, RF, LR, NB, NC, SVM, KNN, LSTM, CNN])

In [12]:
ensemble = VotingEnsemble(concat)

## Make Prediction

### Fetch Data

In [13]:
key_path = '../airflow/credentials/future-data-track-1-sapporo.json'
credentials = service_account.Credentials.from_service_account_file(
    key_path,
    scopes=["https://www.googleapis.com/auth/cloud-platform"]
)

bigquery_client = bigquery.Client(
    project='future-data-track-1',
    credentials=credentials
)

In [14]:
years = [2015, 2016, 2017, 2018, 2019, 2020, 2022] # 2021 is skipped for now

In [15]:
import time

In [16]:
schema = [
    bigquery.SchemaField("review_id", "STRING", "NULLABLE"),
    bigquery.SchemaField("review", "STRING", "NULLABLE"),
    bigquery.SchemaField("sentiment", "STRING", "NULLABLE")
]

dataset_ref = bigquery.DatasetReference('future-data-track-1', 'sapporo_mart')
table_ref = bigquery.TableReference(dataset_ref, 'sentiment_analysis')

table = bigquery.Table(table_ref, schema=schema)
#table = bigquery_client.create_table(table, exists_ok=True)

In [35]:
query = """
SELECT
    review_id,
    review
FROM
    `future-data-track-1.sapporo_mart.app_reviews`
WHERE EXTRACT(YEAR FROM created_date) = 2021 AND EXTRACT(MONTH FROM created_date) >= 9;
"""

query_job = bigquery_client.query(query)
df = query_job.to_dataframe()

df = df.dropna(axis=0)

y = ensemble.predict(df['review'])

df['sentiment'] = y

In [36]:
bigquery_client.insert_rows_from_dataframe(
    table,
    df
)

[[],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
