In [1]:
import os
import re
from pathlib import Path
from joblib import dump

import numpy as np
import pandas as pd

# Visualization
import seaborn as sns

# Google cloud
from google.oauth2 import service_account
from google.cloud import bigquery

from gensim.models.word2vec import Word2Vec

from keras.preprocessing.text import tokenizer_from_json
from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

from util.text_preprocessing import StopWordRemoverFactory



In [2]:
# Build service-account credentials from the JSON key file at `key_path`
# (relative path, so it resolves against the notebook's working directory),
# requesting the cloud-platform OAuth scope.
key_path = '../airflow/credentials/future-data-track-1-sapporo.json'
credentials = service_account.Credentials.from_service_account_file(
    key_path,
    scopes=["https://www.googleapis.com/auth/cloud-platform"]
)

# BigQuery client bound to the 'future-data-track-1' project; used by the
# query cell below.
bigquery_client = bigquery.Client(
    project='future-data-track-1',
    credentials=credentials
)

In [3]:
# Select every column of the `sapporo_mart.topic_modelling` table.
# NOTE(review): `SELECT *` with no LIMIT pulls the whole table — confirm the
# table is small enough for that.
query = """
SELECT * FROM `future-data-track-1.sapporo_mart.topic_modelling`;
"""

# Submit the query and load its result into `df`.
query_job = bigquery_client.query(query)
df = query_job.to_dataframe()

In [4]:
# Preview the first rows of the loaded reviews.
df.head()

Unnamed: 0,review_id,review,rating,created_date
0,gp:AOqpTOHajl5LbNjh0p68FTNBcucXHLImtaMplr41M46...,Tolong lah sehari saja berhenti memberikan not...,1,2020-11-09 11:31:09+00:00
1,gp:AOqpTOE9wwn04N2P0KF0lHaitBTIsIt-m0vScdPZqRg...,Ngiklan mulu,1,2021-04-18 12:47:35+00:00
2,gp:AOqpTOEXVM0HpOd0hh2KKQ2npUye4nYaN9YCBTt3lq6...,Be be be,1,2021-04-25 17:48:18+00:00
3,gp:AOqpTOHdVMTWvbJ-6mTQ2q8DLDwOU7Re7-GS9uICWoj...,Iklan asu,1,2021-08-12 08:04:34+00:00
4,gp:AOqpTOFkWvobwA7CgtroIIy-mrzSWYtzvBtU5SXWKPj...,Pengiriman lama. Barang yang di kirim banyak y...,1,2021-05-19 15:51:43+00:00


In [5]:
# Create a stop-word remover through the project's StopWordRemoverFactory.
# NOTE(review): the stop-word list and its language live in
# util.text_preprocessing and are not visible here — confirm they match the
# review language.
_sw_remover = StopWordRemoverFactory().create_stop_word_remover()

# Expose the bound `remove` method as a plain callable for preprocess().
sw_remover = _sw_remover.remove

In [6]:
def preprocess(value):
    """Turn one raw review string into a list of lowercase word tokens.

    Steps, in order:
      1. drop anything that looks like an HTML/XML tag (``<...>``),
      2. replace every character outside ``a-z`` / ``A-Z`` with a space,
      3. collapse runs of two or more whitespace characters into one space,
      4. pass the text through the module-level ``sw_remover`` callable,
      5. split on single spaces, discard empty pieces, lowercase the rest.

    Lowercasing is applied only after ``sw_remover`` has run, so that
    callable receives the text in its original case. No stemming is done.

    Parameters
    ----------
    value : str
        Raw review text.

    Returns
    -------
    list of str
        Lowercased tokens; may be empty.
    """
    without_tags = re.sub(r'<.*?>', '', value)
    letters_only = re.sub(r'[^a-zA-Z]', ' ', without_tags)
    single_spaced = re.sub(r'\s\s+', ' ', letters_only)
    filtered = sw_remover(single_spaced)
    # Splitting on a literal space can yield '' at the edges; skip those.
    return [piece.lower() for piece in filtered.split(" ") if piece]

In [7]:
# Treat empty-string reviews as missing so the dropna() below removes them.
# Series.replace returns a new Series instead of modifying the column in
# place, so its result must be assigned back; otherwise empty reviews stay.
df['review'] = df['review'].replace('', float("NaN"))
df.dropna(subset=["review"], inplace=True)

In [8]:
# Tokenize every review with preprocess(); each entry becomes a list of words.
texts = df['review'].apply(preprocess)

In [9]:
# pipeline_kmeans = Pipeline([('tfidf', TfidfVectorizer()),
#                     ('kmeans', KMeans())])

In [10]:
# pipeline_kmeans.fit(texts)

In [11]:
# Rebuild the tokenizer from its JSON dump. The context manager closes the
# file even when reading or parsing raises, which the manual open()/close()
# pair did not guarantee.
with open('Sentiment Analysis/Tokenizer/tokens_30k.json', 'r') as tokenizer_file:
    tokenizer = tokenizer_from_json(tokenizer_file.read())

In [12]:
# padding
seq = tokenizer.texts_to_sequences(df['review'])
X = pad_sequences(seq, maxlen=120, padding='post')

In [None]:
# Location of the saved Word2Vec model, relative to the notebook's working
# directory.
EMBEDDING_FILEPATH = 'Sentiment Analysis/Embedding/w2v_emoji_sw_v4.w2v'

In [None]:
# Load the saved Word2Vec model from disk.
# NOTE(review): gensim's load() is understood to unpickle the file, so only
# load model files from a trusted source — confirm provenance of this file.
embedding = Word2Vec.load(EMBEDDING_FILEPATH)

In [None]:
def get_weight_matrix(embedding: "Word2Vec", vocab):
    """Build an embedding lookup table aligned with a vocabulary's indices.

    Row ``i`` of the result holds the embedding vector of the word that
    ``vocab`` maps to ``i``. Words the embedding does not know keep an
    all-zero row.

    The matrix width is read from ``embedding.wv.vector_size`` rather than
    being fixed at 100. Only ``KeyError`` (unknown word) is skipped; with the
    previous bare ``except`` a width mismatch was swallowed as well and
    silently produced an all-zero matrix.

    Parameters
    ----------
    embedding : Word2Vec
        Any object exposing ``wv.vector_size`` and ``wv.get_vector(word)``.
    vocab : dict
        Mapping of word -> integer row index. Assumed to use indices
        1..len(vocab) (row 0 is left as zeros) — TODO confirm against the
        tokenizer that produces it.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(vocab) + 1, embedding.wv.vector_size)``.

    Raises
    ------
    IndexError
        If ``vocab`` contains an index outside ``0..len(vocab)``.
    """
    vocab_size = len(vocab) + 1

    weight_matrix = np.zeros((vocab_size, embedding.wv.vector_size))

    for word, i in vocab.items():
        try:
            weight_matrix[i] = embedding.wv.get_vector(word)
        except KeyError:
            # Word is absent from the embedding: leave its row as zeros.
            continue

    return weight_matrix

In [None]:
# Lookup table of embedding vectors, indexed by the tokenizer's word ids.
embedding_vectors = get_weight_matrix(embedding, tokenizer.word_index)

In [None]:
def get_vectors(embedding_vectors, tokens):
    """Look up the embedding row for every token id in ``tokens``.

    Ids that cannot be used to index ``embedding_vectors`` (``IndexError``,
    e.g. an id past the last row) are skipped, so the result may be shorter
    than ``tokens``. Any other exception propagates; the previous bare
    ``except`` also swallowed ``KeyboardInterrupt`` and genuine bugs.

    Parameters
    ----------
    embedding_vectors : sequence or numpy.ndarray
        Lookup table indexed by token id.
    tokens : iterable of int
        Token ids to look up.

    Returns
    -------
    list
        The rows of ``embedding_vectors`` for the ids that could be looked
        up, in input order.
    """
    vectors = []

    for token_id in tokens:
        try:
            vectors.append(embedding_vectors[token_id])
        except IndexError:
            # Best effort: drop ids that have no row in the table.
            continue

    return vectors

In [None]:
# Replace each padded id sequence with the list of its embedding rows.
# NOTE(review): this rebinds `X` from the padded id sequences to lists of
# vectors, so the cell is not safe to re-run on its own — a second run would
# pass the vectors back in as "token ids". Re-run the padding cell first.
X = [get_vectors(embedding_vectors, tokens) for tokens in X]

In [None]:
# model = KMeans()