In [1]:
import os
import re
from pathlib import Path

from google.oauth2 import service_account
from google.cloud import bigquery

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import emoji

## Fetching Data

In [2]:
# Service-account key file used to authenticate against Google Cloud.
key_path = '../airflow/credentials/future-data-track-1-sapporo.json'

# Build credentials from the key file, requesting the cloud-platform scope.
credentials = service_account.Credentials.from_service_account_file(
    key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"]
)

# BigQuery client bound to the project; used below to run queries.
bigquery_client = bigquery.Client(credentials=credentials, project='future-data-track-1')

In [3]:
# Select every column of the sapporo_mart.sentiment_analysis table.
query = """
SELECT * FROM sapporo_mart.sentiment_analysis
"""

# Submit the query through the BigQuery client, then materialize the
# result as a DataFrame for the cleaning steps that follow.
query_job = bigquery_client.query(query)
df = query_job.to_dataframe()

In [4]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis='index', how='any')

## Cleaning

In [5]:
def extract_emojis(sentence):
    """Return every emoji found in ``sentence``, in order of appearance.

    ``emoji.get_emoji_regexp()`` was deprecated in emoji 1.7 and removed in
    emoji 2.0, so relying on it alone raises ``AttributeError`` on current
    releases. ``emoji.emoji_list()`` is used when the installed package
    provides it; the regexp helper is kept only as a fallback for older
    installations.

    Args:
        sentence: Text to scan for emoji.

    Returns:
        A list of emoji strings (empty when the text contains none).
    """
    if hasattr(emoji, 'emoji_list'):
        # emoji_list() yields one dict per match; only the emoji text is needed.
        return [match['emoji'] for match in emoji.emoji_list(sentence)]

    # Legacy path for emoji releases that predate emoji_list().
    return re.findall(emoji.get_emoji_regexp(), sentence)

In [6]:
def preprocessing_text(texts):
    """Turn one raw review string into a list of tokens.

    Emoji are collected from the untouched input first, because the
    letter-only filter below would otherwise erase them. The text is then
    stripped of ``<...>`` tags, every non-ASCII-letter character becomes a
    space, and the remaining words are lower-cased.

    Args:
        texts: A single review as a string.

    Returns:
        The lower-cased words followed by the emoji found in the input.
    """
    found_emojis = extract_emojis(texts)

    without_tags = re.sub(r'<.*?>', '', texts)
    letters_only = re.sub(r'[^a-zA-Z]', ' ', without_tags)

    # Splitting on single spaces leaves empty strings for runs of spaces;
    # those are skipped here.
    words = [piece.lower() for piece in letters_only.split(" ") if piece]

    return words + found_emojis

In [7]:
# Tokenize each raw review; every row receives the list returned by
# preprocessing_text.
df['review_cleaned'] = df['review'].apply(preprocessing_text)

In [8]:
# Preview the first rows to sanity-check the new review_cleaned column.
df.head(n=5)

Unnamed: 0,review,rating,sentiment,review_cleaned
0,"Apk sangat menganggu, saat lagi nonton anime t...",1,0,"[apk, sangat, menganggu, saat, lagi, nonton, a..."
1,Skrg ga bisa masukan kode promo untuk byr tagi...,1,0,"[skrg, ga, bisa, masukan, kode, promo, untuk, ..."
2,Aplikasi lelet mau daftar akun aja gagal nungg...,1,0,"[aplikasi, lelet, mau, daftar, akun, aja, gaga..."
3,Susah d sowload ny?...,1,0,"[susah, d, sowload, ny]"
4,"Lupa username,tp cs 3 hari berturut turut zonk...",1,0,"[lupa, username, tp, cs, hari, berturut, turut..."


In [9]:
# Keras tokenizer; num_words limits how many of the most frequent words are
# kept when texts are converted to sequences (see the Keras Tokenizer docs).
tokenizer = Tokenizer(num_words=50_000)

In [None]:
# Token lists produced by the cleaning step.
cleaned_reviews = df['review_cleaned']

# Build the word index from the reviews, then encode each review as a
# sequence of integer ids.
tokenizer.fit_on_texts(cleaned_reviews)
seq = tokenizer.texts_to_sequences(cleaned_reviews)

# Bring every sequence to length 120, padding at the end. `truncating` is
# left at its default ('pre' per the Keras docs), so longer reviews keep
# their last 120 tokens.
X = pad_sequences(seq, maxlen=120, padding='post')

In [None]:
# Destination of the serialized tokenizer.
path = 'Sentiment Analysis/Tokenizer/tokens_50k_120.json'

# Make sure the target directory exists before writing.
Path(path).parent.mkdir(parents=True, exist_ok=True)

# Persist the fitted tokenizer as JSON so it can be reloaded later.
with open(path, 'w+') as _f:
    _f.write(tokenizer.to_json())