In [1]:
import re

# Google cloud
from google.oauth2 import service_account
from google.cloud import bigquery

from gensim.models import Word2Vec

import emoji

In [2]:
# Authenticate against Google Cloud with the service-account key file.
key_path = '../airflow/credentials/future-data-track-1-sapporo.json'
credentials = service_account.Credentials.from_service_account_file(
    key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"]
)

bigquery_client = bigquery.Client(credentials=credentials, project='future-data-track-1')

# Pull the whole sentiment table into a pandas DataFrame.
query = """
SELECT * FROM sapporo_mart.sentiment_analysis
"""
df = bigquery_client.query(query).to_dataframe()

# The query text is no longer needed once the frame is loaded.
del query

In [3]:
# Preview the first rows of the loaded table.
print(df.head())

                                              review  rating  sentiment
0  Apk sangat menganggu, saat lagi nonton anime t...       1          0
1  Skrg ga bisa masukan kode promo untuk byr tagi...       1          0
2  Aplikasi lelet mau daftar akun aja gagal nungg...       1          0
3                             Susah d sowload ny?...       1          0
4  Lupa username,tp cs 3 hari berturut turut zonk...       1          0


In [4]:
# Report how many missing values each column holds. Print explicitly: only a
# cell's last expression is displayed automatically, so a bare expression
# here would be computed and silently discarded.
print(df.isna().sum())

# Drop every row that contains at least one missing value.
df = df.dropna(axis=0)

In [5]:
# Summarize the frame (row count, column dtypes, memory usage) after the null rows were dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2716869 entries, 0 to 2716909
Data columns (total 3 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   review     object
 1   rating     int64 
 2   sentiment  int64 
dtypes: int64(2), object(1)
memory usage: 82.9+ MB


In [6]:
def extract_emojis(sentence):
    """Return every emoji found in ``sentence``, in order of appearance.

    Args:
        sentence: Text to scan for emojis.

    Returns:
        list[str]: One entry per emoji occurrence (duplicates are kept).
    """
    # ``emoji.get_emoji_regexp`` was deprecated in emoji 1.7 and removed in
    # 2.0, so prefer ``emoji.emoji_list`` whenever the installed release
    # provides it.
    if hasattr(emoji, 'emoji_list'):
        return [match['emoji'] for match in emoji.emoji_list(sentence)]
    # Older emoji releases: fall back to the regex-based lookup.
    return re.findall(emoji.get_emoji_regexp(), sentence)

def preprocessing_text(texts):
    """Tokenize one review into lowercase words followed by its emojis.

    Emojis are collected first, because the letter-only filter below would
    otherwise erase them.

    Args:
        texts: Raw review string.

    Returns:
        list: Lowercased alphabetic tokens, then the extracted emojis.
    """
    found_emojis = extract_emojis(texts)
    without_tags = re.sub(r'<.*?>', '', texts)
    letters_only = re.sub(r'[^a-zA-Z]', ' ', without_tags)
    # Only ASCII letters and spaces remain at this point, so a whitespace
    # split yields exactly the non-empty tokens.
    tokens = letters_only.lower().split()

    return tokens + found_emojis

In [7]:
# Tokenize every review; each row becomes a list of words plus emojis.
df['review_cleaned'] = df['review'].apply(preprocessing_text)

print(df.head())

                                              review  rating  sentiment  \
0  Apk sangat menganggu, saat lagi nonton anime t...       1          0   
1  Skrg ga bisa masukan kode promo untuk byr tagi...       1          0   
2  Aplikasi lelet mau daftar akun aja gagal nungg...       1          0   
3                             Susah d sowload ny?...       1          0   
4  Lupa username,tp cs 3 hari berturut turut zonk...       1          0   

                                      review_cleaned  
0  [apk, sangat, menganggu, saat, lagi, nonton, a...  
1  [skrg, ga, bisa, masukan, kode, promo, untuk, ...  
2  [aplikasi, lelet, mau, daftar, akun, aja, gaga...  
3                            [susah, d, sowload, ny]  
4  [lupa, username, tp, cs, hari, berturut, turut...  


In [8]:
# Train word vectors on the tokenized reviews.
embedding = Word2Vec(
    sentences=df['review_cleaned'],
    vector_size=100,
    window=5,
    min_count=2,
    workers=2,
    epochs=100,
)

In [9]:
# Sanity-check the embedding: list the tokens closest to 'developer'.
embedding.wv.most_similar('developer')

[('dev', 0.6985407471656799),
 ('pengembang', 0.690396249294281),
 ('mimin', 0.6532716155052185),
 ('developernya', 0.65157550573349),
 ('pembuat', 0.6131553053855896),
 ('devoloper', 0.5818763971328735),
 ('devloper', 0.5817750692367554),
 ('development', 0.5784115195274353),
 ('pengelola', 0.5748855471611023),
 ('tim', 0.5703144073486328)]

In [10]:
# Persist the trained Word2Vec model to disk.
# NOTE(review): the path is relative to the working directory; presumably the
# target folder must already exist — confirm before a long training run.
embedding.save('Sentiment Analysis/Embedding/w2v_emoji_sw_v4.w2v')

In [11]:
# Sanity-check the embedding: list the tokens closest to "kurir".
embedding.wv.most_similar("kurir")

[('kurirnya', 0.8750789165496826),
 ('expedisi', 0.7901881337165833),
 ('pengirim', 0.7674428224563599),
 ('ekspedisi', 0.7506662011146545),
 ('ekpedisi', 0.7155596017837524),
 ('kulir', 0.6820976138114929),
 ('jne', 0.6645514369010925),
 ('kurirny', 0.658267080783844),
 ('pengirimnya', 0.6571032404899597),
 ('jnt', 0.6560195684432983)]