In [140]:
# DataFrame
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

#from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
# Word2vec
import gensim

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools

# Configure root logging once for the notebook: INFO level and above,
# formatted as "timestamp : level : message".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)



In [141]:
# TEXT CLEANING
# Raw string literal so the regex escape \S is handed to `re` untouched rather
# than being parsed as an (invalid) Python string escape.
# Alternatives, tried left to right at each position:
#   @\S+           "@" followed by non-whitespace characters
#   https?:\S+     "http:" / "https:" followed by non-whitespace characters
#   http?:\S       "htt:" / "http:" followed by ONE non-whitespace character
#                  NOTE(review): looks like a leftover/typo; kept so the
#                  pattern stays identical to the original
#   [^A-Za-z0-9]+  any run of characters that are not ASCII letters or digits
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# WORD2VEC
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
# NOTE(review): the Word2Vec constructor call in this notebook passes a literal
# min_count=5 instead of this constant -- confirm which value is intended.
W2V_MIN_COUNT = 10

# KERAS
# Presumably consumed by later model-building cells -- TODO confirm.
SEQUENCE_LENGTH = 300
EPOCHS = 8
BATCH_SIZE = 1024

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
SENTIMENT_THRESHOLDS = (0.4, 0.7)

# EXPORT
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

In [290]:
# Read the comma-separated comment file from the working directory using
# pandas' pure-Python parser.
df = pd.read_csv(
    'raffi_niafinal.csv',
    sep=',',
    engine='python',
)

In [291]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,text
0,"Secara tidak langsung, kita bisa membuka mata ..."
1,ang dari awal gak pernah muji Nia Ramadhani ng...
2,nia km.kok gerutu d atas panggung d dpn audien...
3,aku lebih suka ka sandra dewi walau tajir meli...
4,gw dr dulu liat nia jijay....sok iyes..sok kay...


In [292]:
import nltk
import ssl

# Remember the interpreter's default HTTPS context factory so that it can be
# put back once the download is done.
_default_https_context = getattr(ssl, "_create_default_https_context", None)

# Switch HTTPS certificate verification off for the download (presumably to
# work around certificate errors when fetching NLTK data -- TODO confirm this
# is still needed). Interpreters without `_create_unverified_context` are
# left untouched.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

try:
    stopwords_downloaded = nltk.download('stopwords')
finally:
    # Restore the original factory: leaving the unverified context installed
    # would disable certificate checks for every later HTTPS request made
    # from this kernel, not just for the NLTK download.
    if _default_https_context is not None:
        ssl._create_default_https_context = _default_https_context

# Show the download result as the cell output.
stopwords_downloaded

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/shaan/snap/jupyter/6/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [293]:
# Words that must stay in the text: they are subtracted from the stop-word
# set before preprocessing.
important_words = [
    'janganlah',
    'kasus',
    'kurang',
    'lagian',
    'masalah',
    'masalahnya',
    'manalagi',
    'mempersoalkan',
    'mempertanyakan',
    'mereka',
    'bukan',
    'tidak',
    'enggak',
    'bermacam-macam',
    'jangan',
]

In [294]:
# Extra tokens to treat as stop words on top of the NLTK Indonesian list;
# they are merged into the stop-word set with a set union.
# Each token is listed exactly once (the original list repeated 'lu').
new_words = [
    'd', 'tau', 'klo', 'km', 'lg', 'dgn', 'deh', 'sih', 'jg', 'aja',
    'lu', 'iya', 'udah', 'theblues19', 'the', 'gw', 'yg', 'kalo', 'kau',
    'nya', 'ga', 'bs', 'jd',
]

In [295]:
# Start from NLTK's Indonesian stop-word list, held as a set.
indonesian_stopwords = stopwords.words('indonesian')
stop_words = set(indonesian_stopwords)

In [296]:
# Add the extra tokens from `new_words` to the stop-word set.
stop_words = stop_words | set(new_words)

In [297]:
# Take the `important_words` back out so preprocessing keeps them in the text.
stop_words = stop_words - set(important_words)

In [298]:
def preprocess(text, stem=False, stemmer=None):
    """Normalise one raw comment into a space-separated string of kept tokens.

    The text is lower-cased, every match of ``TEXT_CLEANING_RE`` is replaced
    by a space, the result is split on whitespace, and tokens found in
    ``stop_words`` are dropped.

    Args:
        text: Value to clean. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing text.
        stem: When true, every kept token is passed through ``.stem()``.
        stemmer: Optional object exposing ``stem(token)``. When omitted, the
            module-level ``stemmers`` object is used, as before.

    Returns:
        The kept tokens joined by single spaces; ``""`` for missing text or
        when no token survives the filtering.

    Raises:
        NameError: if stemming is requested for a non-empty token list, no
            ``stemmer`` is passed and no global ``stemmers`` is defined.
    """
    # Missing values would otherwise be stringified into the bogus tokens
    # "none" / "nan" and end up in the vocabulary.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    # Remove link,user and special characters
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    tokens = [token for token in cleaned.split() if token not in stop_words]

    if stem and tokens:
        # Prefer the explicitly supplied stemmer; fall back to the global
        # `stemmers` object the original implementation relied on.
        active_stemmer = stemmer if stemmer is not None else stemmers
        tokens = [active_stemmer.stem(token) for token in tokens]

    return " ".join(tokens)

In [299]:
# Clean every entry of the `text` column with preprocess() (stemming stays
# off, its default) and write the result back into the same column.
df["text"] = df["text"].apply(preprocess)

In [300]:
# Inspect up to the first 20 rows after cleaning.
df.head(n=20)

Unnamed: 0,text
0,tidak langsung membuka mata defisini cantik wa...
1,ang gak muji nia ramadhani ngumpul
2,nia gerutu panggung dpn audience malu knapa ra...
3,suka ka sandra dewi tajir melintir humble keli...
4,dr liat nia jijay sok iyes sok kaya suaminya k...
5,cantik doang gak kerja profesional
6,keribetan bajunya lagian diem2 dirumah nyonya ...
7,raffi ma emang pengalaman host handal host dia...
8,suruh pamer kekayaan suaminya bkn kekayaan ya ...
9,ngehost bawain acara alay beda bu nia lihat2 d...


In [301]:
# Hold out 1% of the rows as a test set; the fixed random_state keeps the
# split the same on every run.
df_train, df_test = train_test_split(df, test_size=0.01, random_state=42)

# Report how many rows landed in each partition.
for label, frame in (("TRAIN size:", df_train), ("TEST size:", df_test)):
    print(label, len(frame))

TRAIN size: 325
TEST size: 4


In [302]:
# Tokenise each training text on whitespace: one list of tokens per row.
documents = [comment.split() for comment in df_train["text"]]

In [318]:
# Create an untrained Word2Vec model; the vocabulary is built and the model
# trained in the following cells.
# NOTE(review): min_count is the literal 5 here although the config cell
# defines W2V_MIN_COUNT = 10 -- confirm which value is intended.
w2v_model = gensim.models.Word2Vec(
    size=W2V_SIZE,
    window=W2V_WINDOW,
    min_count=5,
    workers=8,
)


In [319]:
# Scan the tokenised training documents to build the model's vocabulary
# (tokens rarer than the model's min_count are dropped).
w2v_model.build_vocab(sentences=documents)

2021-02-02 16:15:10,273 : INFO : collecting all words and their counts
2021-02-02 16:15:10,274 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-02-02 16:15:10,276 : INFO : collected 1354 word types from a corpus of 3061 raw words and 325 sentences
2021-02-02 16:15:10,276 : INFO : Loading a fresh vocabulary
2021-02-02 16:15:10,277 : INFO : effective_min_count=5 retains 120 unique words (8% of original 1354, drops 1234)
2021-02-02 16:15:10,278 : INFO : effective_min_count=5 leaves 1340 word corpus (43% of original 3061, drops 1721)
2021-02-02 16:15:10,279 : INFO : deleting the raw counts dictionary of 1354 items
2021-02-02 16:15:10,279 : INFO : sample=0.001 downsamples 120 most-common words
2021-02-02 16:15:10,280 : INFO : downsampling leaves estimated 592 word corpus (44.2% of prior 1340)
2021-02-02 16:15:10,280 : INFO : estimated required memory for 120 words and 300 dimensions: 348000 bytes
2021-02-02 16:15:10,281 : INFO : resetting layer weights


In [320]:
# Number of distinct tokens the model kept after vocabulary building.
vocabulary = w2v_model.wv.vocab
words = vocabulary.keys()
vocab_size = len(vocabulary)
print("Vocab size", vocab_size)

Vocab size 120


In [321]:
# Train the embeddings for W2V_EPOCH passes over the tokenised documents.
# The call's return value (a 2-tuple of word counts in the recorded output)
# is displayed as the cell result.
w2v_model.train(
    documents,
    total_examples=len(documents),
    epochs=W2V_EPOCH,
)

2021-02-02 16:15:10,730 : INFO : training model with 8 workers on 120 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=7
2021-02-02 16:15:10,734 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-02-02 16:15:10,735 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-02-02 16:15:10,736 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-02-02 16:15:10,737 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-02-02 16:15:10,737 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-02-02 16:15:10,737 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-02-02 16:15:10,738 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-02-02 16:15:10,738 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-02-02 16:15:10,739 : INFO : EPOCH - 1 : training on 3061 raw words (565 effective words) took 0.0s, 106203 effective wo

2021-02-02 16:15:10,803 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-02-02 16:15:10,803 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-02-02 16:15:10,803 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-02-02 16:15:10,804 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-02-02 16:15:10,804 : INFO : EPOCH - 10 : training on 3061 raw words (580 effective words) took 0.0s, 128568 effective words/s
2021-02-02 16:15:10,808 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-02-02 16:15:10,808 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-02-02 16:15:10,808 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-02-02 16:15:10,809 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-02-02 16:15:10,809 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-02-02 16:15:10,810 : INFO : worker thread fin

2021-02-02 16:15:10,875 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-02-02 16:15:10,876 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-02-02 16:15:10,876 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-02-02 16:15:10,877 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-02-02 16:15:10,877 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-02-02 16:15:10,877 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-02-02 16:15:10,878 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-02-02 16:15:10,878 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-02-02 16:15:10,878 : INFO : EPOCH - 20 : training on 3061 raw words (594 effective words) took 0.0s, 149889 effective words/s
2021-02-02 16:15:10,882 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-02-02 16:15:10,883 : INFO : worker thread fin

2021-02-02 16:15:10,940 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-02-02 16:15:10,940 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-02-02 16:15:10,940 : INFO : EPOCH - 29 : training on 3061 raw words (565 effective words) took 0.0s, 135184 effective words/s
2021-02-02 16:15:10,944 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-02-02 16:15:10,945 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-02-02 16:15:10,945 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-02-02 16:15:10,945 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-02-02 16:15:10,946 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-02-02 16:15:10,946 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-02-02 16:15:10,946 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-02-02 16:15:10,947 : INFO : worker thread fin

(18817, 97952)

In [326]:
# Sanity check: list the nearest neighbours of "attitude" in the trained
# embedding space as (token, similarity) pairs.
w2v_model.wv.most_similar(positive=["attitude"])

[('host', 0.9996014833450317),
 ('nia', 0.9995881915092468),
 ('gak', 0.9995589852333069),
 ('kupas', 0.9995568990707397),
 ('pinter', 0.9995520710945129),
 ('dewi', 0.99955153465271),
 ('pas', 0.9995356798171997),
 ('org', 0.9995273351669312),
 ('ngehost', 0.9995207786560059),
 ('tp', 0.9995201826095581)]

In [308]:
# Display the model's vocabulary mapping (token -> gensim Vocab entry).
# NOTE(review): this prints the whole mapping; consider showing only a
# slice of the keys to keep the notebook output short.
w2v_model.wv.vocab

{'g': <gensim.models.keyedvectors.Vocab at 0x7efe884720b8>,
 'pilih': <gensim.models.keyedvectors.Vocab at 0x7efe88472320>,
 'host': <gensim.models.keyedvectors.Vocab at 0x7efe86bf79e8>,
 'pas': <gensim.models.keyedvectors.Vocab at 0x7efe8816fcf8>,
 'bukan': <gensim.models.keyedvectors.Vocab at 0x7efe8816f978>,
 'cantik': <gensim.models.keyedvectors.Vocab at 0x7efe880f6ef0>,
 'otak': <gensim.models.keyedvectors.Vocab at 0x7efe880f6ac8>,
 'smart': <gensim.models.keyedvectors.Vocab at 0x7efe880f6a90>,
 'banget': <gensim.models.keyedvectors.Vocab at 0x7efe880f6278>,
 'nia': <gensim.models.keyedvectors.Vocab at 0x7efe880f6240>,
 'keliatan': <gensim.models.keyedvectors.Vocab at 0x7efe880f6da0>,
 'mending': <gensim.models.keyedvectors.Vocab at 0x7efe880f6a58>,
 'belajar': <gensim.models.keyedvectors.Vocab at 0x7efe880f67b8>,
 'jgn': <gensim.models.keyedvectors.Vocab at 0x7efe880f6b38>,
 'pamer': <gensim.models.keyedvectors.Vocab at 0x7efe880f6630>,
 'sok': <gensim.models.keyedvectors.Vocab a