In [2]:
# DataFrame
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

#from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
# Word2vec
import gensim

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools

# Configure root logging at INFO level with a "timestamp : level : message"
# format; the gensim progress lines printed by later cells use this format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)



In [3]:
# TEXT CLEANING
# Pattern whose matches preprocess() replaces with a space: "@..." mentions,
# http(s) links, and any run of characters other than ASCII letters/digits.
# Written as a raw string so that `\S` is not parsed as an (invalid) string
# escape sequence; the resulting pattern is unchanged.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# WORD2VEC
W2V_SIZE = 300       # embedding dimensionality passed to Word2Vec
W2V_WINDOW = 7       # context window passed to Word2Vec
W2V_EPOCH = 32       # number of epochs passed to Word2Vec.train
W2V_MIN_COUNT = 10   # words seen fewer times than this are dropped from the vocabulary

# KERAS
SEQUENCE_LENGTH = 300
EPOCHS = 8
BATCH_SIZE = 1024

# SENTIMENT
POSITIVE = "POSITIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
SENTIMENT_THRESHOLDS = (0.4, 0.7)

# EXPORT
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

In [4]:
from google.colab import files

# Only open the interactive upload widget when the CSV is not already in the
# working directory, so re-running the notebook does not block on a prompt.
# `uploaded` stays defined either way (an empty dict when the upload is skipped).
uploaded = {} if os.path.exists('detik_vaksin.csv') else files.upload()

Saving detik_vaksin.csv to detik_vaksin.csv


In [6]:
# Load the uploaded comma-separated file into a DataFrame using pandas'
# Python parsing engine.
df = pd.read_csv(
    'detik_vaksin.csv',
    sep=',',
    engine='python',
)

In [7]:
# Preview the first five raw rows before any cleaning is applied.
df.head(5)

Unnamed: 0,text
0,Sebanyak 2.630 tenaga kesehatan (nakes) di RS...
1,Jakarta - American Airlines melalui anak perus...
2,"Jakarta - Pada Selasa (19/1/2021), perusahaan ..."
3,Jakarta - Pangdam Jaya Mayjen Dudung Abdurachm...
4,"Jakarta - Selain Indonesia, beberapa negara me..."


In [8]:
# Fetch the NLTK stop-word corpus; the Indonesian list is read from it below.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [9]:
# Words that are subtracted from the stop-word list below, so preprocess()
# keeps them in the text instead of dropping them.
important_words = [
    'janganlah',
    'kasus',
    'kurang',
    'lagian',
    'masalah',
    'masalahnya',
    'manalagi',
    'mempersoalkan',
    'mempertanyakan',
    'mereka',
    'bukan',
]

In [10]:
# Indonesian stop words from NLTK, minus the words listed in important_words.
stop_words = set(stopwords.words('indonesian')) - set(important_words)

In [11]:
# Keep the negation "tidak" in the text. discard() (unlike remove()) does not
# raise KeyError when this cell is re-run or the word is already absent.
stop_words.discard("tidak")


In [12]:
# Keep the negation "enggak" in the text. discard() (unlike remove()) does not
# raise KeyError when this cell is re-run or the word is already absent.
stop_words.discard("enggak")

In [13]:
# Keep "berlebihan" in the text. discard() (unlike remove()) does not raise
# KeyError when this cell is re-run or the word is already absent.
stop_words.discard("berlebihan")

In [14]:
# Keep "bermacam" in the text. discard() (unlike remove()) does not raise
# KeyError when this cell is re-run or the word is already absent.
stop_words.discard("bermacam")

In [15]:
# Keep "bermacam-macam" out of the stop-word set. discard() (unlike remove())
# does not raise KeyError when this cell is re-run or the word is already absent.
stop_words.discard('bermacam-macam')

In [16]:
# Keep the negation "jangan" in the text. discard() (unlike remove()) does not
# raise KeyError when this cell is re-run or the word is already absent.
stop_words.discard('jangan')

In [17]:
def preprocess(text, stem=False, stemmer=None):
    """Normalise one document into a space-separated string of kept tokens.

    The input is lower-cased, every match of ``TEXT_CLEANING_RE`` is replaced
    by a space, and whitespace-separated tokens found in ``stop_words`` are
    dropped.

    Args:
        text: Value to clean; it is coerced with ``str()`` first.
        stem: When true, each kept token is passed through ``stemmer.stem``.
        stemmer: Object exposing a ``stem(token)`` method. When omitted, a
            module-level ``stemmers`` object is used if one exists.

    Returns:
        The kept (optionally stemmed) tokens joined by single spaces.

    Raises:
        ValueError: If ``stem`` is true and no stemmer is available.
    """
    if stem and stemmer is None:
        # The previous body referenced a global `stemmers` that is not defined
        # in the visible cells, so stem=True failed with a NameError. Honour
        # such a global if it exists, otherwise fail with a clear message.
        stemmer = globals().get("stemmers")
        if stemmer is None:
            raise ValueError(
                "stem=True requires a stemmer: pass stemmer=<object with .stem()> "
                "or define a module-level `stemmers`"
            )

    # Remove link,user and special characters
    cleaned = re.sub(TEXT_CLEANING_RE, ' ', str(text).lower()).strip()
    kept = [token for token in cleaned.split() if token not in stop_words]
    if stem:
        kept = [stemmer.stem(token) for token in kept]
    return " ".join(kept)

In [18]:
# Clean every document in the text column with preprocess() (no stemming).
df["text"] = df["text"].apply(preprocess)

In [19]:
# Preview the first rows again to check the cleaned text.
df.head()

Unnamed: 0,text
0,2 630 tenaga kesehatan nakes rsd wisma atlet k...
1,jakarta american airlines anak perusahaannya p...
2,jakarta selasa 19 1 2021 perusahaan bioteknolo...
3,jakarta pangdam jaya mayjen dudung abdurachman...
4,jakarta indonesia negara memilih vaksinasi cov...


In [20]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical between runs.
df_train, df_test = train_test_split(df, random_state=42, test_size=0.2)
print(f"TRAIN size: {len(df_train)}")
print(f"TEST size: {len(df_test)}")

TRAIN size: 7
TEST size: 2


In [21]:
# Tokenise each training document on whitespace: one token list per row.
documents = [article.split() for article in df_train["text"]]

In [24]:
# gensim 4.0 renamed the `size` keyword to `vector_size`; choose whichever name
# the installed release accepts so this cell runs on both 3.x and 4.x.
_w2v_dim_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"

# Untrained model: the vocabulary is built and training is run in later cells.
w2v_model = gensim.models.word2vec.Word2Vec(window=W2V_WINDOW,
                                            min_count=W2V_MIN_COUNT,
                                            workers=8,
                                            **{_w2v_dim_kwarg: W2V_SIZE})


In [25]:
# Scan the training documents to build the model vocabulary; words seen fewer
# than W2V_MIN_COUNT times are dropped (see the effective_min_count log lines).
w2v_model.build_vocab(documents)

2021-01-20 05:58:26,088 : INFO : collecting all words and their counts
2021-01-20 05:58:26,090 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2021-01-20 05:58:26,092 : INFO : collected 438 word types from a corpus of 1048 raw words and 7 sentences
2021-01-20 05:58:26,093 : INFO : Loading a fresh vocabulary
2021-01-20 05:58:26,097 : INFO : effective_min_count=10 retains 20 unique words (4% of original 438, drops 418)
2021-01-20 05:58:26,099 : INFO : effective_min_count=10 leaves 306 word corpus (29% of original 1048, drops 742)
2021-01-20 05:58:26,101 : INFO : deleting the raw counts dictionary of 438 items
2021-01-20 05:58:26,102 : INFO : sample=0.001 downsamples 20 most-common words
2021-01-20 05:58:26,104 : INFO : downsampling leaves estimated 48 word corpus (15.7% of prior 306)
2021-01-20 05:58:26,106 : INFO : estimated required memory for 20 words and 300 dimensions: 58000 bytes
2021-01-20 05:58:26,106 : INFO : resetting layer weights


In [26]:
# gensim 4.x exposes the vocabulary as `wv.key_to_index`; 3.x only has
# `wv.vocab`. Use whichever mapping the installed release provides.
_wv = w2v_model.wv
words = (_wv.key_to_index if hasattr(_wv, "key_to_index") else _wv.vocab).keys()
vocab_size = len(words)
print("Vocab size", vocab_size)

Vocab size 20


In [27]:
# Train the embeddings for W2V_EPOCH passes over the tokenised training
# documents; the call's return value is displayed as the cell output.
w2v_model.train(
    documents,
    total_examples=len(documents),
    epochs=W2V_EPOCH,
)

2021-01-20 05:58:50,474 : INFO : training model with 8 workers on 20 vocabulary and 300 features, using sg=0 hs=0 sample=0.001 negative=5 window=7
2021-01-20 05:58:50,492 : INFO : worker thread finished; awaiting finish of 7 more threads
2021-01-20 05:58:50,493 : INFO : worker thread finished; awaiting finish of 6 more threads
2021-01-20 05:58:50,494 : INFO : worker thread finished; awaiting finish of 5 more threads
2021-01-20 05:58:50,495 : INFO : worker thread finished; awaiting finish of 4 more threads
2021-01-20 05:58:50,496 : INFO : worker thread finished; awaiting finish of 3 more threads
2021-01-20 05:58:50,497 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-01-20 05:58:50,498 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-01-20 05:58:50,499 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-01-20 05:58:50,500 : INFO : EPOCH - 1 : training on 1048 raw words (51 effective words) took 0.0s, 5127 effective words/

(1530, 33536)

In [28]:
# List the words closest to "vaksin" in the trained embedding space.
# NOTE(review): in the recorded output every neighbour scores ~0.99, and the
# vocabulary built above has only 20 words from 7 documents, so this ranking
# is presumably not meaningful yet — confirm on a larger corpus.
w2v_model.wv.most_similar("vaksin")

2021-01-20 05:59:28,298 : INFO : precomputing L2-norms of word weight vectors


[('corona', 0.9948046207427979),
 ('sinovac', 0.9945085048675537),
 ('negara', 0.9937106370925903),
 ('vaksinasi', 0.9930845499038696),
 ('orang', 0.9930267333984375),
 ('hamil', 0.9929793477058411),
 ('dudung', 0.9927222728729248),
 ('1', 0.9925806522369385),
 ('china', 0.9925258159637451),
 ('19', 0.9925054907798767)]