In [None]:
# install dependency
!pip install kaggle

In [None]:
# Upload the Kaggle API token (kaggle.json) from the local machine.
from google.colab import files

# files.upload() returns {filename: file bytes}. Binding the result keeps the
# cell from echoing that dict -- and with it the API key -- into the saved output.
uploaded = files.upload()
assert "kaggle.json" in uploaded, "kaggle.json was not uploaded"

In [None]:
# Create ~/.kaggle and copy the uploaded kaggle.json into it.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the kazanova/sentiment140 dataset through the Kaggle CLI.
!kaggle datasets download -d kazanova/sentiment140

In [None]:
# Extract the downloaded archive into the current working directory.
from zipfile import ZipFile
file_name = "sentiment140.zip"
# Bind the archive to a name other than `zip`: `as zip` would rebind the
# built-in zip() for the rest of the kernel session.
with ZipFile(file_name, 'r') as archive:
  archive.extractall()
  print('Extracted')

In [None]:
# importing things
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import re
import joblib
from keras.preprocessing.text import Tokenizer
import gensim
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers import Dense,LSTM,Dropout
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report

In [None]:
# Load the raw CSV. The file is read without a header row (header=None).
df = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    header=None,
    encoding='latin-1',
)
df.head()

In [None]:
# Name the six raw columns, then keep only `target` and `text`.
columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
df.columns = columns
df = df.drop(columns=['ids', 'date', 'flag', 'user'])
df.head()

In [None]:
# Preview a random sample of 5,000 rows.
# NOTE(review): the sample is only displayed, never assigned, so every later
# cell still works on the full DataFrame.
# Alternative: df.sample(frac=0.01) draws a fraction of the rows instead of a fixed count.
df.sample(n=5000)

In [None]:
# Distinct label values present in the `target` column.
df['target'].unique()

In [None]:
# Fetch NLTK's 'stopwords' resource; stopwords.words('english') is used in the cleaning cell.
nltk.download('stopwords')


In [None]:
# Map the numeric codes in `target` to readable category names.
# Assign the result back instead of calling replace(inplace=True) on `df.target`:
# the in-place call acts on an intermediate Series and is not guaranteed to
# write through to `df` (under pandas copy-on-write it is a no-op).
df['target'] = df['target'].replace({0:'Negative',2:'Neutral',4:'Positive'})
df.head()

In [None]:
# Clean the text column: replace @mentions, URLs and runs of non-alphanumeric
# characters with a space, lower-case, and drop English stop words.
stop_words=set(stopwords.words('english'))
# Keep 'not' so negations survive the stop-word filter; discard() is a no-op
# if the word is not in the set.
stop_words.discard('not')
# Raw string: same pattern text, but without the invalid '\S' string escapes.
# Compiled once instead of being re-parsed for every row.
text_cleaner=re.compile(r'@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+')
corpus=[]
# Iterate over the values directly rather than df['text'][i]: no per-row label
# lookup, and no dependence on the index being 0..len(df)-1.
for raw_text in df['text']:
    tokens=text_cleaner.sub(' ',raw_text).lower().split()
    corpus.append(' '.join(word for word in tokens if word not in stop_words))

In [None]:
# Overwrite the `text` column with the cleaned strings.
df['text'] = corpus
df.head()

In [None]:
# Hold out 20% of the rows as a test set (fixed random_state for a repeatable split).
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(
    df,
    test_size=0.20,
    random_state=123,
)

In [None]:
# Preview the first rows of the training split.
train_df.head()

In [None]:
# Preview the first rows of the test split.
test_df.head()

In [None]:
# Build the Word2Vec vocabulary from the whitespace-tokenised training texts.
documents = [text.split() for text in train_df.text]
w2v_params = dict(window=7, min_count=10, workers=8)
try:
    # gensim >= 4.0 renamed the `size` keyword to `vector_size`.
    w2v_model = gensim.models.word2vec.Word2Vec(vector_size=300, **w2v_params)
except TypeError:
    # gensim < 4.0 only accepts the old keyword.
    w2v_model = gensim.models.word2vec.Word2Vec(size=300, **w2v_params)
w2v_model.build_vocab(documents)
# gensim >= 4.0 removed `wv.vocab` in favour of `wv.key_to_index`.
if hasattr(w2v_model.wv, "key_to_index"):
    words = w2v_model.wv.key_to_index.keys()
else:
    words = w2v_model.wv.vocab.keys()
vocab_size = len(words)
print(vocab_size)

In [None]:
# Train the Word2Vec model on the training documents for 30 epochs.
w2v_model.train(
    documents,
    epochs=30,
    total_examples=len(documents),
)

In [None]:
# Words most similar to "good" according to the trained Word2Vec vectors.
w2v_model.wv.most_similar("good")

In [None]:
# Words most similar to "hate" according to the trained Word2Vec vectors.
w2v_model.wv.most_similar("hate")

In [None]:
# Words most similar to "great" according to the trained Word2Vec vectors.
w2v_model.wv.most_similar("great")

In [None]:
# Fit a Keras Tokenizer on the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_df['text'])

In [None]:
# Peek at the word -> integer-id mapping learned by the tokenizer.
# Only the first ten entries are shown; echoing the whole dict would dump the
# entire vocabulary into the cell output.
dict(list(tokenizer.word_index.items())[:10])

In [None]:
# Vocabulary size after tokenisation: number of word_index entries plus one.
vocab_size = 1 + len(tokenizer.word_index)
vocab_size

In [None]:
# Encode the training texts as integer sequences and pad them with maxlen=300.
X_train = pad_sequences(
    tokenizer.texts_to_sequences(train_df['text']),
    maxlen=300,
)
X_train

In [None]:
# Encode the test texts with the same tokenizer and padding length.
X_test = pad_sequences(
    tokenizer.texts_to_sequences(test_df['text']),
    maxlen=300,
)
X_test

In [None]:
# Labels of the training split.
y_train = train_df['target']
y_train.head()


In [None]:
# Labels of the test split.
y_test = test_df['target']
y_test.head()

In [None]:
# Encode the string category labels as integers.
labelencoder = LabelEncoder()
y_train = labelencoder.fit_transform(y_train)
# Reuse the mapping learned on the training labels. Refitting on the test
# labels could assign different integers if the two sets of classes differ.
y_test = labelencoder.transform(y_test)

In [None]:
# Shapes of the encoded training and test label arrays.
y_train.shape, y_test.shape

In [None]:
# Build the embedding matrix: row i holds the Word2Vec vector of the word whose
# tokenizer id is i; words without a Word2Vec vector keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, 300))
for token, token_id in tokenizer.word_index.items():
  if token not in w2v_model.wv:
    continue
  embedding_matrix[token_id] = w2v_model.wv[token]
print(embedding_matrix.shape)

In [None]:
# Embedding layer initialised from embedding_matrix and frozen (trainable=False).
embedding_layer = Embedding(
    vocab_size,
    300,
    weights=[embedding_matrix],
    input_length=300,
    trainable=False,
)

In [None]:
# Sequential LSTM classifier: embedding -> dropout -> LSTM -> single sigmoid unit.
model = Sequential([
    embedding_layer,
    Dropout(0.5),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.summary()

# Compile with binary cross-entropy loss, the Adam optimizer and accuracy as metric.
model.compile(
    optimizer="adam",
    loss='binary_crossentropy',
    metrics=['accuracy'],
)



In [None]:
# buat callback untuk stop training 
import tensorflow as tf
# Training accuracy above which training is stopped early.
ACCURACY_THRESHOLD = 0.90

class myCallback(tf.keras.callbacks.Callback):
    """Stop training once the logged training accuracy exceeds ACCURACY_THRESHOLD."""

    def on_epoch_end(self, epoch, logs=None):
        """Request a stop when this epoch's 'accuracy' is above the threshold.

        Args:
            epoch: Index of the epoch that just finished (unused).
            logs: Metrics dict passed in by Keras; may be None.
        """
        # `logs=None` avoids a shared mutable default argument.
        accuracy = (logs or {}).get('accuracy')
        # Skip the check when the metric is missing; comparing None with a
        # float would raise TypeError.
        if accuracy is not None and accuracy > ACCURACY_THRESHOLD:
            print("\nReached %2.2f%% accuracy, so stopping training!!" %(ACCURACY_THRESHOLD*100))
            self.model.stop_training = True

callbacks = myCallback()

# Fit the model and keep the returned history object for plotting.
model_history = model.fit(
    X_train,
    y_train,
    batch_size=1024,
    epochs=15,
    validation_split=0.2,
    verbose=1,
    callbacks=[callbacks],
)

In [None]:
# Pull the per-epoch accuracy and loss curves out of the training history.
acc, val_acc, loss, val_loss = (
    model_history.history[key]
    for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)
epochs = range(len(acc))

In [None]:
# Training vs. validation accuracy per epoch.
plt.plot(epochs, acc, color='blue', label='Trainin_acc')
plt.plot(epochs, val_acc, color='red', label='Validation_acc')
plt.legend()
plt.title("Training and Validation Accuracy")

In [None]:
# Training vs. validation loss per epoch.
plt.plot(epochs, loss, color='blue', label='Training_loss')
plt.plot(epochs, val_loss, color='red', label='Validation_loss')
plt.legend()
plt.title("Training and Validation loss")

In [None]:
# Inference-time preprocessing.
def preprocess(text):
    """Clean one raw text and encode it as a padded id sequence.

    Applies the regex cleanup, lower-casing and stop-word filtering, then maps
    the remaining tokens to ids with the module-level `tokenizer` and pads the
    result with maxlen=300.

    Args:
        text: Raw input string.

    Returns:
        The pad_sequences output for this single input.
    """
    # Raw string: identical pattern text, without the invalid '\S' string escapes.
    review=re.sub(r'@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+',' ',text)
    review=review.lower()
    review=review.split()
    review=[word for word in review if word not in stop_words]
    # Show the tokens that survive cleaning.
    print(review)
    review=pad_sequences(tokenizer.texts_to_sequences([review]), maxlen=300)
    return review

In [None]:
# Prediction helper.
def prediction(review):
    """Print a sentiment label and the raw model score for one text.

    The text is run through preprocess() and the module-level `model`.
    Scores below 0.4 print "Negative", scores from 0.4 up to (but not
    including) 0.6 print "Neutral", everything else prints "Positive".

    Args:
        review: Raw input string.
    """
    review=preprocess(review)
    score=model.predict(review)
    score=score[0]
    if score<0.4:
        print("Negative")
    elif score<0.6:
        # Covers 0.4 <= score < 0.6. The previous `score>0.4 and score<0.6`
        # let a score of exactly 0.4 fall through to "Positive".
        print("Neutral")
    else:
        print("Positive")
    print(score)

In [None]:
# Demo: a phrase containing a negation ('not' is removed from the stop-word set in the cleaning cell).
prediction("the food is not bad")

In [None]:
# Demo: score an example sentence.
prediction("the actors are stunning")

In [None]:
# Demo: score a longer example sentence.
prediction("the movie we watched yesterday screamly unexpected")

In [None]:
# Demo: score a short example phrase.
prediction("too much money")

In [None]:
# Demo: score another short example phrase.
prediction("movie screamly crazy")

In [None]:
# Evaluation: score the held-out test set.
scores = model.predict(X_test, batch_size=1024, verbose=1)

In [None]:
# Threshold the scores at 0.5 to get hard 0/1 predictions.
y_pred = (scores > 0.5).astype(int)

In [None]:
# Display the predicted labels next to the encoded test labels.
y_pred,y_test

In [None]:
# Print the classification report for the test-set predictions.
print(classification_report(y_true=y_test, y_pred=y_pred))