<a href="https://colab.research.google.com/github/daudcanugerah/prototype/blob/master/v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install  tensorflow==2.0 scikit-learn emoji matplotlib tensorboard==2.0.0 tensorflow_hub seaborn numpy pandas symspellpy
!pip install git+git://github.com/snowballstem/pystemmer

In [0]:
%load_ext tensorboard
from __future__ import absolute_import, division, print_function, unicode_literals
try:
  # Use the %tensorflow_version magic if in colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf
import tensorflow_hub as hub
import string
from google.colab import drive
import re
from sklearn import model_selection, metrics
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from datetime import datetime
import emoji
import json
import io
import Stemmer
import itertools
import os
from itertools import zip_longest
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
# Mount Google Drive at /content/drive. Later cells use the relative prefix
# 'drive/My Drive/sentiment/', which presumably relies on the working directory
# being /content — TODO confirm.
drive.mount('/content/drive')

In [0]:
# Run-level configuration: one timestamp identifies every artefact of this run.
now = datetime.now().strftime("%Y%m%d-%H%M%S")
main_dir = 'drive/My Drive/sentiment/'
log_dir = main_dir+'logs/model/'+now
stemmer = Stemmer.Stemmer('indonesian')
# Reuse `now` rather than calling datetime.now() again, so the model logs and
# the text logs of the same run always land in identically named folders.
text_writer = tf.summary.create_file_writer(main_dir+"logs/text/" + now)
    

In [0]:
def trasform_sentiment(x):
    """Map a raw sentiment score to a class index.

    -1 becomes 0, 1 becomes 2, and every other value becomes 1.
    """
    if x == -1:
        return 0
    if x == 1:
        return 2
    return 1
 
def custom_stemmer(text):
    """Clean a raw text into a lowercase, whitespace-collapsed string.

    Despite its name, this function performs no stemming. It removes URLs,
    digits, hashtags/mentions and ASCII punctuation, replaces emoji with
    their textual names, and keeps question marks as separate tokens.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    # URLs go first, before punctuation stripping would break them into fragments.
    text = re.sub(r'http\S+', '', text)
    text = ''.join(ch for ch in text if not ch.isdigit())
    # Remove whole hashtags and mentions, not only the leading symbol.
    text = re.sub(r'(#|@)\w+', '', text)
    # '?' is absent from this list on purpose; it is spaced out further down.
    for char in '"!#$%&\'()*+,-./:;<=>@[\\]^_`{|}~':
        text = text.replace(char, ' ')
    # Emoji become ':name:' tokens. All colons from the input were replaced
    # above, so any colon from here on comes from demojize.
    text = emoji.demojize(text)
    # Insert a space before every ':' that has another ':' later on the same
    # line, so adjacent emoji names stay separate once the colons are removed.
    text = re.sub(r'(?=:.*:)', ' ', text)
    text = re.sub(r'([\?])', r' \1', text)
    text = re.sub(":", '', text)
    text = text.lower().strip()
    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    text = re.sub(r"\s\s+", " ", text)
    return text

def transform_predection_writer(x_test, y_tesx, y_prediction):
    """Collect misclassified samples whose true label is 0 or 1.

    Args:
        x_test: Sequence of input texts.
        y_tesx: Sequence of true labels aligned with ``x_test``.
        y_prediction: Sequence of predicted labels aligned with ``x_test``;
            each element must expose a ``.numpy()`` method.

    Returns:
        A ``(fp, fn)`` tuple. ``fp`` contains only the header row. ``fn``
        contains one ``[text, predicted, real]`` row (labels as strings) for
        every sample whose true label is 0 or 1 and whose prediction differs.
    """
    # NOTE(review): `fp` never receives data rows and samples whose true label
    # is 2 are ignored — this looks like a leftover from a two-class version;
    # confirm the intended split before relying on `fp`.
    fp = [['***Text***','***Predict***','***Real***']]
    fn = []
    for num, text in enumerate(x_test):
        # Read the labels from the argument, not from a notebook-level global.
        real = y_tesx[num]
        predicted = y_prediction[num].numpy()
        if real in (0, 1) and predicted != real:
            fn.append([text, str(predicted), str(real)])
    return fp, fn

In [0]:
# Load the labelled corpus and keep only the 'telco' category.
datasets = pd.read_json(main_dir+'datasets/data_training.json')
datasets = datasets[datasets['category'] == 'telco']
# Drop incomplete rows BEFORE transforming: a missing text would crash
# custom_stemmer and a missing sentiment would silently be mapped to class 1.
# dropna returns a new frame, so the result has to be assigned; .copy() makes
# the column assignments below act on an independent frame rather than a slice.
print('drop na')
datasets = datasets.dropna(subset=['text', 'sentiment']).copy()
print('transform sentiment')
datasets['sentiment'] = datasets['sentiment'].apply(trasform_sentiment)
print('custom stimmer')
datasets['text'] = datasets['text'].apply(custom_stemmer)
datasets.head()

In [0]:
# Fixed seed so the splits, and every metric computed from them, are
# reproducible across kernel restarts.
SPLIT_SEED = 42

X = datasets['text'].values
Y = datasets['sentiment'].values

# 70% train / 30% test, then 10% of the training part is set aside as x_eval.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=0.3, random_state=SPLIT_SEED)
x_train, x_eval, y_train, y_eval = model_selection.train_test_split(
    x_train, y_train, test_size=0.1, random_state=SPLIT_SEED)

In [0]:
# Text embedding from TF Hub, fine-tuned together with the classifier head
# (trainable=True).
hub_layer = hub.KerasLayer(
    "https://tfhub.dev/google/nnlm-id-dim128-with-normalization/2",
    input_shape=[], dtype=tf.string, trainable=True)

model = Sequential()
model.add(hub_layer)
# Two identical hidden blocks: dropout followed by an L2-regularised dense layer.
for hidden_units in (16, 16):
    model.add(layers.Dropout(0.5))
    model.add(layers.Dense(hidden_units, activation='relu',
                           kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(layers.Dropout(0.5))
# Three output units, one per sentiment class.
model.add(layers.Dense(3, activation='softmax'))

# Resume from the best checkpoint when one exists. On a fresh run the file is
# not there yet, so fall back to training from scratch instead of crashing.
checkpoint_path = main_dir+"checkpoint/weights.best.loss.hdf5"
if os.path.exists(checkpoint_path):
    model.load_weights(checkpoint_path)
else:
    print('no checkpoint at ' + checkpoint_path + ', training from scratch')

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [0]:
# Train with TensorBoard logging, best-val_loss checkpointing and early stopping.
# NOTE(review): x_test drives checkpoint selection and early stopping here and is
# also the set used for the final report, while x_eval is only used by
# model.evaluate — consider validating on x_eval so the report stays unbiased.
# NOTE(review): EarlyStopping is left at its default patience — confirm that
# stopping on the first non-improving epoch is intended with epochs=100.
history = model.fit(
    x_train,
    y_train,
    epochs=100,
    verbose=1,
    # Keras documents validation_data as a tuple (x_val, y_val); a list is not
    # reliably unpacked into inputs and targets by newer TensorFlow releases.
    validation_data=(x_test, y_test),
    callbacks=[
        tf.keras.callbacks.TensorBoard(log_dir=log_dir),
        tf.keras.callbacks.ModelCheckpoint(main_dir+"checkpoint/weights.best.loss.hdf5", monitor='val_loss', verbose=1, save_best_only=True, mode='auto'),
        tf.keras.callbacks.EarlyStopping(monitor='val_loss')
    ]
)

In [0]:
# Score the trained model on the x_eval / y_eval split; the returned metric
# values are shown as the cell output.
model.evaluate(x_eval, y_eval, batch_size=64, verbose=2)

In [0]:
# Class ids 0/1/2 and their display names, in matrix order.
class_names = ['negative', 'neutral', 'positive']
class_ids = list(range(len(class_names)))

prediction = model.predict(x_test)
# Index of the highest-probability class for each sample (kept as a tensor
# because later cells call .numpy() on it).
y_prediction = tf.argmax(prediction, 1)
y_prediction_ids = y_prediction.numpy()

# labels= forces a 3x3 matrix even if a class is missing from this split;
# otherwise the DataFrame construction with three names would fail.
matrix_confusion = pd.DataFrame(
    metrics.confusion_matrix(y_test, y_prediction_ids, labels=class_ids),
    index=class_names, columns=class_names)
report = metrics.classification_report(y_test, y_prediction_ids)
print(report)

# Draw the heatmap first and label the axes afterwards: seaborn sets the axis
# labels from the DataFrame's index/column names while plotting, which would
# wipe labels applied beforehand.
fig, ax = plt.subplots()
sns.heatmap(matrix_confusion, annot=True, fmt="d", cbar=True, ax=ax)
ax.set(title='Confusion Matrix', xlabel='Predicted Label', ylabel='True Label')
plt.show()

In [0]:

# save model
model_path = main_dir+'model/model-3label-v1.h5'
# Make sure the target folder exists before writing the HDF5 file; this is a
# no-op when the folder is already there.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)


In [0]:
# One figure per tracked metric: the training curve against the validation
# curve recorded by model.fit.
for metric_key, figure_title, y_axis_label in (
        ('accuracy', 'Model accuracy', 'Accuracy'),
        ('loss', 'Model loss', 'Loss')):
    plt.plot(history.history[metric_key])
    plt.plot(history.history['val_' + metric_key])
    plt.title(figure_title)
    plt.ylabel(y_axis_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')
    plt.show()

In [0]:
# Per-class report with readable names. Passing labels explicitly keeps the
# three names aligned with class ids 0/1/2 even when a class is absent from
# this split; without it scikit-learn raises on the length mismatch.
print(metrics.classification_report(
    y_test, y_prediction, labels=[0, 1, 2],
    target_names=['negative','neutral', 'positive']))

In [0]:
# Side-by-side table of each test text with its true and predicted class.
# (To download it from Colab, write it to CSV and use google.colab.files.download.)
df = pd.DataFrame(
    {
        'text': x_test,
        'real': y_test,
        'predict': y_prediction.numpy(),
    }
)
# save heatmap