In [2]:
import numpy as np
import pandas as pd
import math

In [3]:
# Fetch the dataset repository; a later cell reads train_sentiment.txt from the cloned folder.
!git clone https://github.com/fajarmuslim/indonesian_sentiment_analysis.git

Cloning into 'indonesian_sentiment_analysis'...
remote: Enumerating objects: 33, done.[K
remote: Counting objects: 100% (33/33), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 33 (delta 16), reused 20 (delta 7), pack-reused 0[K
Unpacking objects: 100% (33/33), done.


In [4]:
# List the cloned repository's contents to confirm the data files are present.
!cd indonesian_sentiment_analysis && ls

LICENSE  README.md  aspect  data_sentences.csv	train_sentiment.txt


In [5]:
def get_review_tokens(filename):
    """Read a tab-separated token file whose sentences are split by blank lines.

    Every non-blank line must hold exactly two tab-separated fields. A blank
    (or whitespace-only) line ends the current sentence.

    Args:
        filename: Path of the UTF-8 text file to read.

    Returns:
        A tuple ``(idx, raw, formalized)`` of three equally long lists: the
        0-based sentence number of each token, the first field of each line,
        and the second field of each line.

    Raises:
        ValueError: If a non-blank line does not contain exactly two
            tab-separated fields.
    """
    idx, raw, formalized = [], [], []
    sentence_no = 0
    sentence_open = False  # True once the current sentence has at least one token
    with open(filename, encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.rstrip()
            if not line:
                # Only close a sentence that actually holds tokens, so leading
                # or repeated blank lines do not leave gaps in the numbering.
                if sentence_open:
                    sentence_no += 1
                    sentence_open = False
                continue

            fields = line.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    "{}:{}: expected 2 tab-separated fields, got {}: {!r}".format(
                        filename, line_no, len(fields), line))

            raw_token, formalized_token = fields
            raw.append(raw_token)
            formalized.append(formalized_token)
            idx.append(sentence_no)
            sentence_open = True

    return idx, raw, formalized

In [6]:
# Parse the training file into three parallel per-token lists (sentence number, first column, second column).
idx, raw, formalized = get_review_tokens("./indonesian_sentiment_analysis/train_sentiment.txt")

In [7]:
def construct_dataset(idx, raw, formalized):
    """Assemble the parallel per-token lists into one DataFrame.

    The resulting columns are ``sentence_idx`` (from ``idx``), ``word``
    (from ``raw``) and ``label`` (from ``formalized``), in that order.
    """
    return pd.DataFrame({
        'sentence_idx': idx,
        'word': raw,
        'label': formalized,
    })

In [8]:
# Build the token-level frame (one row per token) and preview its first 20 rows.
data = construct_dataset(idx, raw, formalized)
data.head(20)

Unnamed: 0,sentence_idx,word,label
0,0,kamar,O
1,0,saya,O
2,0,ada,O
3,0,kendala,O
4,0,di,O
5,0,ac,B-ASPECT
6,0,tidak,B-SENTIMENT
7,0,berfungsi,I-SENTIMENT
8,0,optimal,I-SENTIMENT
9,0,.,O


In [165]:
# Report the frame's (rows, columns).
print(data.shape)

(63105, 3)


In [9]:
# Clean data: fold the aspect tags into 'O' (only sentiment tags stay as
# labels) and drop the '.' and ',' tokens.
# A boolean mask with .loc assigns on `data` itself, avoiding chained indexing
# (and its SettingWithCopyWarning) and not relying on the index being 0..n-1,
# so the cell can be re-run after rows have been filtered out.
is_aspect = data['label'].isin(["B-ASPECT", "I-ASPECT"])
data.loc[is_aspect, 'label'] = 'O'

# .copy() makes the filtered frame independent of the unfiltered one, so later
# assignments on `data` are unambiguous.
is_punctuation = data['word'].isin(['.', ','])
data = data[~is_punctuation].copy()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [32]:
# Preview the frame after cleaning.
data.head(20)

Unnamed: 0,sentence_idx,word,label
0,0,kamar,O
1,0,saya,O
2,0,ada,O
3,0,kendala,O
4,0,di,O
5,0,ac,O
6,0,tidak,B-SENTIMENT
7,0,berfungsi,I-SENTIMENT
8,0,optimal,I-SENTIMENT
10,0,dan,O


In [10]:
# Number of tokens per label.
data['label'].value_counts()

O              38476
B-SENTIMENT     9646
I-SENTIMENT     4265
Name: label, dtype: int64

In [11]:
class SentenceGetter(object):
    """Group a token-level frame into sentences of ``(word, label)`` pairs.

    The frame must provide the columns ``sentence_idx``, ``word`` and
    ``label``.

    Attributes:
        n_sent: 1-based position of the sentence ``get_next`` returns next.
        dataset: The frame passed to the constructor.
        empty: Always ``False``; not read by this class.
        grouped: One list of ``(word, label)`` pairs per ``sentence_idx``
            group, as produced by the group-by.
        sentences: The same lists as a plain Python list, in group order.
    """

    def __init__(self, dataset):
        """Group ``dataset`` by ``sentence_idx`` and collect the sentences."""
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False
        self.grouped = (
            self.dataset
            .groupby("sentence_idx")[["word", "label"]]
            .apply(self._to_pairs)
        )
        self.sentences = [s for s in self.grouped]

    @staticmethod
    def _to_pairs(group):
        """Return the rows of one sentence group as a list of (word, label)."""
        return list(zip(group["word"].values.tolist(),
                        group["label"].values.tolist()))

    def get_next(self):
        """Return the next sentence in order, or ``None`` when none are left.

        Sentences are served by position rather than by group key, so the
        ``sentence_idx`` values do not need to be contiguous.
        """
        position = self.n_sent - 1
        if not 0 <= position < len(self.sentences):
            return None
        self.n_sent += 1
        return self.sentences[position]
          
# Group the cleaned tokens into sentences: a list of [(word, label), ...] lists.
getter = SentenceGetter(data)
sentences = getter.sentences

In [12]:
# Number of sentences.
len(sentences)

4000

In [13]:
from math import nan

# Vocabulary. Sorting gives every run the same word order (and therefore the
# same word -> index mapping); iterating a bare set of strings does not.
# key=str keeps the sort from failing if a non-string value is present.
words = sorted(set(data["word"].values), key=str)
n_words = len(words)

# Tag inventory. Any float label (i.e. a missing value) becomes the single
# tag 'unk'; the set removes duplicates and sorting fixes the order.
tags = sorted({'unk' if isinstance(tag, float) else tag
               for tag in data["label"].values})
n_tags = len(tags)


In [14]:
from future.utils import iteritems

# Lookup tables: position in `words` / `tags` is the integer id.
word2idx = dict(zip(words, range(len(words))))
tag2idx = dict(zip(tags, range(len(tags))))
# Reverse lookup, used to turn predicted class ids back into tag names.
idx2tag = {index: tag for tag, index in tag2idx.items()}


In [15]:
# Inspect the tag -> index mapping.
tag2idx

{'B-SENTIMENT': 0, 'I-SENTIMENT': 1, 'O': 2}

In [16]:
# Inspect the index -> tag mapping.
idx2tag

{0: 'B-SENTIMENT', 1: 'I-SENTIMENT', 2: 'O'}

In [17]:
# Fraction used for the held-out slices in the split below (test, and validation within train).
percentage_split = 0.1

In [18]:
# Test set: the last `percentage_split` share of all sentences; train: the rest.
n_test = int(len(sentences) * percentage_split)
first_test_position = len(sentences) - n_test
sentences_train = sentences[:first_test_position]
sentences_test = sentences[first_test_position:]

# Validation set: the last `percentage_split` share of the TRAIN slice. It is
# a view of the tail of sentences_train, not a separate partition.
n_validation = int(len(sentences_train) * percentage_split)
sentences_validation = sentences_train[len(sentences_train) - n_validation:]

In [30]:
# Example training sentence as (word, label) pairs.
sentences_train[1]

[('tempatnya', 'O'),
 ('bagus', 'B-SENTIMENT'),
 ('kolam', 'O'),
 ('renangnya', 'O'),
 ('bersih', 'B-SENTIMENT')]

In [19]:
# Number of sentences in the test split.
len(sentences_test)

400

In [21]:
# Token count of the longest sentence; used as the padded sequence length.
maxlen = max(len(sentence) for sentence in sentences)
print("maxlen", maxlen)

maxlen 103


In [22]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

def _encode_words(sentence_group):
    """Map each sentence's words to ids and post-pad every row to `maxlen`.

    NOTE(review): the pad value `n_words - 1` is also the id of the last
    vocabulary word (ids run 0..n_words-1), so padding is indistinguishable
    from that word. Reserving a dedicated pad id would also require changing
    the vocabulary size used by the Embedding layer.
    """
    word_ids = [[word2idx[w[0]] for w in s] for s in sentence_group]
    return pad_sequences(maxlen=maxlen, sequences=word_ids, padding="post", value=n_words - 1)


def _encode_tags(sentence_group):
    """Map each sentence's tags to ids, post-pad with 'O', and one-hot encode.

    Returns a list with one (maxlen, n_tags) one-hot array per sentence.
    """
    tag_ids = [[tag2idx[w[1]] for w in s] for s in sentence_group]
    padded = pad_sequences(maxlen=maxlen, sequences=tag_ids, padding="post", value=tag2idx["O"])
    return [to_categorical(i, num_classes=n_tags) for i in padded]


# Model inputs: padded word-id matrices, one row per sentence.
X_train = _encode_words(sentences_train)
X_test = _encode_words(sentences_test)
X_validation = _encode_words(sentences_validation)

# Model targets: per-sentence one-hot tag arrays.
y_train = _encode_tags(sentences_train)
y_test = _encode_tags(sentences_test)
y_validation = _encode_tags(sentences_validation)

In [23]:
# (sentences, maxlen) shape of the padded training input.
X_train.shape

(3600, 103)

In [24]:
# (sentences, maxlen) shape of the padded test input.
X_test.shape

(400, 103)

In [25]:
# Install keras-contrib from GitHub; the model cells import the CRF layer from keras_contrib.layers.
!pip install git+https://www.github.com/keras-team/keras-contrib.git

Collecting git+https://www.github.com/keras-team/keras-contrib.git
  Cloning https://www.github.com/keras-team/keras-contrib.git to /tmp/pip-req-build-xmumjhzj
  Running command git clone -q https://www.github.com/keras-team/keras-contrib.git /tmp/pip-req-build-xmumjhzj
Building wheels for collected packages: keras-contrib
  Building wheel for keras-contrib (setup.py) ... [?25ldone
[?25h  Created wheel for keras-contrib: filename=keras_contrib-2.0.8-cp36-none-any.whl size=101065 sha256=9d2bc6484e79687841006b769814a8a7f67f52883a5c6494fae8367b68da37d1
  Stored in directory: /tmp/pip-ephem-wheel-cache-_b5viacu/wheels/11/27/c8/4ed56de7b55f4f61244e2dc6ef3cdbaff2692527a2ce6502ba
Successfully built keras-contrib
Installing collected packages: keras-contrib
Successfully installed keras-contrib-2.0.8


## Model 1

In [207]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
import keras as k
from keras_contrib.layers import CRF

input = Input(shape=(maxlen,))
word_embedding_size = 150

# Embedding layer: one vector of size `word_embedding_size` per vocabulary id.
embedded = Embedding(
    input_dim=n_words,
    output_dim=word_embedding_size,
    input_length=maxlen,
)(input)

# Settings shared by both recurrent layers of this model.
recurrent_settings = dict(return_sequences=True, dropout=0.5, recurrent_dropout=0.5)

# Bidirectional LSTM layer.
bilstm_output = Bidirectional(
    LSTM(units=word_embedding_size,
         kernel_initializer=k.initializers.he_normal(),
         **recurrent_settings)
)(embedded)

# Second, unidirectional LSTM stacked on the Bi-LSTM output.
lstm_output = LSTM(
    units=word_embedding_size * 2,
    kernel_initializer=k.initializers.he_normal(),
    **recurrent_settings
)(bilstm_output)

# Per-timestep dense projection down to one value per tag.
tag_scores = TimeDistributed(Dense(n_tags, activation="relu"))(lstm_output)

# CRF output layer.
crf = CRF(n_tags)
out = crf(tag_scores)

model = Model(input, out)

In [208]:
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt

# Optimiser: Adam with a learning rate of 5e-4.
adam = k.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)

# Compile with the CRF layer's own loss; report both the CRF layer's accuracy and plain 'accuracy'.
model.compile(optimizer=adam, loss=crf.loss_function, metrics=[crf.accuracy, 'accuracy'])

model.summary()

# Save a checkpoint only when val_accuracy improves; the value (two decimals) is part of the file name,
# so improvements that round to the same value write to the same file.
filepath="opinion-model-{val_accuracy:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

# Train for 20 epochs with 10% of the training arrays held out for validation.
history = model.fit(X_train, np.array(y_train), batch_size=256, epochs=20, validation_split=0.1, verbose=1, callbacks=callbacks_list)
# history = model.fit(X_train, np.array(y_train), batch_size=256, epochs=20, validation_split=0.1, verbose=1)





Model: "model_16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_17 (InputLayer)        (None, 103)               0         
_________________________________________________________________
embedding_17 (Embedding)     (None, 103, 150)          695400    
_________________________________________________________________
bidirectional_18 (Bidirectio (None, 103, 300)          361200    
_________________________________________________________________
lstm_43 (LSTM)               (None, 103, 300)          721200    
_________________________________________________________________
time_distributed_16 (TimeDis (None, 103, 3)            903       
_________________________________________________________________
crf_16 (CRF)                 (None, 103, 3)            27        
Total params: 1,778,730
Trainable params: 1,778,730
Non-trainable params: 0
________________________________________________

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 3240 samples, validate on 360 samples
Epoch 1/20

Epoch 00001: val_accuracy improved from -inf to 0.96745, saving model to opinion-model-0.97.hdf5
Epoch 2/20

Epoch 00002: val_accuracy did not improve from 0.96745
Epoch 3/20

Epoch 00003: val_accuracy did not improve from 0.96745
Epoch 4/20

Epoch 00004: val_accuracy did not improve from 0.96745
Epoch 5/20

Epoch 00005: val_accuracy did not improve from 0.96745
Epoch 6/20

Epoch 00006: val_accuracy did not improve from 0.96745
Epoch 7/20

Epoch 00007: val_accuracy did not improve from 0.96745
Epoch 8/20

Epoch 00008: val_accuracy did not improve from 0.96745
Epoch 9/20

Epoch 00009: val_accuracy did not improve from 0.96745
Epoch 10/20

Epoch 00010: val_accuracy did not improve from 0.96745
Epoch 11/20

Epoch 00011: val_accuracy did not improve from 0.96745
Epoch 12/20

Epoch 00012: val_accuracy did not improve from 0.96745
Epoch 13/20

Epoch 00013: val_accuracy improved from 0.96745 to 0.96753, saving model to opinion-model-0

In [26]:
def pred2label(pred, tag_lookup=None):
    """Convert per-token score vectors into tag names.

    Args:
        pred: Iterable of sequences; each sequence is an iterable of score
            vectors (one per token). The position of the largest value in a
            vector selects the tag.
        tag_lookup: Optional mapping from class index to tag name. When
            omitted, the notebook-level ``idx2tag`` mapping is used.

    Returns:
        A list holding one list of tag names per input sequence.
    """
    if tag_lookup is None:
        tag_lookup = idx2tag
    return [
        [tag_lookup[int(np.argmax(scores))] for scores in sequence]
        for sequence in pred
    ]

In [27]:
# Install seqeval; the evaluation cells import f1_score from seqeval.metrics.
! pip install seqeval

Collecting seqeval
[?25l  Downloading https://files.pythonhosted.org/packages/9d/2d/233c79d5b4e5ab1dbf111242299153f3caddddbb691219f363ad55ce783d/seqeval-1.2.2.tar.gz (43kB)
[K     |████████████████████████████████| 51kB 1.7MB/s eta 0:00:011
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25ldone
[?25h  Created wheel for seqeval: filename=seqeval-1.2.2-cp36-none-any.whl size=16172 sha256=84836976fa2055649e3e5d781ac856be04309d7d42e464b16cd900fdc37cbae9
  Stored in directory: /root/.cache/pip/wheels/52/df/1b/45d75646c37428f7e626214704a0e35bd3cfc32eda37e59e5f
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [211]:
# Validation: predict with Model 1, then turn both the predictions and the one-hot
# gold labels into tag-name sequences. Padded positions are included (gold padding is 'O').
validation_pred = model.predict(X_validation, verbose=1)   
pred_labels = pred2label(validation_pred)
validation_labels = pred2label(y_validation)



In [212]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
# seqeval F1 of Model 1 on the validation sentences.
print("F1-score  validation: {:.1%}".format(f1_score(validation_labels, pred_labels)))

F1-score  validation: 37.5%


In [213]:
# Testing: predict with Model 1 on the test split and decode predictions and gold labels to tag names.
# Note that `pred_labels` is overwritten here (it previously held the validation predictions).
test_pred = model.predict(X_test, verbose=1)   
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_test)



In [214]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
# seqeval F1 of Model 1 on the test sentences.
print("F1-score  testing: {:.1%}".format(f1_score(test_labels, pred_labels)))

F1-score  testing: 45.0%


## Model 2

In [28]:
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional
import keras as k
from keras_contrib.layers import CRF

input = Input(shape=(maxlen,))
word_embedding_size = 150

# Embedding layer: one vector of size `word_embedding_size` per vocabulary id.
embedded2 = Embedding(
    input_dim=n_words,
    output_dim=word_embedding_size,
    input_length=maxlen,
)(input)

# Bidirectional LSTM layer (input dropout 0.5).
bilstm_output2 = Bidirectional(
    LSTM(units=word_embedding_size,
         return_sequences=True,
         dropout=0.5,
         recurrent_dropout=0.5,
         kernel_initializer=k.initializers.he_normal())
)(embedded2)

# Two stacked unidirectional LSTMs with identical settings (input dropout 0.1).
stacked_settings = dict(
    units=word_embedding_size * 2,
    return_sequences=True,
    dropout=0.1,
    recurrent_dropout=0.5,
)
first_lstm_output2 = LSTM(
    kernel_initializer=k.initializers.he_normal(), **stacked_settings
)(bilstm_output2)
second_lstm_output2 = LSTM(
    kernel_initializer=k.initializers.he_normal(), **stacked_settings
)(first_lstm_output2)

# Per-timestep dense projection down to one value per tag.
tag_scores2 = TimeDistributed(Dense(n_tags, activation="relu"))(second_lstm_output2)

# CRF output layer.
crf = CRF(n_tags)
out = crf(tag_scores2)

model2 = Model(input, out)

In [216]:
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt

# Optimiser: Adam with a learning rate of 5e-4.
adam = k.optimizers.Adam(lr=0.0005, beta_1=0.9, beta_2=0.999)

# Compile with the CRF layer's own loss; report both the CRF layer's accuracy and plain 'accuracy'.
model2.compile(optimizer=adam, loss=crf.loss_function, metrics=[crf.accuracy, 'accuracy'])

model2.summary()

# Save a checkpoint only when val_accuracy improves; the value (two decimals) is part of the file name.
filepath="opinion-model2-{val_accuracy:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

# Train for 20 epochs with 10% of the training arrays held out for validation.
history = model2.fit(X_train, np.array(y_train), batch_size=256, epochs=20, validation_split=0.1, verbose=1, callbacks=callbacks_list)




Model: "model_17"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_18 (InputLayer)        (None, 103)               0         
_________________________________________________________________
embedding_18 (Embedding)     (None, 103, 150)          695400    
_________________________________________________________________
bidirectional_19 (Bidirectio (None, 103, 300)          361200    
_________________________________________________________________
lstm_45 (LSTM)               (None, 103, 300)          721200    
_________________________________________________________________
lstm_46 (LSTM)               (None, 103, 300)          721200    
_________________________________________________________________
time_distributed_17 (TimeDis (None, 103, 3)            903       
_________________________________________________________________
crf_17 (CRF)                 (None, 103, 3)            27 

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 3240 samples, validate on 360 samples
Epoch 1/20

Epoch 00001: val_accuracy improved from -inf to 0.01615, saving model to opinion-model2-0.02.hdf5
Epoch 2/20

Epoch 00002: val_accuracy did not improve from 0.01615
Epoch 3/20

Epoch 00003: val_accuracy did not improve from 0.01615
Epoch 4/20

Epoch 00004: val_accuracy did not improve from 0.01615
Epoch 5/20

Epoch 00005: val_accuracy did not improve from 0.01615
Epoch 6/20

Epoch 00006: val_accuracy did not improve from 0.01615
Epoch 7/20

Epoch 00007: val_accuracy did not improve from 0.01615
Epoch 8/20

Epoch 00008: val_accuracy did not improve from 0.01615
Epoch 9/20

Epoch 00009: val_accuracy did not improve from 0.01615
Epoch 10/20

Epoch 00010: val_accuracy did not improve from 0.01615
Epoch 11/20

Epoch 00011: val_accuracy improved from 0.01615 to 0.96745, saving model to opinion-model2-0.97.hdf5
Epoch 12/20

Epoch 00012: val_accuracy did not improve from 0.96745
Epoch 13/20

Epoch 00013: val_accuracy did not improve fr

In [217]:
# Validation: predict with Model 2, then turn both the predictions and the one-hot
# gold labels into tag-name sequences. Padded positions are included (gold padding is 'O').
validation_pred = model2.predict(X_validation, verbose=1)   
pred_labels = pred2label(validation_pred)
validation_labels = pred2label(y_validation)



In [29]:
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
# seqeval F1 of Model 2 on the validation sentences.
# NOTE(review): the recorded output is a NameError. This cell was executed as In [29], in a
# kernel where `validation_labels` had not been defined; run the validation cell directly
# above it first.
print("F1-score  validation: {:.1%}".format(f1_score(validation_labels, pred_labels)))

NameError: name 'validation_labels' is not defined

In [219]:
# Testing: predict with Model 2 on the test split and decode predictions and gold labels to tag names.
# Note that `pred_labels` is overwritten here (it previously held the validation predictions).
test_pred = model2.predict(X_test, verbose=1)   
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_test)



In [220]:
# seqeval F1 of Model 2 on the test sentences.
print("F1-score  testing: {:.1%}".format(f1_score(test_labels, pred_labels)))

F1-score  testing: 0.0%
