In [13]:
import re

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [2]:
# Load the labelled comments; `names` supplies the column labels used in the rest
# of the notebook. The last expression shows (rows, columns) as a quick sanity check.
df = pd.read_excel(
    'dataset/Data 1.xlsx',
    names=['comment', 'polarity'],
)
df.shape

(152, 2)

In [3]:
# Preview the first rows to check that comments and polarity labels loaded as expected.
df.head()

Unnamed: 0,comment,polarity
0,min bnyk yg kecewa lo dgn update terbaru alih ...,1
1,user id password mesti ke bank ya gpplah yg pe...,1
2,saat transfer kadang ada muncul keterangan kon...,1
3,begitu saya update dan no tlpn saya statusnya ...,1
4,tolong tambahkan fitur fingerprint atau face r...,1


## Split Dataset

In [4]:
# Class balance: share of each polarity label over all rows of the dataset.
df['polarity'].value_counts() / len(df)

0    0.552632
1    0.447368
Name: polarity, dtype: float64

In [34]:
# Plain (unstratified, unshuffled) K-fold, shown to illustrate how far the per-fold
# class ratio can drift from the overall ratio when the split ignores the labels.
# random_state is deliberately not passed: it only matters together with
# shuffle=True, and recent scikit-learn releases raise a ValueError when it is set
# while shuffle is left at its default of False.
kfold = KFold(n_splits=5)
for train, test in kfold.split(df.comment, df.polarity):
    # train / test are positional row indices into df
    print('Train', df.polarity.iloc[train].value_counts() / len(train))
    print('Test', df.polarity.iloc[test].value_counts() / len(test))
    print('===================')

Train 0    0.694215
1    0.305785
Name: polarity, dtype: float64
Test 1    1.0
Name: polarity, dtype: float64
Train 0    0.628099
1    0.371901
Name: polarity, dtype: float64
Test 1    0.741935
0    0.258065
Name: polarity, dtype: float64
Train 1    0.557377
0    0.442623
Name: polarity, dtype: float64
Test 0    1.0
Name: polarity, dtype: float64
Train 0    0.508197
1    0.491803
Name: polarity, dtype: float64
Test 0    0.733333
1    0.266667
Name: polarity, dtype: float64
Train 1    0.508197
0    0.491803
Name: polarity, dtype: float64
Test 0    0.8
1    0.2
Name: polarity, dtype: float64


In [32]:
# Split with stratification and save the row indices of every train/test fold.
# Folds are numbered from 1 and written to dataset/train_<n>.npy / dataset/test_<n>.npy
# (np.save appends the .npy extension).
# random_state is not passed: without shuffle=True it has no effect, and recent
# scikit-learn releases raise a ValueError for that combination. The folds are
# therefore deterministic and unshuffled.
skf = StratifiedKFold(n_splits=5)

for fold, (train, test) in enumerate(skf.split(df.comment, df.polarity), start=1):
    # train / test are positional row indices into df
    np.save(f'dataset/train_{fold}', train)
    np.save(f'dataset/test_{fold}', test)
    print(f'train {fold}\n', df.polarity.iloc[train].value_counts()/len(train))
    print(f'test {fold}\n', df.polarity.iloc[test].value_counts()/len(test))
    print('===================')

train 1
 0    0.553719
1    0.446281
Name: polarity, dtype: float64
test 1
 0    0.548387
1    0.451613
Name: polarity, dtype: float64
train 2
 0    0.553719
1    0.446281
Name: polarity, dtype: float64
test 2
 0    0.548387
1    0.451613
Name: polarity, dtype: float64
train 3
 0    0.557377
1    0.442623
Name: polarity, dtype: float64
test 3
 0    0.533333
1    0.466667
Name: polarity, dtype: float64
train 4
 0    0.54918
1    0.45082
Name: polarity, dtype: float64
test 4
 0    0.566667
1    0.433333
Name: polarity, dtype: float64
train 5
 0    0.54918
1    0.45082
Name: polarity, dtype: float64
test 5
 0    0.566667
1    0.433333
Name: polarity, dtype: float64




## Preprocessing

<ol>
    <li>Case folding <b>(done in the previous notebook)</b></li>
    <li>Cleansing <b>(done in the previous notebook)</b></li>
    <li>Formalization</li>
    <li>Stemming</li>
    <li>Stopword Removal</li>
    <li>Tokenizing</li>
</ol>
    

### Formalization (Manual)

In [50]:
# Load the informal -> formal token mapping.
# The file holds one tab-separated pair per line: "<informal>\t<formal>".
# The formal side is lower-cased and stripped of surrounding whitespace/newline.
formal_dict = {}
with open('resources/formalization_dict.txt', 'r') as file:
    for row in file:
        if not row.strip():
            # tolerate blank lines (e.g. an empty last line) instead of failing
            # on the tuple-unpack below
            continue
        old, new = row.split('\t')
        formal_dict[old] = new.lower().strip()

print(f'There are {len(formal_dict)} token pairs')

There are 51 token pairs


In [53]:
# Replace informal tokens with their formal spelling, one dictionary entry at a time
# (entries are applied in dictionary order, so a later entry may rewrite the output
# of an earlier one).
#
# Each entry is matched as a whole whitespace-delimited token or phrase. The
# lookarounds do not consume the surrounding whitespace, so back-to-back informal
# tokens are all rewritten; a plain str.replace(' word ', ...) skips every second
# one because neighbouring matches would have to share a space.
# Patterns are compiled once, outside the per-comment loop. Empty keys are ignored.
formal_patterns = [
    (re.compile(r'(?<!\S)' + re.escape(false_word) + r'(?!\S)'), true_word)
    for false_word, true_word in formal_dict.items()
    if false_word
]

formal_comment = []

for comment in df.comment:
    sentence = comment
    for pattern, true_word in formal_patterns:
        # a callable replacement keeps any backslash in true_word literal
        sentence = pattern.sub(lambda _match, repl=true_word: repl, sentence)
    # no artificial leading/trailing space is added to the stored comment
    formal_comment.append(sentence)

print(f'We have {len(formal_comment)} comments')

We have 152 comments


### Stemming (Sastrawi)

In [56]:
# Stem every formalized comment with the Sastrawi stemmer.
# Iterates the list itself rather than indexing it by df.shape[0], so the result
# always has exactly one entry per formalized comment.
stemmer = StemmerFactory().create_stemmer()
comment_stemmed = [stemmer.stem(text) for text in formal_comment]

# show the first stemmed comment as a spot check
comment_stemmed[0]

'min bnyk yang kecewa lo dengan update baru alih alih sempurna malah susah nasabah mandiri masuk saya agar kurang tindak tipu jahat waktu ada transaksi yang lebih rb rp maka bisa di tambah security upa kirim nomor verifikasi yang kirim ke nomor hp sms banking trus harus di masuk dalam applikasi mandiri online agar benar bahwa si nasabah sedang laku transaksi dengan demikian pasti tetap aman mohon perhati ya min terimakasih'

In [57]:
# Same comment before stemming, for comparison with the stemmed text.
formal_comment[0]

' min bnyk yang kecewa lo dengan update terbaru alih alih penyempurnaan malah menyusahkan nasabah mandiri masukan saya agar mengurangi tindak penipuan kejahatan sewaktu ada transaksi yang lebih rb rp maka bisa di tambahkan security berupa pengiriman nomor verifikasi yang dikirimkan ke nomor hp sms banking trus harus di masukkan dalam applikasi mandiri online agar benar bahwa si nasabah sedang melakukan transaksi dengan demikian pasti tetap aman mohon diperhatikan ya min terimakasih '

### Stopwords Removal (Manual)

In [58]:
# Hand-picked stopword list, kept as a plain list in a fixed order.
# Written as one whitespace-separated string and split into tokens.
stopwords = (
    'yang untuk pada antara dan di dari hal '
    'dalam atau kah pun dsb dst dll toh ya '
    'saya dengan nya ke si dah'
).split()

print(f'There are {len(stopwords)} stopword list')

There are 23 stopword list


In [66]:
# Remove stopwords from every stemmed comment, token by token.
# Working on tokens (instead of replacing ' word ' substrings) also removes
# stopwords at the very start or end of a comment and stopwords that directly
# follow each other, both of which the space-padded replace left in place.
stopword_set = set(stopwords)

clean_comment = []
for comment in comment_stemmed:
    kept_tokens = [token for token in comment.split() if token not in stopword_set]
    # Every comment is kept, even one that ends up empty, so clean_comment has one
    # entry per stemmed comment, in the same order. The previous emptiness guard
    # read a stale variable from another cell and never dropped anything.
    # NOTE(review): dropping empty comments here would presumably break alignment
    # with the df rows and saved fold indices; filter downstream if needed.
    clean_comment.append(' '.join(kept_tokens))

print(f'We have {len(clean_comment)} comments')

We have 152 comments


In [67]:
# Spot check: first comment after stopword removal.
clean_comment[0]

'min bnyk kecewa lo update baru alih alih sempurna malah susah nasabah mandiri masuk agar kurang tindak tipu jahat waktu ada transaksi lebih rb rp maka bisa tambah security upa kirim nomor verifikasi kirim nomor hp sms banking trus harus masuk applikasi mandiri online agar benar bahwa nasabah sedang laku transaksi demikian pasti tetap aman mohon perhati min terimakasih'

### Tokenizing

In [68]:
# Tokenize: split each cleaned comment on whitespace into a list of tokens.
token_series = list(map(str.split, clean_comment))

# show the token lists of the first two comments
token_series[:2]

[['min',
  'bnyk',
  'kecewa',
  'lo',
  'update',
  'baru',
  'alih',
  'alih',
  'sempurna',
  'malah',
  'susah',
  'nasabah',
  'mandiri',
  'masuk',
  'agar',
  'kurang',
  'tindak',
  'tipu',
  'jahat',
  'waktu',
  'ada',
  'transaksi',
  'lebih',
  'rb',
  'rp',
  'maka',
  'bisa',
  'tambah',
  'security',
  'upa',
  'kirim',
  'nomor',
  'verifikasi',
  'kirim',
  'nomor',
  'hp',
  'sms',
  'banking',
  'trus',
  'harus',
  'masuk',
  'applikasi',
  'mandiri',
  'online',
  'agar',
  'benar',
  'bahwa',
  'nasabah',
  'sedang',
  'laku',
  'transaksi',
  'demikian',
  'pasti',
  'tetap',
  'aman',
  'mohon',
  'perhati',
  'min',
  'terimakasih'],
 ['user',
  'id',
  'password',
  'mesti',
  'bank',
  'gpplah',
  'penting',
  'aman',
  'transaksi']]

## Modelling