In [100]:
import re
from itertools import product

import numpy as np
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB

In [2]:
# Load the labelled comments; the two columns are named 'comment' and 'polarity'.
df = pd.read_excel('dataset/Data 1.xlsx', names=['comment', 'polarity'])
# Display (rows, columns) of the loaded frame.
df.shape

(152, 2)

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,comment,polarity
0,min bnyk yg kecewa lo dgn update terbaru alih ...,1
1,user id password mesti ke bank ya gpplah yg pe...,1
2,saat transfer kadang ada muncul keterangan kon...,1
3,begitu saya update dan no tlpn saya statusnya ...,1
4,tolong tambahkan fitur fingerprint atau face r...,1


## Preprocessing

<ol>
    <li>Case folding <b>(done at previous notebook)</b></li>
    <li>Cleansing <b>(done at previous notebook)</b></li>
    <li>Formalization</li>
    <li>Stemming</li>
    <li>Stopword Removal</li>
    <li>Tokenizing</li>
</ol>
    

### Formalization (Manual)

In [4]:
# Build the informal -> formal token mapping from a tab-separated file,
# one "<informal>\t<formal>" pair per line.
formal_dict = {}
with open('resources/formalization_dict.txt', 'r') as file:
    for row in file:
        # A blank line (e.g. a trailing newline at the end of the file)
        # carries no pair and would otherwise break the unpacking below.
        if not row.strip():
            continue
        old, new = row.split('\t')
        # Only the replacement is normalised; the key is kept as written.
        formal_dict[old] = new.lower().strip()

print(f'There are {len(formal_dict)} token pairs')

There are 51 token pairs


In [5]:
# One whole-word pattern per informal token. The lookarounds accept a token
# only when it is delimited by whitespace or the string boundary and, unlike a
# literal ' token ' replacement, do not consume the separating space, so a
# token repeated back to back (e.g. 'yg yg') is replaced at every occurrence.
formal_patterns = [
    (re.compile(r'(?<!\S)' + re.escape(false_word) + r'(?!\S)'), true_word)
    for false_word, true_word in formal_dict.items()
]

formal_comment = []

for comment in df.comment:
    # Keep the surrounding pad spaces so the stored strings have the same
    # shape as before (leading and trailing blank).
    sentence = ' '+comment+' '
    # Patterns are applied in dictionary order, one after another.
    for pattern, true_word in formal_patterns:
        # A callable replacement inserts true_word verbatim, with no
        # backslash/group-reference processing.
        sentence = pattern.sub(lambda _match: true_word, sentence)
    formal_comment.append(sentence)

print(f'We have {len(formal_comment)} comments')

We have 152 comments


### Stemming (Sastrawi)

In [6]:
# Stem every formalized comment with the Sastrawi stemmer
# (formal_comment holds one entry per row of df).
stemmer = StemmerFactory().create_stemmer()
comment_stemmed = list(map(stemmer.stem, formal_comment))

# Show the first stemmed comment.
comment_stemmed[0]

'min bnyk yang kecewa lo dengan update baru alih alih sempurna malah susah nasabah mandiri masuk saya agar kurang tindak tipu jahat waktu ada transaksi yang lebih rb rp maka bisa di tambah security upa kirim nomor verifikasi yang kirim ke nomor hp sms banking trus harus di masuk dalam applikasi mandiri online agar benar bahwa si nasabah sedang laku transaksi dengan demikian pasti tetap aman mohon perhati ya min terimakasih'

In [7]:
# Show the first comment after formalization (before stemming) for comparison.
formal_comment[0]

' min bnyk yang kecewa lo dengan update terbaru alih alih penyempurnaan malah menyusahkan nasabah mandiri masukan saya agar mengurangi tindak penipuan kejahatan sewaktu ada transaksi yang lebih rb rp maka bisa di tambahkan security berupa pengiriman nomor verifikasi yang dikirimkan ke nomor hp sms banking trus harus di masukkan dalam applikasi mandiri online agar benar bahwa si nasabah sedang melakukan transaksi dengan demikian pasti tetap aman mohon diperhatikan ya min terimakasih '

### Stopwords Removal (Manual)

In [8]:
# Hand-picked stopwords, written as one whitespace-separated string and split
# into a list of tokens.
stopwords = (
    'yang untuk pada antara dan di dari hal '
    'dalam atau kah pun dsb dst dll toh ya '
    'saya dengan nya ke si dah'
).split()

print(f'There are {len(stopwords)} stopword list')

There are 23 stopword list


In [9]:
# Remove stopwords token by token. Filtering whole tokens also removes a
# stopword that is the first or last word of a comment, or that appears
# several times in a row, which a ' word ' substring replacement misses.
stopword_set = set(stopwords)

# Exactly one entry is produced per stemmed comment (possibly an empty
# string): the result is later indexed with the same row positions as
# df.polarity, so no row may be dropped here.
clean_comment = [
    ' '.join(token for token in comment.split() if token not in stopword_set)
    for comment in comment_stemmed
]

print(f'We have {len(clean_comment)} comments')

We have 152 comments


In [10]:
# Show the first comment after stopword removal.
clean_comment[0]

'min bnyk kecewa lo update baru alih alih sempurna malah susah nasabah mandiri masuk agar kurang tindak tipu jahat waktu ada transaksi lebih rb rp maka bisa tambah security upa kirim nomor verifikasi kirim nomor hp sms banking trus harus masuk applikasi mandiri online agar benar bahwa nasabah sedang laku transaksi demikian pasti tetap aman mohon perhati min terimakasih'

### Tokenizing

In [84]:
# Tokenize each cleaned comment on whitespace. Comments have different
# lengths, so the result is a 1-D object array holding one token array per
# comment; it is filled element by element because building a ragged array
# directly with np.array(...) is rejected by recent NumPy versions.
features = np.empty(len(clean_comment), dtype=object)
for row, comment in enumerate(clean_comment):
    features[row] = np.array(comment.split())
features[:2]

array([array(['min', 'bnyk', 'kecewa', 'lo', 'update', 'baru', 'alih', 'alih',
       'sempurna', 'malah', 'susah', 'nasabah', 'mandiri', 'masuk',
       'agar', 'kurang', 'tindak', 'tipu', 'jahat', 'waktu', 'ada',
       'transaksi', 'lebih', 'rb', 'rp', 'maka', 'bisa', 'tambah',
       'security', 'upa', 'kirim', 'nomor', 'verifikasi', 'kirim',
       'nomor', 'hp', 'sms', 'banking', 'trus', 'harus', 'masuk',
       'applikasi', 'mandiri', 'online', 'agar', 'benar', 'bahwa',
       'nasabah', 'sedang', 'laku', 'transaksi', 'demikian', 'pasti',
       'tetap', 'aman', 'mohon', 'perhati', 'min', 'terimakasih'],
      dtype='<U11'),
       array(['user', 'id', 'password', 'mesti', 'bank', 'gpplah', 'penting',
       'aman', 'transaksi'], dtype='<U9')], dtype=object)

## Split Dataset


> <b>Warning</b>: Don't run the code in this section if you have already split the dataset and saved the fold indices; running it again overwrites the saved files.

In [69]:
# Share of each polarity class in the whole dataset.
df.polarity.value_counts()/df.shape[0]

0    0.552632
1    0.447368
Name: polarity, dtype: float64

In [86]:
# Polarity labels as a NumPy array, one per row of df.
labels = np.array(df.polarity)
# Display the number of labels.
len(labels)

152

In [72]:
# Plain (non-stratified, unshuffled) 5-fold split; print the class proportions
# of every train/test fold. random_state is not passed: it only applies when
# shuffle=True, and recent scikit-learn versions raise an error if it is set
# while shuffling is off.
kfold = KFold(n_splits=5)
for train, test in kfold.split(features, labels):
    # Column 1 of df is 'polarity'.
    print('Train', df.iloc[train, 1].value_counts() / len(train))
    print('Test', df.iloc[test, 1].value_counts() / len(test))
    print('===================')

Train 0    0.694215
1    0.305785
Name: polarity, dtype: float64
Test 1    1.0
Name: polarity, dtype: float64
Train 0    0.628099
1    0.371901
Name: polarity, dtype: float64
Test 1    0.741935
0    0.258065
Name: polarity, dtype: float64
Train 1    0.557377
0    0.442623
Name: polarity, dtype: float64
Test 0    1.0
Name: polarity, dtype: float64
Train 0    0.508197
1    0.491803
Name: polarity, dtype: float64
Test 0    0.733333
1    0.266667
Name: polarity, dtype: float64
Train 1    0.508197
0    0.491803
Name: polarity, dtype: float64
Test 0    0.8
1    0.2
Name: polarity, dtype: float64




In [73]:
# Split with stratification and save the row indices of every train/test fold
# to dataset/train_<k>.npy and dataset/test_<k>.npy (k = 1..5).
# random_state is not passed: it only applies when shuffle=True, and recent
# scikit-learn versions raise an error if it is set while shuffling is off.
skf = StratifiedKFold(n_splits=5)

for i, (train, test) in enumerate(skf.split(df.comment, df.polarity), start=1):
    np.save(f'dataset/train_{i}', train)
    np.save(f'dataset/test_{i}', test)
    # Class proportions of each fold (column 1 of df is 'polarity').
    print(f'train {i}\n', df.iloc[train, 1].value_counts()/len(train))
    print(f'test {i}\n', df.iloc[test, 1].value_counts()/len(test))
    print('===================')

train 1
 0    0.553719
1    0.446281
Name: polarity, dtype: float64
test 1
 0    0.548387
1    0.451613
Name: polarity, dtype: float64
train 2
 0    0.553719
1    0.446281
Name: polarity, dtype: float64
test 2
 0    0.548387
1    0.451613
Name: polarity, dtype: float64
train 3
 0    0.557377
1    0.442623
Name: polarity, dtype: float64
test 3
 0    0.533333
1    0.466667
Name: polarity, dtype: float64
train 4
 0    0.54918
1    0.45082
Name: polarity, dtype: float64
test 4
 0    0.566667
1    0.433333
Name: polarity, dtype: float64
train 5
 0    0.54918
1    0.45082
Name: polarity, dtype: float64
test 5
 0    0.566667
1    0.433333
Name: polarity, dtype: float64


## Modelling

### Multinomial Naive Bayes

In [141]:
NUM_BATCHES = 5
features = np.array(clean_comment)
labels = np.array(df.polarity)
smoothing_parameter = [1.0, .1, .01, .001]
train_eval, test_eval = [], []

for batch in range(1, NUM_BATCHES + 1):
    # Row positions of this fold, loaded from the saved split files.
    train_idx = np.load(f'dataset/train_{batch}.npy')
    test_idx = np.load(f'dataset/test_{batch}.npy')
    train_labels, test_labels = labels[train_idx], labels[test_idx]

    # Bag-of-words counts; the vocabulary is fitted on the training fold only.
    vectorizer = CountVectorizer()
    train_features = vectorizer.fit_transform(features[train_idx])
    test_features = vectorizer.transform(features[test_idx])

    # Scores are appended fold by fold, one entry per alpha, in list order.
    for alpha in smoothing_parameter:
        clf = MultinomialNB(alpha=alpha)
        clf.fit(train_features, train_labels)
        train_eval.append(clf.score(train_features, train_labels))
        test_eval.append(clf.score(test_features, test_labels))

print('Train and evaluate model completed')

Train and evaluate model completed


In [147]:
NUM_PARAMS = len(smoothing_parameter)

# Scores were appended fold by fold with one entry per parameter setting, so
# every NUM_PARAMS-th entry (starting at offset k) belongs to setting k.
train_history = [train_eval[k::NUM_PARAMS] for k in range(NUM_PARAMS)]
test_history = [test_eval[k::NUM_PARAMS] for k in range(NUM_PARAMS)]

# append the average accuracy as the last element of every per-setting list
for scores in train_history + test_history:
    scores.append(sum(scores) / NUM_BATCHES)

# Sanity check: one score per fold plus the average.
if len(train_history[-1]) == NUM_BATCHES+1:
    print('Done')

Done


In [159]:
# One 'Train Acc' and one 'Test Acc' column per alpha; rows are the five
# folds followed by their average.
eval_history = {}
for alpha, train_scores, test_scores in zip(smoothing_parameter, train_history, test_history):
    eval_history[f'Train Acc (a={alpha})'] = train_scores
    eval_history[f'Test Acc (a={alpha})'] = test_scores

row_labels = [f'Batch-{n}' for n in range(1, 6)] + ['Average']
history = pd.DataFrame(eval_history, index=row_labels)
history

Unnamed: 0,Train Acc (a=1.0),Test Acc (a=1.0),Train Acc (a=0.1),Test Acc (a=0.1),Train Acc (a=0.01),Test Acc (a=0.01),Train Acc (a=0.001),Test Acc (a=0.001)
Batch-1,0.975207,0.741935,1.0,0.677419,1.0,0.709677,1.0,0.677419
Batch-2,0.950413,0.903226,0.975207,0.774194,0.975207,0.612903,0.975207,0.612903
Batch-3,0.97541,0.8,0.97541,0.766667,0.983607,0.733333,0.983607,0.7
Batch-4,0.967213,0.766667,0.983607,0.833333,0.991803,0.833333,0.991803,0.8
Batch-5,0.97541,0.4,0.991803,0.333333,0.991803,0.3,0.991803,0.3
Average,0.968731,0.722366,0.985205,0.676989,0.988484,0.637849,0.988484,0.618065


The lower the alpha value, the lower the accuracy on the test data, while the accuracy on the training data rises (overfitting).

We got <b>the best result</b> from Multinomial Naive Bayes Model with <b>alpha = 1.0</b> that is: <h3>72.24%</h3>

### Multinomial Naive Bayes + TF-IDF

<h3>Important</h3>
<ol>
    <li>ngram_range</li>
    <li>max_df: occurred in too many documents(common word)</li>
    <li>min_df: occurred in too few documents (typo, alay)</li>
</ol>

In [94]:
# Search grid for the TF-IDF vectorizer: n-gram ranges from unigrams up to
# trigrams, and document-frequency bounds given as fractions.
ngram = [(1, n) for n in (1, 2, 3)]
min_df = [0.0, 0.1, 0.2, 0.3]
max_df = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

# Cartesian product, ordered as (ngram, min_df, max_df).
param_combinations = list(product(ngram, min_df, max_df))
param_combinations[:3]

[((1, 1), 0.0, 0.4), ((1, 1), 0.0, 0.5), ((1, 1), 0.0, 0.6)]

In [95]:
NUM_BATCHES = 5
features = np.array(clean_comment)
labels = np.array(df.polarity)
train_eval, test_eval = [], []

for batch in range(1, NUM_BATCHES + 1):
    # Row positions of this fold, loaded from the saved split files.
    train_idx = np.load(f'dataset/train_{batch}.npy')
    test_idx = np.load(f'dataset/test_{batch}.npy')
    train_labels, test_labels = labels[train_idx], labels[test_idx]
    train_docs, test_docs = features[train_idx], features[test_idx]

    # Scores are appended fold by fold, one entry per parameter combination.
    for param in param_combinations:
        ngram_span, df_floor, df_ceiling = param[:3]
        tfidf_vectorizer = TfidfVectorizer(ngram_range=ngram_span, min_df=df_floor, max_df=df_ceiling)
        # The vectorizer is fitted on the training fold only.
        train_features = tfidf_vectorizer.fit_transform(train_docs)
        test_features = tfidf_vectorizer.transform(test_docs)

        # train and evaluate
        clf = MultinomialNB()
        clf.fit(train_features, train_labels)
        train_eval.append(clf.score(train_features, train_labels))
        test_eval.append(clf.score(test_features, test_labels))

print('Train and evaluate model completed')

Train and evaluate model completed


In [96]:
NUM_PARAMS = len(param_combinations)

# Scores were appended fold by fold with one entry per parameter combination,
# so every NUM_PARAMS-th entry (starting at offset k) belongs to combination k.
train_history = [train_eval[k::NUM_PARAMS] for k in range(NUM_PARAMS)]
test_history = [test_eval[k::NUM_PARAMS] for k in range(NUM_PARAMS)]

# append the average accuracy as the last element of every per-combination list
for scores in train_history + test_history:
    scores.append(sum(scores) / NUM_BATCHES)

# Sanity check: one score per fold plus the average.
if len(train_history[-1]) == NUM_BATCHES+1:
    print('Done')

Done


In [97]:
# One 'Train Acc' and one 'Test Acc' column per parameter combination; rows
# are the five folds followed by their average.
eval_history = {}
for param, train_scores, test_scores in zip(param_combinations, train_history, test_history):
    label = f'(ngram={param[0]}), min_df={param[1]}, max_df={param[2]}'
    eval_history[f'Train Acc {label}'] = train_scores
    eval_history[f'Test Acc {label}'] = test_scores

row_labels = [f'Batch-{n}' for n in range(1, 6)] + ['Average']
history_tfidf = pd.DataFrame(eval_history, index=row_labels)
history_tfidf

Unnamed: 0,"Train Acc (ngram=(1, 1)), min_df=0.0, max_df=0.4","Test Acc (ngram=(1, 1)), min_df=0.0, max_df=0.4","Train Acc (ngram=(1, 1)), min_df=0.0, max_df=0.5","Test Acc (ngram=(1, 1)), min_df=0.0, max_df=0.5","Train Acc (ngram=(1, 1)), min_df=0.0, max_df=0.6","Test Acc (ngram=(1, 1)), min_df=0.0, max_df=0.6","Train Acc (ngram=(1, 1)), min_df=0.0, max_df=0.7","Test Acc (ngram=(1, 1)), min_df=0.0, max_df=0.7","Train Acc (ngram=(1, 1)), min_df=0.0, max_df=0.8","Test Acc (ngram=(1, 1)), min_df=0.0, max_df=0.8",...,"Train Acc (ngram=(1, 3)), min_df=0.3, max_df=0.6","Test Acc (ngram=(1, 3)), min_df=0.3, max_df=0.6","Train Acc (ngram=(1, 3)), min_df=0.3, max_df=0.7","Test Acc (ngram=(1, 3)), min_df=0.3, max_df=0.7","Train Acc (ngram=(1, 3)), min_df=0.3, max_df=0.8","Test Acc (ngram=(1, 3)), min_df=0.3, max_df=0.8","Train Acc (ngram=(1, 3)), min_df=0.3, max_df=0.9","Test Acc (ngram=(1, 3)), min_df=0.3, max_df=0.9","Train Acc (ngram=(1, 3)), min_df=0.3, max_df=1.0","Test Acc (ngram=(1, 3)), min_df=0.3, max_df=1.0"
Batch-1,0.991736,0.709677,0.991736,0.709677,0.991736,0.709677,0.991736,0.709677,0.991736,0.709677,...,0.834711,0.935484,0.834711,0.935484,0.834711,0.935484,0.834711,0.935484,0.834711,0.935484
Batch-2,0.975207,0.774194,0.975207,0.774194,0.975207,0.774194,0.975207,0.774194,0.975207,0.774194,...,0.826446,0.935484,0.826446,0.935484,0.826446,0.935484,0.826446,0.935484,0.826446,0.935484
Batch-3,0.97541,0.7,0.97541,0.7,0.97541,0.7,0.97541,0.7,0.97541,0.7,...,0.860656,0.933333,0.860656,0.933333,0.860656,0.933333,0.860656,0.933333,0.860656,0.933333
Batch-4,0.97541,0.8,0.97541,0.733333,0.97541,0.733333,0.97541,0.733333,0.97541,0.733333,...,0.868852,0.9,0.868852,0.9,0.868852,0.9,0.868852,0.9,0.868852,0.9
Batch-5,0.959016,0.466667,0.959016,0.533333,0.959016,0.533333,0.959016,0.533333,0.959016,0.533333,...,0.868852,0.6,0.868852,0.6,0.868852,0.6,0.868852,0.6,0.868852,0.6
Average,0.975356,0.690108,0.975356,0.690108,0.975356,0.690108,0.975356,0.690108,0.975356,0.690108,...,0.851904,0.86086,0.851904,0.86086,0.851904,0.86086,0.851904,0.86086,0.851904,0.86086


In [98]:
# Name of the test-accuracy column with the highest fold average.
test_col = list(filter(lambda col: col.startswith('Test'), history_tfidf.columns))
average_test_acc = history_tfidf.loc['Average', test_col]
best_position = average_test_acc.argmax()
test_col[best_position]

'Test Acc (ngram=(1, 1)), min_df=0.3, max_df=0.5'

In [99]:
# Train/test accuracy per fold for the column selected in the previous cell.
history_tfidf[['Train Acc (ngram=(1, 1)), min_df=0.3, max_df=0.5', 'Test Acc (ngram=(1, 1)), min_df=0.3, max_df=0.5']]

Unnamed: 0,"Train Acc (ngram=(1, 1)), min_df=0.3, max_df=0.5","Test Acc (ngram=(1, 1)), min_df=0.3, max_df=0.5"
Batch-1,0.834711,0.935484
Batch-2,0.826446,0.935484
Batch-3,0.860656,0.933333
Batch-4,0.868852,0.9
Batch-5,0.868852,0.6
Average,0.851904,0.86086



We got <b>the best result</b> from the Multinomial Naive Bayes model when
a word must occur in <b>at least 30%</b> of the documents (<b>minimum</b>),
in <b>at most 50%</b> of the documents (<b>maximum</b>),
and <b>only unigrams are used</b>, that is: <h3>86.09%</h3>

### Multinomial Naive Bayes +  Information Gain

In [166]:
# Information gain (mutual information with the label) of every token,
# computed on the training part of fold 1 only.
train_idx = np.load('dataset/train_1.npy')
test_idx = np.load('dataset/test_1.npy')
train_labels = labels[train_idx]
test_labels = labels[test_idx]

vectorizer = CountVectorizer()
train_count_features = vectorizer.fit_transform(features[train_idx])

# Token names in column order, taken from the fitted token -> column-index
# mapping; this attribute is available across scikit-learn versions, whereas
# get_feature_names() no longer exists in recent releases.
vocabulary = vectorizer.vocabulary_
feature_names = sorted(vocabulary, key=vocabulary.get)

ig_scores = mutual_info_classif(train_count_features, train_labels, discrete_features=True)

# (token, score) pairs, highest score first.
ig_res = sorted(zip(feature_names, ig_scores), key=lambda x: x[1], reverse=True)
ig_res[:5]

[('aman', 0.3195094428637498),
 ('bahaya', 0.1436441992148773),
 ('hati', 0.07445484833759905),
 ('kalau', 0.07350923605698463),
 ('hack', 0.06330048710739114)]

In [212]:
def remove_token(bad_token, dataset):
    """Drop every occurrence of the given tokens from each comment.

    Each comment is split on whitespace, tokens contained in ``bad_token``
    are discarded (including tokens repeated back to back), and the remaining
    tokens are joined again with single spaces.

    Parameters
    ----------
    bad_token : iterable of str
        Tokens to remove.
    dataset : iterable of str
        Comments to clean.

    Returns
    -------
    numpy.ndarray
        One cleaned string per input comment, in the same order. A comment
        made up only of removed tokens becomes an empty string.
    """
    # Set membership keeps the cost per token constant, independent of how
    # many tokens have to be removed.
    unwanted = set(bad_token)
    clean_dataset = [
        ' '.join(token for token in comment.split() if token not in unwanted)
        for comment in dataset
    ]
    return np.array(clean_dataset)


In [213]:
NUM_BATCHES = 5
ig_tresh = [1e-2, 1e-3, 1e-4, 1e-100]
features = np.array(clean_comment)
labels = np.array(df.polarity)
train_eval, test_eval = [], []

for i in range(NUM_BATCHES):

    # Row positions of this fold, loaded from the saved split files.
    train_idx = np.load(f'dataset/train_{i+1}.npy')
    test_idx = np.load(f'dataset/test_{i+1}.npy')
    train_labels = labels[train_idx]
    test_labels = labels[test_idx]

    # Information gain is computed from the training fold only.
    vectorizer = CountVectorizer()
    train_count_features = vectorizer.fit_transform(features[train_idx])

    # Token names in column order, taken from the fitted token -> column-index
    # mapping; this attribute is available across scikit-learn versions,
    # whereas get_feature_names() no longer exists in recent releases.
    vocabulary = vectorizer.vocabulary_
    feature_names = sorted(vocabulary, key=vocabulary.get)
    ig_scores = mutual_info_classif(train_count_features, train_labels, discrete_features=True)
    # (token, score) pairs, highest score first.
    ig_res = sorted(zip(feature_names, ig_scores), key=lambda x: x[1], reverse=True)

    # Scores are appended fold by fold, one entry per threshold, in list order.
    below_tresh = []
    for j in range(len(ig_tresh)):
        # Tokens whose information gain is below the current threshold.
        below_tresh.append([ig[0] for ig in ig_res if ig[1] < ig_tresh[j]])

        new_features = remove_token(below_tresh[j], features)

        tfidf_vectorizer = TfidfVectorizer()
        train_features = tfidf_vectorizer.fit_transform(new_features[train_idx])
        test_features = tfidf_vectorizer.transform(new_features[test_idx])

        # train and evaluate
        clf = MultinomialNB()
        clf.fit(train_features, train_labels)
        train_acc = clf.score(train_features, train_labels)
        test_acc = clf.score(test_features, test_labels)

        train_eval.append(train_acc)
        test_eval.append(test_acc)

print('Train and evaluate model completed')

Train and evaluate model completed


In [215]:
NUM_PARAMS = len(ig_tresh)

# Scores were appended fold by fold with one entry per threshold, so every
# NUM_PARAMS-th entry (starting at offset k) belongs to threshold k.
train_history = [train_eval[k::NUM_PARAMS] for k in range(NUM_PARAMS)]
test_history = [test_eval[k::NUM_PARAMS] for k in range(NUM_PARAMS)]

# append the average accuracy as the last element of every per-threshold list
for scores in train_history + test_history:
    scores.append(sum(scores) / NUM_BATCHES)

# Sanity check: one score per fold plus the average.
if len(train_history[-1]) == NUM_BATCHES+1:
    print('Done')

Done


In [217]:
# One 'Train Acc' and one 'Test Acc' column per threshold; rows are the five
# folds followed by their average.
eval_history = {}
for tresh, train_scores, test_scores in zip(ig_tresh, train_history, test_history):
    eval_history[f'Train Acc (tresh={tresh})'] = train_scores
    eval_history[f'Test Acc (tresh={tresh})'] = test_scores

row_labels = [f'Batch-{n}' for n in range(1, 6)] + ['Average']
history_ig = pd.DataFrame(eval_history, index=row_labels)
history_ig

Unnamed: 0,Train Acc (tresh=0.01),Test Acc (tresh=0.01),Train Acc (tresh=0.001),Test Acc (tresh=0.001),Train Acc (tresh=0.0001),Test Acc (tresh=0.0001),Train Acc (tresh=1e-100),Test Acc (tresh=1e-100)
Batch-1,0.950413,0.677419,1.0,0.677419,1.0,0.709677,0.991736,0.709677
Batch-2,0.942149,0.870968,0.975207,0.806452,0.975207,0.774194,0.975207,0.774194
Batch-3,0.967213,0.766667,0.97541,0.733333,0.97541,0.7,0.97541,0.7
Batch-4,0.934426,0.733333,0.983607,0.733333,0.97541,0.766667,0.97541,0.733333
Batch-5,0.97541,0.466667,0.967213,0.533333,0.959016,0.466667,0.959016,0.533333
Average,0.953922,0.703011,0.980287,0.696774,0.977009,0.683441,0.975356,0.690108


If we compare with the model that does <b>not use information gain</b>, we get <b>72.24%</b> test accuracy.
<b>Using information gain</b> with the best threshold, we get:
<h3>70.3%</h3>

> Recommend to not use INFORMATION GAIN

### Multinomial Naive Bayes +  Information Gain + TF-IDF

In [218]:
# Search grid: TF-IDF vectorizer settings combined with the information-gain
# thresholds.
ngram = [(1, n) for n in (1, 2, 3)]
min_df = [0.0, 0.1, 0.2, 0.3]
max_df = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
ig_tresh = [1e-2, 1e-3, 1e-4, 1e-100]

# Cartesian product, ordered as (ngram, min_df, max_df, threshold).
param_combinations = list(product(ngram, min_df, max_df, ig_tresh))
param_combinations[:3]

[((1, 1), 0.0, 0.4, 0.01),
 ((1, 1), 0.0, 0.4, 0.001),
 ((1, 1), 0.0, 0.4, 0.0001)]

In [219]:
NUM_BATCHES = 5
features = np.array(clean_comment)
labels = np.array(df.polarity)
train_eval, test_eval = [], []

for i in range(NUM_BATCHES):

    # Row positions of this fold, loaded from the saved split files.
    train_idx = np.load(f'dataset/train_{i+1}.npy')
    test_idx = np.load(f'dataset/test_{i+1}.npy')
    train_labels = labels[train_idx]
    test_labels = labels[test_idx]

    # Information gain is computed from the training fold only.
    vectorizer = CountVectorizer()
    train_count_features = vectorizer.fit_transform(features[train_idx])

    # Token names in column order, taken from the fitted token -> column-index
    # mapping; this attribute is available across scikit-learn versions,
    # whereas get_feature_names() no longer exists in recent releases.
    vocabulary = vectorizer.vocabulary_
    feature_names = sorted(vocabulary, key=vocabulary.get)
    ig_scores = mutual_info_classif(train_count_features, train_labels, discrete_features=True)
    # (token, score) pairs, highest score first.
    ig_res = sorted(zip(feature_names, ig_scores), key=lambda x: x[1], reverse=True)

    # The filtered corpus depends only on the fold and the threshold, not on
    # the vectorizer settings, so it is built once per threshold and reused
    # for every combination that shares it.
    filtered_by_tresh = {}

    # Scores are appended fold by fold, one entry per parameter combination.
    for param in param_combinations:
        tresh = param[-1]
        if tresh not in filtered_by_tresh:
            below_tresh = [ig[0] for ig in ig_res if ig[1] < tresh]
            filtered_by_tresh[tresh] = remove_token(below_tresh, features)
        new_features = filtered_by_tresh[tresh]

        tfidf_vectorizer = TfidfVectorizer(ngram_range=param[0], min_df=param[1], max_df=param[2])
        train_features = tfidf_vectorizer.fit_transform(new_features[train_idx])
        test_features = tfidf_vectorizer.transform(new_features[test_idx])

        # train and evaluate
        clf = MultinomialNB()
        clf.fit(train_features, train_labels)
        train_acc = clf.score(train_features, train_labels)
        test_acc = clf.score(test_features, test_labels)

        train_eval.append(train_acc)
        test_eval.append(test_acc)

print('Train and evaluate model completed')

Train and evaluate model completed


In [220]:
NUM_PARAMS = len(param_combinations)

# Scores were appended fold by fold with one entry per parameter combination,
# so every NUM_PARAMS-th entry (starting at offset k) belongs to combination k.
train_history = [train_eval[k::NUM_PARAMS] for k in range(NUM_PARAMS)]
test_history = [test_eval[k::NUM_PARAMS] for k in range(NUM_PARAMS)]

# append the average accuracy as the last element of every per-combination list
for scores in train_history + test_history:
    scores.append(sum(scores) / NUM_BATCHES)

# Sanity check: one score per fold plus the average.
if len(train_history[-1]) == NUM_BATCHES+1:
    print('Done')

Done


In [223]:
# One 'Train Acc' and one 'Test Acc' column per parameter combination; rows
# are the five folds followed by their average. The column labels are used
# verbatim to select columns afterwards, so their format must not change.
eval_history = {}
for param, train_scores, test_scores in zip(param_combinations, train_history, test_history):
    label = f'(ngram={param[0]}), min_df={param[1]}, max_df={param[2]}, tresh={param[-1]}'
    eval_history[f'Train Acc {label}'] = train_scores
    eval_history[f'Test Acc {label}'] = test_scores

history_ig_tfidf = pd.DataFrame(eval_history, index=['Batch-1', 'Batch-2', 'Batch-3',
                                                  'Batch-4', 'Batch-5', 'Average'])
history_ig_tfidf

Unnamed: 0,"Train Acc (ngram=(1, 1)), min_df=0.0, max_df=0.4, tresh=0.01","Test Acc (ngram=(1, 1)), min_df=0.0, max_df=0.4, tresh=0.01","Train Acc (ngram=(1, 1)), min_df=0.0, max_df=0.4, tresh=0.001","Test Acc (ngram=(1, 1)), min_df=0.0, max_df=0.4, tresh=0.001","Train Acc (ngram=(1, 1)), min_df=0.0, max_df=0.4, tresh=0.0001","Test Acc (ngram=(1, 1)), min_df=0.0, max_df=0.4, tresh=0.0001","Train Acc (ngram=(1, 1)), min_df=0.0, max_df=0.4, tresh=1e-100","Test Acc (ngram=(1, 1)), min_df=0.0, max_df=0.4, tresh=1e-100","Train Acc (ngram=(1, 1)), min_df=0.0, max_df=0.5, tresh=0.01","Test Acc (ngram=(1, 1)), min_df=0.0, max_df=0.5, tresh=0.01",...,"Train Acc (ngram=(1, 3)), min_df=0.3, max_df=0.9, tresh=1e-100","Test Acc (ngram=(1, 3)), min_df=0.3, max_df=0.9, tresh=1e-100","Train Acc (ngram=(1, 3)), min_df=0.3, max_df=1.0, tresh=0.01","Test Acc (ngram=(1, 3)), min_df=0.3, max_df=1.0, tresh=0.01","Train Acc (ngram=(1, 3)), min_df=0.3, max_df=1.0, tresh=0.001","Test Acc (ngram=(1, 3)), min_df=0.3, max_df=1.0, tresh=0.001","Train Acc (ngram=(1, 3)), min_df=0.3, max_df=1.0, tresh=0.0001","Test Acc (ngram=(1, 3)), min_df=0.3, max_df=1.0, tresh=0.0001","Train Acc (ngram=(1, 3)), min_df=0.3, max_df=1.0, tresh=1e-100","Test Acc (ngram=(1, 3)), min_df=0.3, max_df=1.0, tresh=1e-100"
Batch-1,0.950413,0.677419,1.0,0.677419,0.991736,0.709677,0.991736,0.709677,0.950413,0.677419,...,0.834711,0.935484,0.834711,0.935484,0.834711,0.935484,0.834711,0.935484,0.834711,0.935484
Batch-2,0.942149,0.870968,0.975207,0.806452,0.975207,0.806452,0.975207,0.774194,0.942149,0.870968,...,0.826446,0.935484,0.826446,0.935484,0.826446,0.935484,0.826446,0.935484,0.826446,0.935484
Batch-3,0.967213,0.766667,0.97541,0.733333,0.97541,0.7,0.97541,0.7,0.967213,0.766667,...,0.860656,0.933333,0.860656,0.933333,0.860656,0.933333,0.860656,0.933333,0.860656,0.933333
Batch-4,0.934426,0.766667,0.983607,0.766667,0.97541,0.8,0.97541,0.8,0.934426,0.733333,...,0.868852,0.9,0.868852,0.9,0.868852,0.9,0.868852,0.9,0.868852,0.9
Batch-5,0.909836,0.533333,0.959016,0.433333,0.959016,0.466667,0.959016,0.466667,0.97541,0.466667,...,0.868852,0.6,0.868852,0.6,0.868852,0.6,0.868852,0.6,0.868852,0.6
Average,0.940807,0.723011,0.978648,0.683441,0.975356,0.696559,0.975356,0.690108,0.953922,0.703011,...,0.851904,0.86086,0.851904,0.86086,0.851904,0.86086,0.851904,0.86086,0.851904,0.86086


In [224]:
# Name of the test-accuracy column with the highest fold average.
test_col = list(filter(lambda col: col.startswith('Test'), history_ig_tfidf.columns))
average_test_acc = history_ig_tfidf.loc['Average', test_col]
best_position = average_test_acc.argmax()
test_col[best_position]


'Test Acc (ngram=(1, 1)), min_df=0.3, max_df=0.5, tresh=0.01'

In [225]:
# Train/test accuracy per fold for the column selected in the previous cell (threshold 0.01).
history_ig_tfidf[['Train Acc (ngram=(1, 1)), min_df=0.3, max_df=0.5, tresh=0.01', 'Test Acc (ngram=(1, 1)), min_df=0.3, max_df=0.5, tresh=0.01']]

Unnamed: 0,"Train Acc (ngram=(1, 1)), min_df=0.3, max_df=0.5, tresh=0.01","Test Acc (ngram=(1, 1)), min_df=0.3, max_df=0.5, tresh=0.01"
Batch-1,0.834711,0.935484
Batch-2,0.826446,0.935484
Batch-3,0.860656,0.933333
Batch-4,0.868852,0.9
Batch-5,0.868852,0.6
Average,0.851904,0.86086


In [227]:
# Same vectorizer settings with the smallest threshold (1e-100), shown for comparison.
history_ig_tfidf[['Train Acc (ngram=(1, 1)), min_df=0.3, max_df=0.5, tresh=1e-100', 'Test Acc (ngram=(1, 1)), min_df=0.3, max_df=0.5, tresh=1e-100']]

Unnamed: 0,"Train Acc (ngram=(1, 1)), min_df=0.3, max_df=0.5, tresh=1e-100","Test Acc (ngram=(1, 1)), min_df=0.3, max_df=0.5, tresh=1e-100"
Batch-1,0.834711,0.935484
Batch-2,0.826446,0.935484
Batch-3,0.860656,0.933333
Batch-4,0.868852,0.9
Batch-5,0.868852,0.6
Average,0.851904,0.86086


We get the same score with and without information gain, that is:
<h3>86.09%</h3>

> Recommend to not use INFORMATION GAIN

### Multinomial Naive Bayes +  Chi Square