# Sentiment Analysis Using SVM-RBF with GA Optimization

### Import Library

In [None]:
import pandas as pd
# None disables column-width truncation so full texts are displayed; the
# legacy value -1 is deprecated since pandas 1.0 and rejected by newer releases.
pd.set_option('display.max_colwidth', None)


import re
import string 

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


import pickle
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")
sns.set(font_scale=1.3)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, label_binarize
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Continuous
from sklearn_genetic.plots import plot_fitness_evolution, plot_search_space
from wordcloud import WordCloud

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import warnings
warnings.filterwarnings('ignore')
np.random.seed(37)

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
factory = StemmerFactory()
stemmer = factory.create_stemmer()

### Input Dataset

In [None]:
# Load the semicolon-separated, ISO-8859-1 encoded corpus and name its
# columns 'Teks' (text) and 'Label'.
data = pd.read_csv('dataset.csv', sep=';', encoding="ISO-8859-1")
data.columns = ['Teks', 'Label']

# Replace every label with its integer category code.
label_codes = data['Label'].astype('category').cat.codes
data['Label'] = label_codes
data

0: Negatif,
1: Netral,
2: Positif

In [None]:
# Bar chart of the number of samples per label.
# `catplot` is the current name of `factorplot`, and `height` of `size`;
# the old spellings are deprecated since seaborn 0.9 and no longer available
# in recent releases.
sns.catplot(x="Label", data=data, kind="count", height=6, aspect=1.5, palette="PuBuGn_d")
plt.show()

### Pre-Processing

Case Folding

In [None]:
def casefolding(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

In [None]:
# Lowercase every raw text and store the result in a new column.
raw_texts = data['Teks']
data['Teks_Casefolding'] = raw_texts.apply(casefolding)
data

Remove Number

In [None]:
def remove(text):
    """Return ``text`` with every run of ASCII digits (0-9) deleted."""
    digit_run = re.compile(r'[0-9]+')
    return digit_run.sub('', text)
    
# Strip digits from the lowercased texts; `remove` is passed directly
# instead of being wrapped in a lambda.
data['Remove_Number'] = data['Teks_Casefolding'].apply(remove)
data

Tokenization

In [None]:
def tokenizing(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

In [None]:
# Tokenize the digit-free texts into word lists.
digit_free_texts = data['Remove_Number']
data['Teks_Tokenizing'] = digit_free_texts.apply(tokenizing)
data

Stopword Removal

In [None]:
# Base stopword list read from stopwordsID.csv (the file has no header row).
# NOTE: this rebinds the name `stopwords` imported from nltk.corpus above.
stopwords = pd.read_csv('stopwordsID.csv', header=None)

# Additional tokens that should also be treated as stopwords.
extend = pd.DataFrame(["yg", "dg", "rt", "dgn", "ny", "d", 'klo', 'kalo', 'amp', 'biar',
                       'gak', 'ga', 'krn', 'nya', 'nih', 'sih',
                       'si', 'tau', 'tdk', 'tuh', 'utk', 'ya',
                       'jd', 'jgn', 'sdh', 'aja', 'n', 't',
                       'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                       '&amp', 'yah', 'my', 'rb', 'jr', 'rp', 'hr', 'di', 'kb', 'gb', 'kk', 'min', 'minn', 'pdhal',
                       'kak', 'kip', 'wabarukatu', 'nila', 'swastiyastu',
                       'undiksha', 'yuk', 'yes', 'you',
                       'undiksa'])

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat produces the same row-wise result on every pandas version.
stopwords = pd.concat([stopwords, extend], ignore_index=True)

# Set of the first column's values, giving constant-time membership tests.
list_stopwords = set(stopwords.iloc[:, 0])

def stopwords_removing(words):
    """Return the tokens of ``words`` that are not in ``list_stopwords``.

    The relative order of the kept tokens is unchanged.
    """
    return [token for token in words if token not in list_stopwords]

In [None]:
# Drop stopwords from every token list.
token_lists = data['Teks_Tokenizing']
data['Teks_SR'] = token_lists.apply(stopwords_removing)
data

Stemming

In [None]:
def stemming(words):
    """Stem each token in ``words`` and return the stems joined by single spaces."""
    return ' '.join(stemmer.stem(token) for token in words)

In [None]:
# Stem the stopword-free tokens and store each document as one string.
filtered_tokens = data['Teks_SR']
data['Teks_Stemming'] = filtered_tokens.apply(stemming)
data

### TF-IDF

In [None]:
# Fit TF-IDF on the stemmed texts and expose the weights as a dense
# DataFrame with one column per vocabulary term.
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(list(data['Teks_Stemming']))

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()
df_tfidf = pd.DataFrame(x.toarray(), columns=feature_names)

In [None]:
# Display the TF-IDF feature matrix.
df_tfidf

In [None]:
# Feature matrix (TF-IDF weights) and target vector (encoded labels).
X, y = df_tfidf, data['Label']

### Scatter Plot Dataset

In [None]:
# Fit a 3-component PCA on the features, then project them onto it.
pca = PCA(n_components=3)
pca.fit(X)
X_pca = pca.transform(X)

In [None]:
# Scatter plot of the first two principal components, colored by label.
# x and y are passed by keyword: positional data vectors were deprecated in
# seaborn 0.11 and are rejected by later releases.
sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=y, legend='full', palette="Set2")

### Wordcloud

In [None]:
# Summed TF-IDF weight of each term over all documents. It is computed once
# and reused; the original evaluated it twice and discarded the first result.
term_weights = df_tfidf.T.sum(axis=1)

# Word cloud in which each term is sized by its summed weight.
wordcloud = WordCloud(background_color="white", width=3000, height=2000).generate_from_frequencies(term_weights)
plt.imshow(wordcloud)

### Split Data Training & Testing

In [None]:
# Rescale every feature to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test rows influence the scaling statistics.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
X = scaler.transform(X)

In [None]:
# Hold out 20% of the samples for testing. `stratify=y` keeps the class
# proportions the same in both splits; the previous `shuffle=y` passed the
# label Series where a boolean flag is expected.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify=y)

### SVM-RBF with GA Classification

In [None]:
# Base classifier: RBF-kernel SVM with probability estimates enabled.
clf = SVC(probability=True, kernel='rbf')

# Search space: gamma and C are both drawn from the continuous range
# [0.01, 100]. Key order is kept as-is (gamma first, then C).
param_grid = {
    'gamma': Continuous(0.01, 100),
    'C': Continuous(0.01, 100),
}

# 10-fold stratified cross-validation with shuffling.
cv = StratifiedKFold(n_splits=10, shuffle=True)

# Genetic-algorithm hyper-parameter search that maximizes CV accuracy.
svm_rbf = GASearchCV(
    estimator=clf,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    criteria='max',
    algorithm='eaMuPlusLambda',
    population_size=250,
    generations=30,
    tournament_size=3,
    elitism=True,
    crossover_probability=0.7,
    mutation_probability=0.2,
    keep_top_k=4,
    n_jobs=-1,
    verbose=True,
)

In [None]:
# Run the genetic search on the training split.
svm_rbf = svm_rbf.fit(X=X_train, y=y_train)

In [None]:
# Predict the held-out samples and show the resulting test accuracy.
y_predicy_ga = svm_rbf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_predicy_ga)

In [None]:
# Fitness-evolution plot of the fitted search.
fitness_axes = plot_fitness_evolution(svm_rbf)
plt.show()

In [None]:
# Hyper-parameters selected by the search.
best_params = svm_rbf.best_params_
print(best_params)

### Evaluation

Accuracy, Precision, Recall, F1-Score

In [None]:
# Print the per-class precision/recall/F1 report for each split, with a
# blank line between the two reports.
report_inputs = (
    ("Train report", X_train, y_train),
    ("Test report", X_test, y_test),
)
for position, (title, features, targets) in enumerate(report_inputs):
    if position:
        print()
    print(title)
    print(classification_report(targets, svm_rbf.predict(features)))

Confusion Matrix

In [None]:
# Side-by-side confusion matrices for the training and test splits.
plt.figure(figsize=(11, 5))

# One sorted label list shared by both panels. Series.unique() returns labels
# in order of first appearance, so taking it per split could give the two
# heatmaps different axis orders, or different shapes if a class is missing
# from one split.
labels = sorted(set(y_train) | set(y_test))

# (subplot position, title prefix, features, targets, colormap) per panel.
panels = (
    (121, 'Train', X_train, y_train, 'Blues'),
    (122, 'Test', X_test, y_test, 'Greens'),
)
for position, split_name, features, targets, cmap in panels:
    plt.subplot(position)
    cm = confusion_matrix(targets, svm_rbf.predict(features), labels=labels)
    sns.heatmap(cm, annot=True, square=True, cmap=cmap, cbar=False,
                xticklabels=labels, yticklabels=labels,
                fmt="d", annot_kws={"fontsize": 15})
    plt.title(f'{split_name} score: {svm_rbf.score(features, targets):.3f}', fontsize=14)
    plt.xlabel('Prediction', fontsize=14)
    plt.ylabel('Actual', fontsize=14)
    plt.yticks(rotation=90, verticalalignment='center')

### Validasi

In [None]:
# Sample sentence to run through the preprocessing and the classifier.
document = "tes"

In [None]:
def preprocess_data(data):
    """Turn one raw sentence into the feature row expected by the classifier.

    Applies the same steps used on the training corpus: case folding, digit
    removal, tokenization, stopword removal, stemming, TF-IDF vectorization
    with the fitted ``tfidf``, and min-max scaling with the fitted ``scaler``.

    Parameters
    ----------
    data : str
        Raw input sentence.

    Returns
    -------
    numpy.ndarray
        Scaled TF-IDF features with a single row.
    """
    lowered = casefolding(data)
    without_digits = remove(lowered)
    tokens = tokenizing(without_digits)
    content_tokens = stopwords_removing(tokens)
    stemmed_text = stemming(content_tokens)

    features = tfidf.transform([stemmed_text]).toarray()

    # The model is trained on min-max scaled features, so the same fitted
    # scaler must be applied to new documents; this step was missing before.
    return scaler.transform(features)

In [None]:
# Display the feature row produced for the sample sentence.
preprocess_data(document)

In [None]:
# Classify the sample sentence and print its sentiment.
predictions = svm_rbf.predict(preprocess_data(document))

# Label codes: 2 -> positive, 0 -> negative, anything else -> neutral.
if predictions == 2:
    sentiment_message = "Kalimat memiliki Sentimen Positif"
elif predictions == 0:
    sentiment_message = "Kalimat memiliki Sentimen Negatif"
else:
    sentiment_message = "Kalimat memiliki Sentimen Netral"
print(sentiment_message)
