In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [53]:
# import pandas as pd

datar = pd.read_csv('data_anotated/indonesia_terserah.csv',encoding='latin1')
datar['Konten'] = datar['Konten'].str.lower()

print(datar['Konten'])

0      source: memefess\n#indonesiaterserah #indonesi...
1      #indonesiaterserah \n\nfeel sorry towards thei...
2      #terserahindonesia \n#indonesiaterserah \n#cov...
3      indonesian dept. store keeps lights off to app...
4      semangat banget nih mokat club https://t.co/th...
5      me hugging myself cause no one care about my f...
6      barely anyone do social distancing. i swear to...
7      another #indonesiaterserah video, now is the o...
8      indonesia keren ð\n\n#indonesiaterserah htt...
9      here i am, making a controversial yet bold sta...
10     video showing indonesian shoppers racing into ...
11     indonesia frontliners has give up to fight the...
12     welcome to indonesia :)))) #indonesiaterserah ...
13     left: indigenous farmer cut trees in their tra...
14     stay strong indonesia frontliners.\nwe malaysi...
15     ive been seeing about indonesiaâs frontliner...
16     saturday quiet\ndirumah aja yokksss dan stay p...
17     yowwww that abs look soð

In [60]:
import re # impor modul regular expression
import string

emoticons_happy = set([
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3'
    ])

emoticons_sad = set([
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('
    ])

#Emoji patterns
emoji_pattern = re.compile("["
         u"\U0001F600-\U0001F64F"  # emoticons
         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
         u"\U0001F680-\U0001F6FF"  # transport & map symbols
         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
         u"\U00002702-\U000027B0"
         u"\U000024C2-\U0001F251"
         "]+", flags=re.UNICODE)



import re
for i in range(len(datar)):
    txt = datar.loc[i]["Konten"]
    txt = re.sub(r'@[A-Z0-9a-z_:]+','',txt)#replace username-tags
    txt = re.sub(r'^[RT]+','',txt)#replace RT-tags
    txt = re.sub(r'[^\w\s]', '', txt)
    txt = re.sub(r'\d+','', txt)
    txt = re.sub('https?://[A-Za-z0-9./]+','',txt)#replace URLs
    txt = re.sub("[^a-zA-Z]", " ",txt)#replace hashtags
    txt = emoji_pattern.sub(r'', txt)
    datar.at[i,"Konten"]=txt


print(datar['Konten'])

0      source  memefess  indonesiaterserah  indonesia...
1       indonesiaterserah   feel sorry towards their ...
2       terserahindonesia   indonesiaterserah   covid...
3      indonesian dept  store keeps lights off to app...
4                        semangat banget nih mokat club 
5      me hugging myself cause no one care about my f...
6      barely anyone do social distancing  i swear to...
7      another  indonesiaterserah video  now is the o...
8              indonesia keren        indonesiaterserah 
9      here i am  making a controversial yet bold sta...
10     video showing indonesian shoppers racing into ...
11     indonesia frontliners has give up to fight the...
12        welcome to indonesia        indonesiaterserah 
13     left  indigenous farmer cut trees in their tra...
14     stay strong indonesia frontliners  we malaysia...
15     ive been seeing about indonesia   s frontliner...
16     saturday quiet dirumah aja yokksss dan stay pr...
17     yowwww that abs look so 

In [61]:
import nltk
from nltk.tokenize import TweetTokenizer

def identify_tokens(row):
    description = row['Konten']
    tokens = nltk.word_tokenize(str(description))
    # taken only words (not punctuation)
    token_words = [w for w in tokens if w.isalpha()]
    return token_words

datar['Word'] = datar.apply(identify_tokens, axis=1)

print(datar['Word'])

0      [source, memefess, indonesiaterserah, indonesi...
1      [indonesiaterserah, feel, sorry, towards, thei...
2      [terserahindonesia, indonesiaterserah, covid, ...
3      [indonesian, dept, store, keeps, lights, off, ...
4                   [semangat, banget, nih, mokat, club]
5      [me, hugging, myself, cause, no, one, care, ab...
6      [barely, anyone, do, social, distancing, i, sw...
7      [another, indonesiaterserah, video, now, is, t...
8                  [indonesia, keren, indonesiaterserah]
9      [here, i, am, making, a, controversial, yet, b...
10     [video, showing, indonesian, shoppers, racing,...
11     [indonesia, frontliners, has, give, up, to, fi...
12           [welcome, to, indonesia, indonesiaterserah]
13     [left, indigenous, farmer, cut, trees, in, the...
14     [stay, strong, indonesia, frontliners, we, mal...
15     [ive, been, seeing, about, indonesia, s, front...
16     [saturday, quiet, dirumah, aja, yokksss, dan, ...
17     [yowwww, that, abs, look

In [62]:
# # import StemmerFactory class
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.stem import PorterStemmer

factory = StemmerFactory()
indostemming = factory.create_stemmer()

stemming = PorterStemmer()

def stem_list(row):
    my_list = row['Word']
    stemmed_list = [indostemming.stem(word) for word in my_list]
    return (stemmed_list)

datar['Stemmed_Word'] = datar.apply(stem_list, axis=1)

print(datar['Stemmed_Word'])

0      [source, memefess, indonesiaterserah, indonesi...
1      [indonesiaterserah, feel, sorry, towards, thei...
2      [terserahindonesia, indonesiaterserah, covid, ...
3      [indonesian, dept, store, keeps, lights, off, ...
4                   [semangat, banget, nih, mokat, club]
5      [me, hugging, myself, cause, no, one, care, ab...
6      [barely, anyone, do, social, distancing, i, sw...
7      [another, indonesiaterserah, video, now, is, t...
8                  [indonesia, keren, indonesiaterserah]
9      [here, i, am, making, a, controversial, yet, b...
10     [video, showing, indonesian, shoppers, racing,...
11     [indonesia, frontliners, has, give, up, to, fi...
12           [welcome, to, indonesia, indonesiaterserah]
13     [left, indigenous, farmer, cut, trees, in, the...
14     [stay, strong, indonesia, frontliners, we, mal...
15     [ive, been, seeing, about, indonesia, s, front...
16     [saturday, quiet, rumah, aja, yokksss, dan, st...
17     [yowwww, that, abs, look

In [63]:
from nltk.corpus import stopwords
stops = set(stopwords.words("indonesian"))                  

def remove_stops(row):
    my_list = row['Stemmed_Word']
    meaningful_words = [w for w in my_list if not w in stops]
    return (meaningful_words)

datar['Stem_Meaningful'] = datar.apply(remove_stops, axis=1)

print(datar['Stem_Meaningful'])


0      [source, memefess, indonesiaterserah, indonesi...
1      [indonesiaterserah, feel, sorry, towards, thei...
2      [terserahindonesia, indonesiaterserah, covid, ...
3      [indonesian, dept, store, keeps, lights, off, ...
4                   [semangat, banget, nih, mokat, club]
5      [me, hugging, myself, cause, no, one, care, ab...
6      [barely, anyone, do, social, distancing, i, sw...
7      [another, indonesiaterserah, video, now, is, t...
8                  [indonesia, keren, indonesiaterserah]
9      [here, i, am, making, a, controversial, yet, b...
10     [video, showing, indonesian, shoppers, racing,...
11     [indonesia, frontliners, has, give, up, to, fi...
12           [welcome, to, indonesia, indonesiaterserah]
13     [left, indigenous, farmer, cut, trees, in, the...
14     [stay, strong, indonesia, frontliners, we, mal...
15     [ive, been, seeing, about, indonesia, s, front...
16     [saturday, quiet, rumah, aja, yokksss, stay, p...
17     [yowwww, that, abs, look

In [64]:
def rejoin_words(row):
    my_list = row['Stem_Meaningful']
    joined_words = ( " ".join(my_list))
    return joined_words

datar['Processed'] = datar.apply(rejoin_words, axis=1)

print(datar['Processed'])

print(list(datar))

0      source memefess indonesiaterserah indonesiater...
1      indonesiaterserah feel sorry towards their fro...
2      terserahindonesia indonesiaterserah covid coro...
3      indonesian dept store keeps lights off to appe...
4                         semangat banget nih mokat club
5      me hugging myself cause no one care about my f...
6      barely anyone do social distancing i swear to ...
7      another indonesiaterserah video now is the ope...
8                      indonesia keren indonesiaterserah
9      here i am making a controversial yet bold stat...
10     video showing indonesian shoppers racing into ...
11     indonesia frontliners has give up to fight the...
12                welcome to indonesia indonesiaterserah
13     left indigenous farmer cut trees in their trad...
14     stay strong indonesia frontliners we malaysian...
15     ive been seeing about indonesia s frontliner w...
16     saturday quiet rumah aja yokksss stay producti...
17     yowwww that abs look so 

In [65]:
# cols_to_drop = ['Unnamed: 2','Unnamed: 3','Unnamed: 4','Unnamed: 5','Unnamed: 6','Unnamed: 7','Unnamed: 8','Unnamed: 9','Unnamed: 10',
#                 'Unnamed: 11','Unnamed: 12','Unnamed: 13','Unnamed: 14','Unnamed: 15','Unnamed: 16','Unnamed: 17','Unnamed: 18',
#                 'Unnamed: 19','Unnamed: 20','Unnamed: 21','Unnamed: 22','Unnamed: 23','Unnamed: 24','Unnamed: 25','Unnamed: 26',
#                 'Unnamed: 27','Unnamed: 28','Unnamed: 29','Unnamed: 30','Caption','Word','Stemmed_Word','Stem_Meaningful']
cols_to_drop =['Konten','Word','Stemmed_Word','Stem_Meaningful']

datar = datar.drop(cols_to_drop,axis='columns')


datar.to_csv('data_preprocessed/indonesia_terserah_processed.csv', index=False)

In [113]:
import pandas as pd

datas = pd.read_csv('data_preprocessed/indonesia_terserah_processed.csv')
print(datas)
print(datas.columns)

      Status                                          Processed
0    NEGATIF  source memefess indonesiaterserah indonesiater...
1    NEGATIF  indonesiaterserah feel sorry towards their fro...
2    NEGATIF  terserahindonesia indonesiaterserah covid coro...
3    NEGATIF  indonesian dept store keeps lights off to appe...
4    POSITIF                     semangat banget nih mokat club
5    NEGATIF  me hugging myself cause no one care about my f...
6    NEGATIF  barely anyone do social distancing i swear to ...
7    NEGATIF  another indonesiaterserah video now is the ope...
8    NEGATIF                  indonesia keren indonesiaterserah
9    NEGATIF  here i am making a controversial yet bold stat...
10   NEGATIF  video showing indonesian shoppers racing into ...
11   NEGATIF  indonesia frontliners has give up to fight the...
12   NEGATIF             welcome to indonesia indonesiaterserah
13   NEGATIF  left indigenous farmer cut trees in their trad...
14   NEGATIF  stay strong indonesia fron

In [114]:
from sklearn.model_selection import train_test_split

X=datas.Processed
y=datas.Status

x_train, x_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=225)

print('Banyak data x_train :',len(x_train))
print('Banyak data x_test  :',len(x_test))
print('Banyak data y_train :',len(y_train))
print('Banyak data y_test  :',len(y_test))

Banyak data x_train : 700
Banyak data x_test  : 300
Banyak data y_train : 700
Banyak data y_test  : 300


In [115]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer, HashingVectorizer

cvec=CountVectorizer()
tvec=TfidfVectorizer()
hvec=HashingVectorizer()

In [116]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier

clf1 = RandomForestClassifier()
clf2 = LogisticRegression()
clf3 = BernoulliNB()
clf4 = SVC()
clf = OneVsRestClassifier(svm.SVC(gamma=0.01, C=100., probability=True, class_weight='balanced', kernel='linear'))


In [119]:
from sklearn.pipeline import Pipeline

model= Pipeline([('vectorizer',tvec)
                 ,('classifier',clf)])


model.fit(x_train.values.astype('U'),y_train.values.astype('U'))

hasil=model.predict(x_test.values.astype('U'))



In [127]:
from sklearn.metrics import accuracy_score,confusion_matrix

cfm = confusion_matrix(hasil.astype(str),y_test.astype(str))

acc = accuracy_score(hasil.astype(str),y_test.astype(str))

print("Confusion Matrix :")
print(cfm)
print("Akurasi :")
print(acc)

Confusion Matrix :
[[269  19   1]
 [  7   2   0]
 [  2   0   0]]
Akurasi :
0.9033333333333333
