OUTLINE WORKSHOP NATURAL LANGUAGE PROCESSING

In [46]:
!pip install nlp.id



In [55]:
# Basic module
import pandas as pd
import numpy as np
import string 
import re
import warnings

# Preprocessing module
from nlp_id.stopword import StopWord
from nlp_id.lemmatizer import Lemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold

# Modelling module
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier

# Evaluation moodule 
from sklearn.metrics import classification_report

warnings.filterwarnings('ignore')


In [56]:
# Load data
data = pd.read_csv('data_klasifikasi.csv')
data.sample(10)

Unnamed: 0,komentar,kategori
18,ada yang nonjol auto klik,tidak cyberbullying
1,bangsat perek murahan kampungan sadar diri woi,cyberbullying
105,aduh bening sekali mba,tidak cyberbullying
51,aku tulis lagu pki di youtube malah ketemu kod...,cyberbullying
2,bencong kok ngaku ngaku perempuan dasar kunyuk...,cyberbullying
3,iri bilang babi,cyberbullying
37,aku jadi cowok yang dia pegang aku jijik sekal...,cyberbullying
92,ada yang mantul tapi bukan bola,tidak cyberbullying
75,amin sehat dan sejahtera selalu pak,tidak cyberbullying
61,aa nya gemes banget sehat selalu sultan andara,tidak cyberbullying


In [49]:
# Chech data balance 
data['kategori'].value_counts()

tidak cyberbullying    60
cyberbullying          48
Name: kategori, dtype: int64

In [57]:
# Data cleaning
stopword = StopWord()
lemmatizer =  Lemmatizer()

def preprocessing (data):
    for index in range(len(data)):
      #lower case
      data['komentar'][index] = [str(entry).lower() for entry in data['komentar'][index]]
      #remove punctuation
      data['komentar'][index] = "".join([char for char in data['komentar'][index] if char not in string.punctuation])
      #remove number 
      data['komentar'][index] = re.sub(r'\d+', '', data['komentar'][index])
      #remove stopwords
      data['komentar'][index] = stopword.remove_stopword(data['komentar'][index])
      #lemmatizing 
      data['komentar'][index] = lemmatizer.lemmatize(data['komentar'][index])

In [58]:
preprocessing(data)
data.head()

Unnamed: 0,komentar,kategori
0,bajing memek urus anjing,cyberbullying
1,bangsat perek murah kampung sadar woi,cyberbullying
2,bencong ngaku ngaku perempuan dasar kunyuk ban...,cyberbullying
3,iri bilang babi,cyberbullying
4,kampret bajing sombong,cyberbullying


In [59]:
#Label Encoding
data['KategoriID'] = np.where(data['kategori'] == 'cyberbullying',1,0)
data.head()

Unnamed: 0,komentar,kategori,KategoriID
0,bajing memek urus anjing,cyberbullying,1
1,bangsat perek murah kampung sadar woi,cyberbullying,1
2,bencong ngaku ngaku perempuan dasar kunyuk ban...,cyberbullying,1
3,iri bilang babi,cyberbullying,1
4,kampret bajing sombong,cyberbullying,1


In [45]:
#TF IDF

vect= TfidfVectorizer()
vect.fit(data['komentar'])
dtm_tf_idf = vect.transform(data['komentar'])
dtm_tf_idf.toarray()


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [54]:
# Get feature name

vect.get_feature_names()


['aa',
 'acara',
 'adele',
 'adem',
 'adi',
 'adik',
 'aduh',
 'ah',
 'akun',
 'allah',
 'amin',
 'andara',
 'angin',
 'anies',
 'anjing',
 'arti',
 'artis',
 'asu',
 'auto',
 'awokwok',
 'babi',
 'badai',
 'badan',
 'bagus',
 'baik',
 'bajing',
 'baju',
 'bakat',
 'balerick',
 'balmon',
 'banci',
 'bangat',
 'bangga',
 'bangsat',
 'bangun',
 'bapak',
 'bayangin',
 'belah',
 'beli',
 'bencong',
 'bening',
 'berani',
 'berkah',
 'berserta',
 'besar',
 'beti',
 'biar',
 'biasa',
 'biaya',
 'bicara',
 'bilang',
 'biru',
 'bobo',
 'bocah',
 'body',
 'bohong',
 'boker',
 'bola',
 'boneka',
 'bongkar',
 'brand',
 'brengsek',
 'btw',
 'bulat',
 'buruk',
 'cantik',
 'cari',
 'cewek',
 'channel',
 'cinta',
 'cowok',
 'da',
 'dada',
 'dangdut',
 'dasar',
 'daulat',
 'deh',
 'demo',
 'dengar',
 'dengarin',
 'desain',
 'diam',
 'didatengin',
 'dki',
 'doa',
 'enak',
 'endorse',
 'enek',
 'entertainment',
 'fasilitas',
 'fokus',
 'gagah',
 'gagal',
 'ganggu',
 'gara',
 'gatal',
 'gede',
 'geli',
 '

In [60]:
# Data frame
dtm_tf_idf= pd.DataFrame(dtm_tf_idf.toarray(), columns= vect.get_feature_names())
dtm_tf_idf


Unnamed: 0,aa,acara,adele,adem,adi,adik,aduh,ah,akun,allah,amin,andara,angin,anies,anjing,arti,artis,asu,auto,awokwok,babi,badai,badan,bagus,baik,bajing,baju,bakat,balerick,balmon,banci,bangat,bangga,bangsat,bangun,bapak,bayangin,belah,beli,bencong,...,status,suara,suka,sukses,sultan,susah,swt,syirik,tai,tari,tau,tayang,telanjang,teman,tengah,terima,tete,tetek,tidur,tim,tolol,tonggos,tonton,tri,triliun,tulis,uang,ubah,ucap,umur,undang,up,urus,vexana,video,wisuda,woi,yaa,youtube,zuma
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.488861,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.488861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.53202,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.324713,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.387027,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.249833,0.0,0.000000,0.0,0.0,0.0,0.345722,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.546527,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.544838,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
103,0.0,0.0,0.0,0.0,0.0,0.419859,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.494489,0.000000,0.0,0.0,0.0
104,0.0,0.0,0.0,0.0,0.0,0.000000,0.470917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.441824,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
105,0.0,0.0,0.0,0.0,0.0,0.000000,0.585714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
106,0.0,0.0,0.0,0.0,0.0,0.000000,0.393850,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0


In [61]:
# Train test split 

x = dtm_tf_idf
y = data['KategoriID']

In [63]:
# Splitting train and test 

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    test_size= 0.2,
    random_state= 123
)

In [65]:
#Modeling (General)
DT = DecisionTreeClassifier()
DT.fit(x_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [67]:
pred = DT.predict(x_test)
pred

array([0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1])

In [77]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.69      0.92      0.79        12
           1       0.83      0.50      0.62        10

    accuracy                           0.73        22
   macro avg       0.76      0.71      0.71        22
weighted avg       0.75      0.73      0.71        22



In [78]:
# Modeling 2 (General)
RF = RandomForestClassifier()
RF.fit(x_train, y_train)
predRF = RF.predict(x_test)
print(classification_report(y_test, predRF))

              precision    recall  f1-score   support

           0       0.67      1.00      0.80        12
           1       1.00      0.40      0.57        10

    accuracy                           0.73        22
   macro avg       0.83      0.70      0.69        22
weighted avg       0.82      0.73      0.70        22



In [79]:
# Model bennchmark 

DT = DecisionTreeClassifier()
RF = RandomForestClassifier()
GB = GradientBoostingClassifier()
AdaBoost = AdaBoostClassifier()

In [80]:
# List model 

listmodel = [DT, RF, GB, AdaBoost]
namamodel = ["Decision Tree", "Random Forest", "Gradient Boosting", "AdaBoost"]

In [95]:
# Komparasi Model

skfold = StratifiedKFold(n_splits=5)
acc_model = []
acc_std = []

for model in listmodel :
  hasil_akurasi = cross_val_score(model, x_train, y_train, cv = skfold)
  acc_model.append(hasil_akurasi.mean())
  acc_std.append(hasil_akurasi.std())

In [96]:
# Komparasi Model

pembanding = pd.DataFrame({
    'Nama Model' : namamodel,
    'Rata-Rata Akurasi' : acc_model,
    'Standar Deviasi Akurasi Model' : acc_std
})
pembanding

Unnamed: 0,Nama Model,Rata-Rata Akurasi,Standar Deviasi Akurasi Model
0,Decision Tree,0.675163,0.054118
1,Random Forest,0.709804,0.031373
2,Gradient Boosting,0.722222,0.070637
3,AdaBoost,0.698039,0.084655


In [97]:
#Test Model (Test Dataset)

DT = DecisionTreeClassifier()
DT.fit(x_train,y_train)
pred = DT.predict(x_test)
print()




In [98]:
# Model Pipeline

from sklearn.pipeline import Pipeline

model =  DecisionTreeClassifier()
tf_idf = TfidfVectorizer()

preprocessing(data)

modelpipeline = ([
                  ('tfidf', tf_idf),
                  ('model', model)
])

In [100]:
# Fit Pipeline

modelpipeline.fit (data['komentar'], data['kategori'])


AttributeError: ignored

In [94]:
modelpipeline.predict(['setan jijik kau'])


AttributeError: ignored