#**Big Data Competition Statistics Explore 2022**

#**IMPLEMENTASI KLASIFIKASI MULTILABEL DENGAN SUPPORT VECTOR MACHINE DAN FASTEXT PADA TWEET YANG BERPOTENSI CYBERBULLYING**

* **Kode Tim :** BDC21

* **Anggota  :** Fadil Irsyad Muhammad<sup>1</sup>. Dina Lestari<sup>2</sup>. Atika Indah Mentari<sup>3</sup>.

* **Asal Universitas :** Universitas Gadjah Mada

####**Import Library**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import os
import re
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

from tqdm.auto import tqdm
from gensim.models import FastText
from string import punctuation

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, multilabel_confusion_matrix

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# inisialisasi bilangan random
np.random.seed(21)

###**Prepare Corpus**

In [3]:
df = pd.read_csv('/content/drive/Shareddrives/BDC21/data/Data.csv')

##**Language Modeling**

In [4]:
# train language model
sentences = [word_tokenize(tweet.lower()) for tweet in tqdm(df['tweet'])]
model = FastText(sentences, size = 32, window = 3, min_count = 1, workers = 4, iter = 10, sg = 0, hs = 0)

  0%|          | 0/10535 [00:00<?, ?it/s]

In [5]:
# load model
os.makedirs('model/fasttext/', exist_ok = True)
model.save('model/fasttext/tweet_cyberbullying.fasttext')
model_ft = FastText.load('model/fasttext/tweet_cyberbullying.fasttext')
w2v = model_ft.wv

##**Data Preprocessing**

In [6]:
# persiapan normalisasi
kamus_alay = pd.read_csv('https://raw.githubusercontent.com/nasalsabila/kamus-alay/master/colloquial-indonesian-lexicon.csv')
kamus_alay = kamus_alay.filter(['slang', 'formal'], axis = 1).drop_duplicates(subset = ['slang'])
kamus_alay_map = dict(zip(kamus_alay['slang'], kamus_alay['formal']))
# mendefinisikan stopwords
extra_stopwords = [
    "user'", 'user', 'rt', 'amp', 'sih', 'deh', 'gue', 'pas', 'sok', 'nya', 'mas', 'kalo', 'dll', 'lo', 'kak',
    'nih', 'wkwk', 'wkwkwk', 'haha', 'loh', 'iya', 'mah', 'hahaha', 'bro', 'url', 'aja', 'lu', 'bang', 'ya'
    ]
list_stopwords = set(stopwords.words('indonesian') + extra_stopwords + list(punctuation))
def preprocessing(text):
  # menghapus new line
  text = re.sub('\n', ' ', text)
  # menghapus angka pada teks
  text = re.sub('\d+', ' ', text)
  # menghapus whitespace leading & trailing
  text = re.sub('\s+', ' ', text)
  # menghapus multiple whitespace
  text = re.sub(r'\b[a-zA-Z]\b', ' ', text) 
  # normalisasi
  text = ' '.join([kamus_alay_map[word.lower()] if word in kamus_alay_map else word for word in text.split(' ')])
  return text
df['tweet_preprocessed'] = df['tweet'].apply(preprocessing)

In [7]:
# tokenisasi & padding
def norm_sent_vector(sentence, w2v_model, stopwords):
  vecs = [w2v_model[word.lower()] for word in word_tokenize(sentence) if word not in list_stopwords]
  norm_vecs = [vec / np.linalg.norm(vec) for vec in vecs if np.linalg.norm(vec) > 0]
  sent_vec = np.mean(norm_vecs, axis = 0)
  return sent_vec
vecs = np.array([norm_sent_vector(sentence, w2v, list_stopwords) for sentence in df['tweet_preprocessed']])

In [8]:
# splitting dataset
cols_target = list(df.columns)[2:8]
tweet = vecs
labels = df[cols_target]
X_train, X_test, y_train, y_test = train_test_split(tweet, labels, test_size = 0.3)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7374, 32), (3161, 32), (7374, 6), (3161, 6))

##**Modeling**

In [9]:
model = LinearSVC(C = 10)
pred = pd.DataFrame()
for label in cols_target:
  print('target    : {}'.format(label))
  y_loop = y_train[label]
  model.fit(X_train, y_loop)
  y_pred = model.predict(X_test)
  print('accuracy  : {}'.format(round(accuracy_score(y_test[label], y_pred), 2)))
  print('precision : {}'.format(round(precision_score(y_test[label], y_pred), 2)))
  print('recall    : {}'.format(round(recall_score(y_test[label], y_pred), 2)))
  print('f1_Score  : {}'.format(round(f1_score(y_test[label], y_pred), 2)), '\n')
  pred[label] = y_pred

target    : individual
accuracy  : 0.77
precision : 0.62
recall    : 0.27
f1_Score  : 0.38 

target    : group
accuracy  : 0.86
precision : 0.65
recall    : 0.1
f1_Score  : 0.17 

target    : gender
accuracy  : 0.98
precision : 0.0
recall    : 0.0
f1_Score  : 0.0 

target    : physical
accuracy  : 0.97
precision : 0.0
recall    : 0.0
f1_Score  : 0.0 

target    : race
accuracy  : 0.97
precision : 0.69
recall    : 0.18
f1_Score  : 0.28 

target    : religion
accuracy  : 0.94
precision : 0.79
recall    : 0.12
f1_Score  : 0.21 



In [10]:
multilabel_confusion_matrix(y_test, pred)

array([[[2196,  139],
        [ 601,  225]],

       [[2688,   24],
        [ 405,   44]],

       [[3093,    0],
        [  68,    0]],

       [[3080,    0],
        [  81,    0]],

       [[3039,    9],
        [  93,   20]],

       [[2963,    6],
        [ 169,   23]]])