In [161]:
import pandas as pd
import re
import numpy as np
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import emoji
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [162]:
df = pd.read_csv('train.csv', encoding='latin-1')
df.head()

Unnamed: 0,Tweet,HS,Abusive,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,- disaat semua cowok berusaha melacak perhatia...,1,1,1,0,0,0,0,0,1,1,0,0
1,RT USER: USER siapa yang telat ngasih tau elu?...,0,1,0,0,0,0,0,0,0,0,0,0
2,"41. Kadang aku berfikir, kenapa aku tetap perc...",0,0,0,0,0,0,0,0,0,0,0,0
3,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,0,0,0,0,0,0,0,0,0,0,0,0
4,USER USER Kaum cebong kapir udah keliatan dong...,1,1,0,1,1,0,0,0,0,0,1,0


In [163]:
df.shape

(13169, 13)

In [164]:
tdf = pd.read_csv('sahabatsaber-labeled.csv')
tdf.head()

Unnamed: 0.1,Unnamed: 0,Tweet,HS,Abusive,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Gender
0,0,Jangan Nanti Hanya Tinggal Penyesalan..\n\nSud...,0,0,0,0,0,0,0
1,1,@ElfridaHutabar5 🤗🤗🤗..\nTetap kabarkan keadaan...,0,0,0,0,0,0,0
2,2,"@puspaswarna @Miduk17 Kerja yang bagus, harus ...",0,0,0,0,0,0,0
3,3,@Miduk17 Sebentar-sebentar..\nBukannya ada wal...,0,0,0,0,0,0,0
4,4,"@__Bungs HBD..\nPanjang umur, sehat dan hidup ...",0,0,0,0,0,0,0


In [165]:
tdf.shape

(6109, 9)

In [166]:
df[df['Tweet'] == np.nan]

Unnamed: 0,Tweet,HS,Abusive,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong


In [167]:
stopwords = pd.read_csv('stopwordbahasa.csv', header=None)
stopwords = stopwords.rename(columns={0: 'stopword'})
stopwords.head()

Unnamed: 0,stopword
0,ada
1,adalah
2,adanya
3,adapun
4,agak


In [168]:
kamusalay = pd.read_csv('new_kamusalay.csv', header=None, encoding='latin-1')
kamusalay = kamusalay.rename(columns={0: 'original', 1: 'replacement'})
kamusalay.head() 

Unnamed: 0,original,replacement
0,anakjakartaasikasik,anak jakarta asyik asyik
1,pakcikdahtua,pak cik sudah tua
2,pakcikmudalagi,pak cik muda lagi
3,t3tapjokowi,tetap jokowi
4,3x,tiga kali


In [169]:
df = df[df['Tweet'] != '']

In [170]:
df.drop(['Abusive', 'HS_Physical', 'HS_Other', 'HS_Weak', 'HS_Moderate', 'HS_Strong'], axis=1, inplace=True)
tdf.drop(['Unnamed: 0', 'Abusive'], axis=1, inplace=True)

In [171]:
def lowercase(text):
    return text.lower()

In [172]:
# def function to remove unnecessary char

def remove_unnecessary_char(text):
    text = re.sub('\n', ' ', text) # remove new line
    text = re.sub('rt', ' ', text) # remove every rt symbol
    text = re.sub('user', ' ', text) # remove every user substring appearance
    text = re.sub('((www\.[^\s]+) | (https?://[^\s]+) | (http?://[^\s]+))', ' ', text) # remove every url 
    text = re.sub('  +', ' ', text) # remove every extra space
    text = re.sub('^@\w+', ' ', text)
    return text

remove_unnecessary_char('@Andy_Art80 Katekumen susah g sih??')

'  80 Katekumen susah g sih??'

In [173]:
def remove_nonalphabetical(text):
    text = re.sub('[^a-zA-Z]+', ' ', text)
    return text

In [174]:
alay_dict = dict(zip(kamusalay['original'], kamusalay['replacement']))

def normalize_alay(text):
    return ' '.join([alay_dict[word] if word in alay_dict else word for word in text.split(' ')])

In [175]:
def remove_stopword(text):
    text = ' '.join(['' if word in stopwords['stopword'].values else word for word in text.split(' ')])
    text = re.sub('  +', ' ', text)
    text = text.strip()
    return text

In [176]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stemming(text):
    return stemmer.stem(text)

In [177]:
def remove_emoticon_byte(text):
    text = text.replace("\\", " ")
    text = re.sub('x..', ' ', text)
    text = re.sub(' n ', ' ', text)
    text = re.sub('\\+', ' ', text)
    text = re.sub('  +', ' ', text)
    return text

In [178]:
def preprocess(text):
    text = lowercase(text)
    text = remove_unnecessary_char(text)
    text = remove_nonalphabetical(text)
    text = normalize_alay(text)
    text = remove_emoticon_byte(text)
    text = remove_stopword(text)
    text = stemming(text)
    text = text.strip()
    return text

In [179]:
df['Tweet'] = df['Tweet'].apply(preprocess)

In [180]:
corpus = df['Tweet'].values
word_list = [word for tweet in corpus for word in tweet.split(' ')]
word_count = Counter(word_list)
least_common_word_list = [word for word, freq in word_count.items() if freq < 6]
print(least_common_word_list)

def remove_least_occurence_word(text):
    text = ' '.join([word for word in text.split(' ') if word.lower() not in least_common_word_list])
    return text

df['Tweet'] = df['Tweet'].apply(remove_least_occurence_word)

['lacak', 'basic', 'ciga', 'ifla', 'cal', 'licew', 'dukuh', 'watch', 'aldnoah', 'zero', 'friendzone', 'ais', 'kepal', 'milo', 'horlicks', 'cendol', 'toping', 'doket', 'gerai', 'rozak', 'uaku', 'kembar', 'baby', 'romantika', 'senawang', 'koe', 'privatisasi', 'ironi', 'huru', 'hara', 'intelek', 'sahih', 'ahlun', 'nar', 'bek', 'gantle', 'alga', 'mnr', 'ketidakmampuan', 'bilah', 'hilir', 'udara', 'delay', 'gosong', 'hasyim', 'muzadi', 'sahur', 'ekarang', 'lemas', 'blued', 'gvid', 'aksara', 'guardian', 'training', 'flat', 'shoes', 'dijutekin', 'dpara', 'lady', 'lipat', 'sanji', 'thpelautjkt', 'aja', 'fabiano', 'farhat', 'abbas', 'law', 'masya', 'hasananton', 'kosterace', 'nurdinsudirman', 'karolingidot', 'rakercabsu', 'lokalisasi', 'gamelan', 'konang', 'nse', 'cangkul', 'jagung', 'esemka', 'sembahyang', 'ritus', 'menyembahhyang', 'tempurung', 'awaasss', 'kosakata', 'picing', 'nhn', 'ngeden', 'pinokio', 'silat', 'kho', 'liang', 'ie', 'shen', 'deskripsi', 'information', 'lowo', 'takabur', 'ce

In [181]:
df['Tweet'].value_counts()

                                                                                                                                       53
warga badui selesai ingat tradisi seba pendopo perintah kabupaten lebak lanjut be emu gubernur banten wahidin halim sabtu sebabaduy    16
tomohon pusat ajar agama kristen katolik khusus doa meditasi bukit kasih suci indah tenang tomohon international flower festival       12
tai                                                                                                                                     9
cacat                                                                                                                                   9
                                                                                                                                       ..
tolol anjing                                                                                                                            1
marah kak tar ubah monyet         

In [182]:
df = df[df['Tweet'] != '']

In [183]:
df[['Tweet', 'HS']].values

array([['cowok usaha perhati gue lantas remeh perhati gue kasih khusus cowok bego',
        1],
       ['telat tau edan sarap gue gaul', 0],
       ['kadang pikir percaya tuhan jatuh kali kali kadang tuhan tinggal orang tua rencana pisah kakak pilih kristen anak ter',
        0],
       ...,
       ['hati hati bisu bosan duh aa', 0],
       ['bom real mudah bom kubur ledak revolusi jiwa', 0],
       ['situ foto ya kutil onta', 1]], dtype=object)

In [184]:
train, test = train_test_split(df[df['HS'] == 1], test_size=0.2, random_state=42)

X_train = train.Tweet
X_test = test.Tweet
print(X_test)

4499                               ayo lengser kasih ampun
12390    hati organisasi paham larang bumi pancasila bubar
10046                                 ayo lengser sih joko
553                           maju gubernur sengsara rawan
7947                               cina angkat kaki istana
                               ...                        
3124     rezim sipil lihat pimpin rezim sok rakyat sung...
3125     orang biasa jilat malu hormat susilo bambang y...
8601     pilih anis sandi pilih sih nista agama pecah n...
164      berita saracen akan tim prabowo anies sandi me...
8688                                                 mulut
Name: Tweet, Length: 1111, dtype: object


In [185]:
clf = LogisticRegression(n_jobs=1, C=1e5)
# clf = RandomForestClassifier()
# clf = SGDClassifier()

labels = ['HS_Individual', 'HS_Group', 'HS_Religion', 'HS_Race', 'HS_Gender']
for label in labels:
    pipe = Pipeline([
        ('vect', TfidfVectorizer()),
        ('tf', TfidfTransformer()),
        ('clf', clf)
    ])

    pipe.fit(X_train, train[label])
    y_pred = pipe.predict(X_test)
    pred_df = pd.DataFrame({'Tweet': X_test, 'Label': y_pred})
    print(pred_df)

    print('Prediction of {} accuracy: {}'.format(label, accuracy_score(test[label], y_pred)))

                                                   Tweet  Label
4499                             ayo lengser kasih ampun      1
12390  hati organisasi paham larang bumi pancasila bubar      0
10046                               ayo lengser sih joko      1
553                         maju gubernur sengsara rawan      1
7947                             cina angkat kaki istana      0
...                                                  ...    ...
3124   rezim sipil lihat pimpin rezim sok rakyat sung...      1
3125   orang biasa jilat malu hormat susilo bambang y...      1
8601   pilih anis sandi pilih sih nista agama pecah n...      1
164    berita saracen akan tim prabowo anies sandi me...      0
8688                                               mulut      1

[1111 rows x 2 columns]
Prediction of HS_Individual accuracy: 0.7335733573357336


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

                                                   Tweet  Label
4499                             ayo lengser kasih ampun      0
12390  hati organisasi paham larang bumi pancasila bubar      1
10046                               ayo lengser sih joko      0
553                         maju gubernur sengsara rawan      0
7947                             cina angkat kaki istana      1
...                                                  ...    ...
3124   rezim sipil lihat pimpin rezim sok rakyat sung...      0
3125   orang biasa jilat malu hormat susilo bambang y...      0
8601   pilih anis sandi pilih sih nista agama pecah n...      0
164    berita saracen akan tim prabowo anies sandi me...      1
8688                                               mulut      0

[1111 rows x 2 columns]
Prediction of HS_Group accuracy: 0.7335733573357336
                                                   Tweet  Label
4499                             ayo lengser kasih ampun      0
12390  hati organisasi paha

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [186]:
tdf['Tweet'] = tdf['Tweet'].apply(preprocess)
tdf['Tweet'].value_counts()

# Remove emoji

tdf['Tweet'] = tdf['Tweet'].apply(lambda x: emoji.replace_emoji(x, ''))
tdf['Tweet']

# Remove unnecessary twitter existing substring and last broken substring

tdf['Tweet'] = tdf['Tweet'].str.replace('hyperte', ' ')
tdf['Tweet'] = tdf['Tweet'].str.replace('transfer', ' ')
tdf['Tweet'] = tdf['Tweet'].str.replace('protocol', ' ')
tdf['Tweet'] = tdf['Tweet'].str.replace('over', ' ')
tdf['Tweet'] = tdf['Tweet'].str.replace('secure', ' ')
tdf['Tweet'] = tdf['Tweet'].str.replace('socket', ' ')
tdf['Tweet'] = tdf['Tweet'].str.replace('layer', ' ')
tdf['Tweet'] = tdf['Tweet'].str.replace('t co', ' ')
tdf['Tweet'] = tdf['Tweet'].str.replace('yperte', ' ')
tdf['Tweet'] = tdf['Tweet'].str.replace('srite', ' ')
tdf['Tweet'] = tdf['Tweet'].str.replace('ype e', ' ')

corpus = tdf['Tweet'].values
word_list = [word for tweet in corpus for word in tweet.split(' ')]
word_count = Counter(word_list)
least_common_word_list = [word for word, freq in word_count.items() if freq < 2]
print(least_common_word_list)
tdf['Tweet'] = tdf['Tweet'].apply(remove_least_occurence_word)

tdf['Tweet'] = tdf['Tweet'].str.strip()

['nphnukl', 'miduk', 'dptaslou', 'ctl', 'januard', 'pouchnya', 'liter', 'nopefcpi', 'oizprhg', 'wdaas', 'whaaa', 'kereeennn', 'banting', 'reply', 'charles', 'tidore', 'qfza', 'shmk', 'kicau', 'andang', 'auuh', 'cxq', 'zqvq', 'hivgadw', 'kytqyn', 'bervibes', 'gjbohg', 'vgc', 'syuliiittt', 'rbm', 'iyb', 'cekat', 'riweuhnya', 'rloap', 'anakbai', 'lntutdv', 'tttappiii', 'curha', 'uwaknya', 'ttfvhbn', 'jyov', 'yqbdc', 'swbu', 'arievrahman', 'sociogeeks', 'wish', 'khyqjg', 'yip', 'hto', 'ntusiha', 'cm', 'kabisanya', 'yzdf', 'wcy', 'fhlfwtpiki', 'uwgjy', 'aopfzzj', 'maritim', 'gfgkvnvit', 'pelihara', 'auk', 'qfpkbz', 'pingpong', 'dvktrdp', 'oeyc', 'rgeg', 'zmdnsbu', 'lemaaahhh', 'biawak', 'zosokufhek', 'linkmu', 'unz', 'blob', 'enembe', 'tum', 'kooperatif', 'apadaya', 'ciquita', 'ofuzovasq', 'konyaku', 'nutrijel', 'fhjtq', 'rinci', 'azyf', 'mjwga', 'oliviacath', 'teko', 'lubang', 'wnyckblnq', 'jelantik', 'doessense', 'pipa', 'deras', 'ilustrasi', 'oyeennn', 'aufaag', 'tslzg', 'aufag', 'kyut',

In [187]:
tdf = tdf[(tdf['Tweet'] != '')]
tdf = tdf.dropna()
tdf.shape

(5918, 7)

In [188]:
df[df['HS'] == 1]['HS_Gender'].value_counts()

0    5246
1     305
Name: HS_Gender, dtype: int64

In [189]:
df[df['HS'] == 1]['HS_Individual'].value_counts()

1    3566
0    1985
Name: HS_Individual, dtype: int64

In [190]:
df[df['HS'] == 1]['HS_Group'].value_counts()

0    3566
1    1985
Name: HS_Group, dtype: int64

In [191]:
# Predict for HS
new_first_tdf = tdf

X_train = df['Tweet']
X_test = tdf['Tweet']

pipe = Pipeline([
        ('vect', TfidfVectorizer()),
        ('tf', TfidfTransformer()),
        ('clf', clf)
    ])

pipe.fit(X_train, df['HS'].values)
y_pred = pipe.predict(X_test)

new_first_tdf['HS'] = y_pred


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [192]:
new_df = df[df['HS'] == 1]

In [193]:
# Labeling
X_train = new_df['Tweet']
X_test = new_first_tdf.loc[new_first_tdf['HS'] == 1, 'Tweet']

for label in labels:
    pipe = Pipeline([
        ('vect', TfidfVectorizer()),
        ('tf', TfidfTransformer()),
        ('clf', clf)
    ])

    pipe.fit(X_train, new_df[label])
    y_pred = pipe.predict(X_test)

    new_first_tdf.loc[new_first_tdf['HS'] == 1, label] = y_pred

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [194]:
new_first_tdf.to_csv('sahabatsaber-first-method.csv')

In [195]:
new_second_tdf = tdf
train_x = df['Tweet']
train_y = df['HS']
test_x = new_second_tdf['Tweet']

mlt = Pipeline([
    ('vect', TfidfVectorizer()),
    ('tf', TfidfTransformer()),
    ('sgd', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None))
])
mlt.fit(train_x, train_y)
y_pred = mlt.predict(test_x)
new_second_tdf['HS'] = y_pred

In [196]:
X_train = new_df['Tweet']
y_train = new_df.drop(columns=['Tweet', 'HS'])
X_test = new_second_tdf.loc[new_second_tdf['HS'] == 1, 'Tweet'].values

mlt = Pipeline([
    ('vect', TfidfVectorizer()),
    ('tf', TfidfTransformer()),
    ('sgd', MultiOutputClassifier(SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)))
])
mlt.fit(X_train, y_train)
y_pred = mlt.predict(X_test)
y_pred = y_pred.T
new_second_tdf.loc[new_second_tdf['HS'] == 1, 'HS_Individual'] = y_pred[0]
new_second_tdf.loc[new_second_tdf['HS'] == 1, 'HS_Group'] = y_pred[1]
new_second_tdf.loc[new_second_tdf['HS'] == 1, 'HS_Religion'] = y_pred[2]
new_second_tdf.loc[new_second_tdf['HS'] == 1, 'HS_Race'] = y_pred[3]
new_second_tdf.loc[new_second_tdf['HS'] == 1, 'HS_Gender'] = y_pred[4]

In [197]:
tdf.to_csv('sahabatsaber-second-method.csv')