In [None]:
# import library

import re
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import cufflinks as cf
import missingno as msno
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Config Setting Visualize

%matplotlib inline
sns.set_style("whitegrid")
plt.style.use('ggplot')
orange_black = [
    '#fdc029', '#df861d', '#FF6347', '#aa3d01', '#a30e15', '#800000', '#171820'
]
plt.rcParams['figure.figsize'] = (16,9)
plt.rcParams["figure.facecolor"] = '#FFFBCD'
plt.rcParams["axes.facecolor"] = '#FFFFE0'
plt.rcParams["axes.grid"] = True
plt.rcParams["grid.color"] = orange_black[3]
plt.rcParams["grid.alpha"] = 0.5
plt.rcParams["grid.linestyle"] = '--'

In [None]:
# Read the raw CSV (decoded as latin-1) and preview its first five rows.

path = '../dataset/data.csv'
dataset = pd.read_csv(filepath_or_buffer=path, encoding='latin-1')
dataset.head(5)

## Melihat Statistik dan Dimensi Data

In [None]:
# Show the column labels of the loaded data.

column_labels = dataset.columns
print(f"Columns of data => {column_labels}")

In [None]:
# Show the (rows, columns) dimensions of the loaded data.

rows_and_columns = dataset.shape
print(f"Shape of data => {rows_and_columns}")

In [None]:
# Remove the ten HS_* sub-label columns listed below; every other column is kept.

hs_detail_columns = [
    'HS_Individual',
    'HS_Group',
    'HS_Religion',
    'HS_Race',
    'HS_Physical',
    'HS_Gender',
    'HS_Other',
    'HS_Weak',
    'HS_Moderate',
    'HS_Strong',
]
dataset = dataset.drop(columns=hs_detail_columns)

dataset.head(5)

In [None]:
# Split the data into the four HS/Abusive label combinations and print the
# (rows, columns) shape of each subset.
# Note: despite the `_count` suffix these names hold DataFrames, not integers.

hs_flag = dataset['HS']
abusive_flag = dataset['Abusive']

neutral_count = dataset[(hs_flag == 0) & (abusive_flag == 0)]
hate_abusive_count = dataset[(hs_flag == 1) & (abusive_flag == 1)]
hate_speech_count = dataset[(hs_flag == 1) & (abusive_flag == 0)]
abusive_count = dataset[(hs_flag == 0) & (abusive_flag == 1)]

for label, subset in [
    ("Neutral", neutral_count),
    ("Hate Speech and Abusive", hate_abusive_count),
    ("Hate Speech", hate_speech_count),
    ("Abusive", abusive_count),
]:
    print(f"{label}: {subset.shape}")

In [None]:
# Table of non-null tweet counts per (HS, Abusive) combination, largest group
# first, shaded with the plasma colormap.
colormap = plt.cm.plasma

tweets_per_label = dataset.groupby(['HS', 'Abusive']).count()['Tweet']
temp = tweets_per_label.reset_index().sort_values(by='Tweet', ascending=False)
temp.style.background_gradient(cmap=colormap)

## Handling Imbalanced Dataset

In [None]:
# Keep the first 1748 neutral rows and write them to CSV.
# NOTE(review): 1748 is presumably the size of the smallest class — confirm.

neutral = neutral_count.iloc[:1748]
neutral.to_csv('../dataset/neutral.csv')

In [None]:
# # cut neutral 1748 row

# neutral1748 = neutral_count[1748:]
# neutral1748.to_csv('../data_test/neutral1748.csv')

In [None]:
# Keep the first 1748 hate-speech-only rows and write them to CSV.
# NOTE(review): 1748 is presumably the size of the smallest class — confirm.

hate_speech = hate_speech_count.iloc[:1748]
hate_speech.to_csv('../dataset/hate_speech.csv')

In [None]:
# # cut hate_speech 0-1748 row

# hate_speech1748 = hate_speech_count[1748:]
# hate_speech1748.to_csv('../data_test/hate_speech1748.csv')

In [None]:
# Keep the first 1748 abusive-only rows and write them to CSV.
# NOTE(review): 1748 is presumably the size of the smallest class — confirm.

abusive = abusive_count.iloc[:1748]
abusive.to_csv('../dataset/abusive.csv')

In [None]:
# # cut abusive 1748 row

# abusive1748 = abusive_count[1748:]
# abusive1748.to_csv('../data_test/abusive1748.csv')

In [None]:
# Keep the first 1748 rows labelled both hate speech and abusive, and write
# them to CSV.
# NOTE(review): 1748 is presumably the size of the smallest class — confirm.

hate_and_abusive = hate_abusive_count.iloc[:1748]
hate_and_abusive.to_csv('../dataset/hate_and_abusive.csv')

In [None]:
# # cut hate_nd_abusive 1748 row

# hate_and_abusive1748 = hate_abusive_count[1748:]
# hate_and_abusive1748.to_csv('../data_test/hate_and_abusive1748.csv')

In [None]:
# Stack the four class subsets into a single frame with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead;
# ignore_index=True replaces the separate reset_index(drop=True) step.

dataset_balanced = pd.concat(
    [neutral, hate_speech, abusive, hate_and_abusive],
    ignore_index=True,
)
dataset_balanced

In [None]:
# X_testing = neutral1748.append([hate_speech1748, abusive1748, hate_and_abusive1748])
# X_testing.reset_index(drop=True, inplace=True)
# X_testing.to_csv('../data_test/X_testing.csv', index=False)
# X_testing

In [None]:
# Same per-label tweet count table as before, now for the combined subsets;
# reuses the `colormap` defined in the earlier visualization cell.

balanced_tweets_per_label = dataset_balanced.groupby(['HS', 'Abusive']).count()['Tweet']
temp_balanced = balanced_tweets_per_label.reset_index().sort_values(by='Tweet', ascending=False)
temp_balanced.style.background_gradient(cmap=colormap)

## Menangani Missing Values dan Nilai Null

In [None]:
# Count missing values per column.
# NOTE(review): this inspects the full `dataset`, not `dataset_balanced` —
# confirm that is the intended frame.
null_counts = dataset.isna().sum()
print("Null value :")
print(null_counts)

## Unigram dan Trigram

In [None]:
# Character length of every tweet in the combined frame.
# The lengths must come from dataset_balanced itself: its index was reset, so a
# Series built from `dataset` is aligned by index label on assignment and would
# attach the lengths of unrelated tweets to these rows.

dataset_balanced['Char_Length'] = dataset_balanced['Tweet'].apply(lambda text: len(str(text)))
dataset_balanced

In [None]:
# Interactive histogram of the Char_Length column, rendered offline via cufflinks.
cf.go_offline()
cf.set_config_file(offline=True, world_readable=True)

histogram_options = dict(
    kind='hist',
    bins=100,
    xTitle='Char_Length',
    linecolor='black',
    yTitle='count',
    title='Review Text Length Distribution',
)
dataset_balanced['Char_Length'].iplot(**histogram_options)

In [None]:
def _top_n_ngrams(corpus, n, ngram_range):
    """Return the ``n`` most frequent n-grams in ``corpus`` as (term, count) pairs.

    Args:
        corpus: Iterable of text documents accepted by ``CountVectorizer``.
        n: Number of pairs to return; ``None`` returns every term.
        ngram_range: ``(min_n, max_n)`` tuple forwarded to ``CountVectorizer``.

    Returns:
        List of ``(term, count)`` tuples sorted by count, highest first.
    """
    vec = CountVectorizer(ngram_range=ngram_range)
    bag_of_words = vec.fit_transform(corpus)
    # Flatten the per-term column totals to a 1-D array so each lookup below is
    # a plain index instead of a 2-D matrix access.
    totals = np.asarray(bag_of_words.sum(axis=0)).ravel()
    words_freq = [(term, totals[col]) for term, col in vec.vocabulary_.items()]
    # list.sort is stable, so equally frequent terms keep their vocabulary order.
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]


def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent single words in ``corpus``.

    Args:
        corpus: Iterable of text documents.
        n: Number of (word, count) pairs to return; ``None`` returns all.

    Returns:
        List of ``(word, count)`` tuples sorted by count, highest first.
    """
    return _top_n_ngrams(corpus, n, ngram_range=(1, 1))


def get_top_n_bigram(corpus, n=None, ngram_range=(3, 3)):
    """Return the ``n`` most frequent n-grams in ``corpus``.

    Despite the name, the default ``ngram_range`` of ``(3, 3)`` counts
    trigrams; that default is kept so existing callers get the same result.
    Pass ``ngram_range=(2, 2)`` for actual bigrams.

    Args:
        corpus: Iterable of text documents.
        n: Number of (n-gram, count) pairs to return; ``None`` returns all.
        ngram_range: ``(min_n, max_n)`` tuple forwarded to ``CountVectorizer``.

    Returns:
        List of ``(ngram, count)`` tuples sorted by count, highest first.
    """
    return _top_n_ngrams(corpus, n, ngram_range)

# Top ten single words and top ten n-grams (as returned by get_top_n_bigram)
# over the combined tweets.
tweet_corpus = dataset_balanced['Tweet']
common_word = get_top_n_words(tweet_corpus, 10)
common_words = get_top_n_bigram(tweet_corpus, 10)

In [None]:
# Bar chart of the most frequent single words (stop words not yet removed).

df1 = pd.DataFrame(common_word, columns=['Tweet', 'count'])
unigram_totals = df1.groupby('Tweet').sum()['count'].sort_values(ascending=False)
unigram_totals.iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 10 Words in Review Before Removing Stopwords',
)

In [None]:
# Bar chart of the most frequent trigrams (stop words not yet removed).

df2 = pd.DataFrame(common_words, columns=['Tweet', 'count'])
trigram_totals = df2.groupby('Tweet').sum()['count'].sort_values(ascending=False)
trigram_totals.iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 10 Trigrams in Review Before Removing Stopwords',
)

In [None]:
# Preview the first five rows of the prepared frame before saving it.
dataset_balanced.head(5)

In [None]:
# Write the prepared frame to CSV for the modelling stage, without the index column.

output_path = "../dataset/data_modelling/data_preparation.csv"
dataset_balanced.to_csv(output_path, index=False)