# Cyberbullying Classification by Type
The objective of this work is to create a multi-label classification model to classify the text according to a set of cyberbullying types - Sexual_Type1, Sexual_Type2, Physical_Appearance, Race, Intellectual, Religious, General Hate, and Neutral.

## Import Python Packages

In [1]:
import pandas as pd
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
stopwords_default=stopwords.words('english')

from appos.appos import appos_dict
from slangs.slangs import slangs_dict
from emoticons.emoticons import emoticons

from langdetect import detect
from translate import Translator
from spellchecker import SpellChecker


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Albert\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Data Extraction
This step extracts the data from the given Excel (.xlsx) files.

In [2]:
# Folder that holds the original dataset workbooks
folder_path = r"./Dataset/"

# Collects one filtered dataframe per worksheet, across all workbooks
filtered_dfs = []

# Column names imposed on every worksheet. The "Unnamed_*" entries are
# placeholders for the extra columns sitting between the labelled ones.
defined_column_name = ["Comment_Number",
                       "Commenter_Username",
                       "Comment",
                       "Comment_Post_Time",
                       "Overall_CB_Status",
                       "CB_Type",
                       "Sexual_Type1", "Unnamed_1", "Unnamed_2",
                       "Sexual_Type2", "Unnamed_3", "Unnamed_4",
                       "Physical_Appearance", "Unnamed_5", "Unnamed_6",
                       "Race", "Unnamed_7", "Unnamed_8",
                       "Intellectual", "Unnamed_9", "Unnamed_10",
                       "Religious", "Unnamed_11", "Unnamed_12",
                       "General_Hate", "Unnamed_13", "Unnamed_14",
                       "Purpose_of_CB",
                       "Insult", "Unnamed_15", "Unnamed_16",
                       "Defensive", "Unnamed_17", "Unnamed_18",
                       "Directionality",
                       "Directed_Username", "Unnamed_19", "Unnamed_20",
                       "Other_Aspects",
                       "Depression", "Unnamed_21", "Unnamed_22",
                       "Suicides", "Unnamed_23", "Unnamed_24",
                       "Stress", "Unnamed_25", "Unnamed_26",
                       "Discrimination", "Unnamed_27", "Unnamed_28"]

# A column is dropped when its name CONTAINS any of these substrings.
# NOTE(review): "Religious" is dropped here although the notebook
# introduction lists it as one of the target types - confirm this is intended.
unwanted_column_substrings = ["Comment_Number",
                              "Unnamed",
                              "Commenter_Username",
                              "Comment_Post_Time",
                              "CB_Type",
                              "Purpose_of_CB",
                              "Insult",
                              "Religious",
                              "Defensive",
                              "Other_Aspects",
                              "Suicides",
                              "Stress",
                              "Depression",
                              "Discrimination",
                              "Directionality",
                              "Directed_Username"]

# Loop through all files in the folder
for filename in os.listdir(folder_path):
    # Only Excel workbooks (.xlsx) are read; everything else is skipped
    if not filename.endswith('.xlsx'):
        continue

    excel_file = os.path.join(folder_path, filename)
    dataframe = {}

    # Open the workbook once and let the context manager close the handle
    # after every sheet has been read
    with pd.ExcelFile(excel_file) as xlsx:
        sheet_names = xlsx.sheet_names  # Sheet names contained in the workbook

        for sheet_name in sheet_names:
            # Retrieve the dataframe for that specific sheet
            dataframe[sheet_name] = pd.read_excel(xlsx, sheet_name=sheet_name)

            # Rename the columns to defined_column_name. pandas raises
            # ValueError when the sheet has a different number of columns;
            # the sheet is then reported and kept with its original headers.
            try:
                dataframe[sheet_name].columns = defined_column_name
            except ValueError:
                print(f'{filename} : {sheet_name}.')

    for item in dataframe.values():
        # Drop the first two rows of the dataframe due to formatting issue
        item = item.drop(item.index[:2])

        # Drop unnecessary attributes/columns from the dataframe
        columns_to_drop = [col for col in item.columns
                           if any(substring in col
                                  for substring in unwanted_column_substrings)]

        filter_df = item.drop(columns=columns_to_drop)
        # Store the dataframe to filtered_dfs
        filtered_dfs.append(filter_df)

## Data Preprocessing
In this stage, we perform the following steps:
* Remove duplicate items in the 'Comment' column
* Replace missing values in the attributes with 0
* Check the dataframe's data types and perform data type conversion
* Check for erroneous data
* Text formatting

### Removal of Duplicate Data for the 'Comment' Attribute

In [3]:
# Stack every filtered dataframe into one (with a fresh running index) and
# keep a single row per distinct value of the 'Comment' column
df = (
    pd.concat(filtered_dfs, ignore_index=True)
    .drop_duplicates(subset='Comment')
)

### Replacing NA values with 0

It is assumed that all the cyberbullying texts in the provided original datasets have been correctly labelled as 1. Therefore, the remaining data with NA values will be labelled as 0, i.e. not a cyberbullying text.

In [4]:
# Boolean mask: True wherever the concatenated dataframe holds an NA value
check_na = df.isna()
max_retries = 2
retry_count = 0

# Fill NA cells with 0 and re-check, giving up after max_retries passes
while retry_count < max_retries and check_na.any().any():
    print("NA values found in the DataFrame. Replacing NA values with 0......")
    df = df.fillna(0)
    check_na = df.isna()
    retry_count += 1

# Report whether any NA value is still left after the loop
if not check_na.any().any():
    print("No NA values founds in the datasets.")
else:
    print("Maximum retries reached. Some NA values could not be replaced.")

NA values found in the DataFrame. Replacing NA values with 0......
No NA values founds in the datasets.


### Check for Attributes' Data Type
According to the attributes' characteristics, all attributes should be of int type (1 : True and 0 : False), with the exception of 'Comment', which should be of string/object type.

In [5]:
# Display the dtype of every column so that label columns which were not
# read as integers can be spotted and converted in the next step
df.dtypes

Comment                object
Overall_CB_Status       int64
Sexual_Type1           object
Sexual_Type2            int64
Physical_Appearance     int64
Race                   object
Intellectual            int64
General_Hate           object
dtype: object

In [6]:
# Label columns that were read with dtype 'object' instead of an integer dtype
selected_column = ['Sexual_Type1', 'Race', 'General_Hate']

# Convert each of them to int64: values that cannot be parsed as a number
# become NaN (errors='coerce') and are then replaced by 0
for column_name in selected_column:
    numeric_values = pd.to_numeric(df[column_name], errors='coerce')
    df[column_name] = numeric_values.fillna(0).astype('int64')

print(df.dtypes)

Comment                object
Overall_CB_Status       int64
Sexual_Type1            int64
Sexual_Type2            int64
Physical_Appearance     int64
Race                    int64
Intellectual            int64
General_Hate            int64
dtype: object


### Check for Error Data
Since all attributes except 'Comment' are integers (1 : True, 0 : False), the min and max values of each attribute should be either 0 or 1 only.

Any error data will be removed from the dataframe.

In [7]:
# Summary statistics of the numeric columns; the min/max rows reveal any
# label value outside the expected 0/1 range
df.describe()

Unnamed: 0,Overall_CB_Status,Sexual_Type1,Sexual_Type2,Physical_Appearance,Race,Intellectual,General_Hate
count,7943.0,7943.0,7943.0,7943.0,7943.0,7943.0,7943.0
mean,0.13408,0.02883,0.013471,0.019388,0.013723,0.019262,0.068362
std,0.341867,0.16734,0.115287,0.137894,0.116345,0.137454,0.252382
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3.0,1.0,1.0,1.0,1.0,1.0,1.0


The 'Overall_CB_Status' attribute contains a value larger than 1. Therefore, the rows with a value greater than 1 should be removed from the dataset.

In [8]:
# Keep only the rows whose 'Overall_CB_Status' is a valid binary label
# (0 or 1). Any other value (e.g. the 3 seen in the statistics above, or a
# negative number) is treated as a labelling error and the row is dropped.
df = df[df['Overall_CB_Status'].isin([0, 1])]
df.describe()

Unnamed: 0,Overall_CB_Status,Sexual_Type1,Sexual_Type2,Physical_Appearance,Race,Intellectual,General_Hate
count,7942.0,7942.0,7942.0,7942.0,7942.0,7942.0,7942.0
mean,0.133719,0.028834,0.013473,0.019391,0.013725,0.019265,0.068371
std,0.340372,0.16735,0.115295,0.137902,0.116352,0.137462,0.252397
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Text Cleaning

In [9]:
# Work on an independent copy so that `df` keeps the uncleaned comments
text_formatting = df.copy()

In [10]:
def text_to_lower(text):
    """
    Return the input as a lower-cased string
    Example: Hello World => hello world
    Args:
        text (any): value to convert; non-string values are passed through str()

    Returns:
        (str): lower-cased text for string input, str(text) for anything else
    """
    return text.lower() if isinstance(text, str) else str(text)

def appos_look_up(text):
    """
    Expand apostrophe contractions to their full form
    Example: I don't know what is going on?  => I do not know what is going on?
    Args:
        text (str): text

    Returns:
        apposed (str) : text in which every whitespace-separated word whose
            lower-cased form is a key of `appos_dict` is replaced by the
            mapped value; all words are re-joined with single spaces
    """
    expanded = [
        appos_dict[token.lower()] if token.lower() in appos_dict else token
        for token in text.split()
    ]
    return " ".join(expanded)

def remove_words_start_with(text, starts_with_char):
    """
    Remove words start with character `starts_with_char`
    Example: dhoni rocks with last ball six #dhoni #six => dhoni rocks with last ball six (start_char_with='#')
    Args:
        text (str): text
        starts_with_char (str): starting characters of word, which to be removed from text.
            Treated as literal text, so characters such as '$' or '+' are safe to pass.

    Returns:
        text (str): text with removed words start with given chars, stripped
            of leading and trailing whitespace
    """
    # Escape the prefix so it is matched literally, and remove every
    # "<prefix><word characters>" run in a single pass. Removing match by
    # match would truncate longer words that share a prefix with a shorter
    # one (e.g. "#six" being cut out of "#sixty").
    pattern = re.escape(starts_with_char) + r'\w*'
    return re.sub(pattern, '', text).strip()

def separate_digit_text(text):
    """
    Insert a space between a run of digits and the letters directly after it
    Example: I will be booking tickets for 2adults => I will be booking tickets for 2 adults
    Args:
        text (str): text
    Returns:
        clean_text (str): text in which digits followed by letters are
            separated by one space
    """
    return re.sub(r'([\d]+)([a-zA-Z]+)', r'\1 \2', text)

def slang_look_up(text):
    """
    Replace slang words in text by their standard form
    Example: hi, thanq so mch => hi, thank you so much
    Args:
        text (str): text
    Returns:
        slanged (str): text in which every whitespace-separated word whose
            lower-cased form is a key of `slangs_dict` is replaced by the
            mapped value; all words are re-joined with single spaces
    """
    def restore(token):
        # The lookup is case-insensitive; unknown words are kept as written
        key = token.lower()
        return slangs_dict[key] if key in slangs_dict else token

    return " ".join(map(restore, text.split()))

def remove_punctuations(text):
    """
    Removed special characters from text
    Example: he: I am going. are you coming? => he I am going. are you coming
   
    Args:
        text (str): text
   
    Returns:
        clean_text (str): cleaned text with removed special characters
    """
    regex_pattern = re.compile(r'[\,+\:\?\!\"\(\)!\'\.\%\[\]]+')
    clean_text = regex_pattern.sub(r' ', text)
    clean_text = clean_text.replace('-', '')
    return clean_text

def remove_extra_space(text):
    """
    Collapse every run of whitespace to one space and trim both ends
    Example: hey are   you coming. ? => hey are you coming. ?
    Args:
        text (str): text
    Returns:
        clean_text (str): text whose words are separated by single spaces
    """
    # split() without arguments already ignores leading/trailing whitespace
    tokens = text.split()
    return ' '.join(tokens)

def emoticons_look_up(text):
    """
    Replace emoticons in text by the value mapped to them in `emoticons`
    Only whole whitespace-separated tokens are replaced; an emoticon that is
    merely a substring of a longer token is left alone, and the original
    whitespace between tokens is preserved.
    Args:
        text (str): text
    Returns:
        text (str): text with every token that is a key of `emoticons`
            replaced by its mapped value
    """
    # A capturing split keeps the separators: even indices hold the tokens,
    # odd indices hold the whitespace runs between them.
    pieces = re.split(r'(\s+)', text)
    for index in range(0, len(pieces), 2):
        token = pieces[index]
        if token and token in emoticons:
            pieces[index] = emoticons[token]
    return ''.join(pieces)

def removal_non_letter_digit_whitespaces(text):
    """
    Keep only ASCII letters, digits and whitespace
    Example: hello, world! 123 => hello world 123
    Args:
        text (str): text
    Returns:
        text (str): text in which every other character is turned into a
            space, with whitespace runs then collapsed to single spaces
    """
    spaced = re.sub(r'[^a-zA-Z0-9\s]', ' ', text)
    return ' '.join(spaced.split())

def remove_single_char_word(text):
    """
    Drop every whitespace-separated word that is one character long
    Example: I am in a home for 2 years => am in home for years
    Args:
        text (str): text

    Returns:
        (str): remaining words joined with single spaces
    """
    return " ".join(filter(lambda token: len(token) > 1, text.split()))

def remove_repeated_characters(text):
    """
    Shorten every run of one repeated character to exactly two characters
    Example: I am verrry happpyyy today => I am verry happyy today
    Args:
        text (str): text

    Returns:
        clean_text (str): text in which no character occurs more than twice
            in a row (this applies to any character except a newline,
            including spaces)
    """
    return re.sub(r'(.)\1+', r'\1\1', text)

def removal_of_digits(text):
    """
    Delete every ASCII digit (0-9) from text
    Example: I will be there on 22 april. => I will be there on  april.
    Args:
        text (str): text
    Returns:
        clean_text (str): text without digits; the surrounding characters,
            including spaces, are left untouched
    """
    return re.sub(r'[0-9]', '', text)

def remove_stop_words(text, stop_words=None):
    """
    This function removes stop words from text
    Example: i am very excited for the football match => excited football match
             (with the NLTK English stop words)
    The comparison is case-sensitive and the text is split on single spaces,
    so consecutive spaces in the input are preserved.
    Params
        text (str) :text on which processing needs to done
        stop_words (iterable of str) : stop words which needs to be removed;
            defaults to the module-level `stopwords_default` when omitted
    Returns
        text(str): text after stop words removal
    """
    # Resolve the default at call time; this also avoids a mutable default
    if stop_words is None:
        stop_words = stopwords_default

    # Honour the caller-supplied words (they used to be ignored) and use a
    # set for constant-time membership tests
    stop_word_set = set(stop_words)
    kept_words = [word for word in text.split(" ") if word not in stop_word_set]
    return " ".join(kept_words)

def auto_translate_to_english(text):
    """
    Translate text to English when it is detected as another language
    Args:
        text (str): text
    Returns:
        (str): the text itself when it contains no alphabetic character or
            is detected as English ('en'); otherwise the result of
            `Translator.translate`
    """
    # Nothing to detect or translate without letters: hand the text back
    # unchanged instead of implicitly returning None
    if not any(char.isalpha() for char in text):
        return text

    # Detect the language of the input text
    # NOTE(review): langdetect results can vary between runs unless its
    # seed is fixed - confirm whether reproducibility matters here.
    detected_lang = detect(text)

    # No need to translate if it's already English
    if detected_lang == 'en':
        return text

    # Pass the detected language explicitly as the source language rather
    # than relying on the translator's default source language
    translator = Translator(from_lang=detected_lang, to_lang='en')
    return translator.translate(text)


def check_word_spelling(text):
    """
    Replace misspelled words in text by their most likely correction
    A word is corrected when `SpellChecker.unknown` reports it and
    `SpellChecker.correction` returns a suggestion. Only whole
    whitespace-separated words are replaced, and the original whitespace
    between words is preserved.
    Args:
        text (str): text
    Returns:
        text (str): text with the corrected words
    """
    # NOTE: a new SpellChecker is built on every call
    spell = SpellChecker()
    # Tokenize the text into words
    words = text.split()
    # Ask for the unknown words once instead of once per word
    unknown_words = spell.unknown(words)

    # Map each distinct misspelled word to its one `most likely` correction;
    # words without a suggestion (None) are left out and stay unchanged
    corrections = {}
    for word in set(words):
        if word in unknown_words:
            corrected_word = spell.correction(word)
            if corrected_word is not None:
                corrections[word] = corrected_word

    if not corrections:
        return text

    # Replace token by token (the capturing split keeps the separators) so a
    # misspelling that is a substring of another word cannot corrupt it
    pieces = re.split(r'(\s+)', text)
    return ''.join(corrections.get(piece, piece) for piece in pieces)

def remove_words_ending_with_com(text):
    """
    Remove words of the form <word characters>.com from text
    Example: visit google.com now => visit  now
    Args:
        text (str): text
    Returns:
        text (str): text with every such match replaced by an empty string
    """
    return re.sub(r'\b\w+\.com\b', '', text)

# Tell whether a string holds at least one alphabetic or digit character
def contains_alphabets_or_numbers(text):
    for char in text:
        if char.isalpha() or char.isdigit():
            return True
    return False

In [13]:
# Lower-case every comment first so the later lookups and filters see
# lower-case text
text_formatting['Comment'] = text_formatting['Comment'].apply(text_to_lower)

# Remove mentions and hashtags
starts_with_char_list = ['@', '#']
for char in starts_with_char_list:
    text_formatting['Comment'] = text_formatting['Comment'].apply(lambda x : remove_words_start_with(x,starts_with_char=char))

# Cleaning steps applied to every comment, in this exact order
cleaning_steps = [
    remove_words_ending_with_com,
    appos_look_up,
    separate_digit_text,
    slang_look_up,
    emoticons_look_up,
    remove_punctuations,
    removal_non_letter_digit_whitespaces,
    remove_single_char_word,
    remove_repeated_characters,
    removal_of_digits,
    remove_stop_words,
    remove_extra_space,
]
for step in cleaning_steps:
    text_formatting['Comment'] = text_formatting['Comment'].apply(step)

# Drop comments left without any letter or digit. The explicit copy makes
# the result an independent dataframe, so the column assignments below do
# not operate on a slice of the previous one.
has_content = text_formatting['Comment'].apply(contains_alphabets_or_numbers)
text_formatting = text_formatting[has_content].copy()

# Translation and spell checking run last, on the surviving comments only
for step in (auto_translate_to_english, check_word_spelling):
    text_formatting['Comment'] = text_formatting['Comment'].apply(step)

In [14]:
# Write the cleaned dataframe to a CSV file, leaving out the index column
file_path = './preprocess_data_v0.csv'
text_formatting.to_csv(file_path, index=False)

NameError: name 'text_formating' is not defined