# Cyberbullying Classification by Type
The objective of this work is to create a multi-label classification model to classify the text according to a set of cyberbullying types - Sexual_Type1, Sexual_Type2, Physical_Appearance, Race, Intellectual, Religious and General Hate.

# Import Python Packages

In [204]:
import pandas as pd
import os
import re
import nltk

from appos.appos import appos_dict
from slangs.slangs import slangs_dict
from emoticons.emoticons import emoticons
from langdetect import detect
from spellchecker import SpellChecker
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import words
nltk.download('words') # Download the English words corpus from nltk
english_words = set(words.words())

nltk.download('stopwords')
stopwords_default=stopwords.words('english') # To import the common stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Albert\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Albert\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data Preparation
## Data Extraction
To extract the data from the given .xlsx files

In [205]:
# Setting the folder path to the location of original dataset
folder_path = r"./Dataset/"

# Create an empty list for storing the created dataframe from original dataset
extracted_df = []

# Loop through all files in the folder
for filename in os.listdir(folder_path):
    # Read only files with name ends with .xlsx
    if filename.endswith('.xlsx'):
        excel_file = os.path.join(folder_path, filename)
        xlsx = pd.ExcelFile(excel_file)
        sheet_names = xlsx.sheet_names # Read the sheet names contain inside the excel file
        dataframe = {}
        defined_column_name = ["Comment_Number",
                               "Commenter_Username",
                               "Comment",
                               "Comment_Post_Time",
                               "Overall_CB_Status",
                               "CB_Type",
                               "Sexual_Type1", "Unnamed_1", "Unnamed_2",
                               "Sexual_Type2", "Unnamed_3", "Unnamed_4",
                               "Physical_Appearance", "Unnamed_5", "Unnamed_6",
                               "Race", "Unnamed_7", "Unnamed_8",
                               "Intellectual", "Unnamed_9", "Unnamed_10",
                               "Religious", "Unnamed_11", "Unnamed_12",
                               "General_Hate", "Unnamed_13", "Unnamed_14",
                               "Purpose_of_CB",
                               "Insult", "Unnamed_15", "Unnamed_16",
                               "Defensive", "Unnamed_17", "Unnamed_18",
                               "Directionality", 
                               "Directed_Username", "Unnamed_19", "Unnamed_20",
                               "Other_Aspects",
                               "Depression", "Unnamed_21", "Unnamed_22",
                               "Suicides", "Unnamed_23", "Unnamed_24",
                               "Stress", "Unnamed_25", "Unnamed_26",
                               "Discrimination", "Unnamed_27", "Unnamed_28"]
        
        for sheet_name in sheet_names:
            # Retrive the dataframe for that specific sheet name
            dataframe[sheet_name] = pd.read_excel(excel_file, sheet_name)
            
            # Redefine the columns' name based on the defined_column_name
            try:
                dataframe[sheet_name].columns = defined_column_name
            except:
                print(f'{filename} : {sheet_name}.')
                
        
        for item in dataframe.values():
            # Drop the first two rows of the dataframe due to formatting issue
            item = item.drop(item.index[:2])
            
            # Drop unnecessary attributes/columns from the dataframe
            columns_to_drop = [col for col in item.columns if any(substring in col for substring in ["Comment_Number",
                                                                                                     "Unnamed",
                                                                                                     "Commenter_Username",
                                                                                                     "Comment_Post_Time",
                                                                                                     "CB_Type", 
                                                                                                     "Purpose_of_CB",
                                                                                                     "Insult",
                                                                                                     "Religious",
                                                                                                     "Defensive",
                                                                                                     "Other_Aspects",
                                                                                                     'Suicides',
                                                                                                     "Stress",
                                                                                                     "Depression",
                                                                                                     "Discrimination",
                                                                                                     "Directionality", 
                                                                                                    "Directed_Username"])]

            data = item.drop(columns=columns_to_drop)
            # Store the data to extracted_df
            extracted_df.append(data)

## Data Preprocessing
In this stage, we will perform the following tasks:
* Remove any rows that contains duplicate items and empty values for 'Comment' attribute
* Missing values in the attributes, any missing values will be replaced by 0. Two assumption are made:
    - All the cyberbullying texts in the provided original datasets have been correctly labelled as 1. Therefore, the remaining data with NA values will be labelled as 0 which is not a cyberbullying text.
    - All the cyberbullying texts in the provided original datasets have been correctly assigned to their respective cyberbullying type by labelling as 1. Therefore, the remaining cyberbullying type will be labelled as 0
* Check for error data
* Text formatting

### Removal of duplicate items and empty values

In [206]:
# Concatenate all the filtered dataframes into one
df = pd.concat(extracted_df, ignore_index=True)

# Removal of duplicates items in the 'Comment' column
df = df.drop_duplicates(subset='Comment')

# Removal of empty value in the 'Comment' column
df = df.dropna(subset='Comment')

### Replacing NA values for 'Overall_CB_Status' attribute and CB type's attributes
Two assumption are made:
* All the cyberbullying texts in the provided original datasets have been correctly labelled as 1. Therefore, the remaining data with NA values will be labelled as 0 which is not a cyberbullying text.
* All the cyberbullying texts in the provided original datasets have been correctly assigned to their respective cyberbullying type by labelling as 1. Therefore, the remaining cyberbullying type will be labelled as 0

In [207]:
# Check if NA value exists in the concatenated dataframe
check_na = df.isna()
max_retries = 2
retry_count = 0

# Replace NA value to 0 if exists in the dataframe
while (check_na == True).any().any() and retry_count < max_retries:
    print("NA values found in the DataFrame. Replacing NA values with 0......")            
    df = df.fillna(0)
    check_na = df.isna()
    retry_count += 1
          
if (check_na == True).any().any():
    print("Maximum retries reached. Some NA values could not be replaced.")
else:
    print("No NA values founds in the datasets.")

NA values found in the DataFrame. Replacing NA values with 0......
No NA values founds in the datasets.


### Check for Attribute's Data Type
According to the attributes' characteristics, all attributes should be in int (1 : True and 0 : False) with the exceptions for 'Comment' which should be in string/object type. 

In [208]:
# Check for dataframe's data types
df.dtypes

Comment                object
Overall_CB_Status       int64
Sexual_Type1           object
Sexual_Type2            int64
Physical_Appearance     int64
Race                   object
Intellectual            int64
General_Hate           object
dtype: object

Referring to the above output, attributes like Sexual_Type1, Race and General_Hate needs to be converted into int.

In [209]:
selected_column_to_convert = ['Sexual_Type1', 'Race', 'General_Hate']

# Performing data type conversion according to their attributes
for column_name in selected_column_to_convert:
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce').fillna(0)
    df[column_name] = df[column_name].astype('int64')

print(df.dtypes)

Comment                object
Overall_CB_Status       int64
Sexual_Type1            int64
Sexual_Type2            int64
Physical_Appearance     int64
Race                    int64
Intellectual            int64
General_Hate            int64
dtype: object


Referring to the above output, all attributes are now following the right data type after the conversion.

### Check for Error Data
Since all attributes with the exception of 'Comments' are in int (1 : True, 0 : False), and thus the min and max value within the attribute should be either 0 or 1 only.

Any error data will be removed from the dataframe.

In [210]:
df.describe()

Unnamed: 0,Overall_CB_Status,Sexual_Type1,Sexual_Type2,Physical_Appearance,Race,Intellectual,General_Hate
count,7942.0,7942.0,7942.0,7942.0,7942.0,7942.0,7942.0
mean,0.134097,0.028834,0.013473,0.019391,0.013725,0.019265,0.068371
std,0.341885,0.16735,0.115295,0.137902,0.116352,0.137462,0.252397
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,3.0,1.0,1.0,1.0,1.0,1.0,1.0


For 'Overall_CB_Status' attribute, it contains value larger than 1. Therefore, the respective rows with value greater than 1 should be removed from the dataset.

In [211]:
# Remove rows in Overall_CB_Status attriute where the value is not 0 or 1
df = df[df['Overall_CB_Status'].isin([0, 1])]
df.describe()

Unnamed: 0,Overall_CB_Status,Sexual_Type1,Sexual_Type2,Physical_Appearance,Race,Intellectual,General_Hate
count,7941.0,7941.0,7941.0,7941.0,7941.0,7941.0,7941.0
mean,0.133736,0.028838,0.013474,0.019393,0.013726,0.019267,0.068379
std,0.34039,0.167361,0.115302,0.137911,0.116359,0.137471,0.252412
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Text Formatting
For text formatting, we are using the following techniques:
* to convert the comments into lowercase
* to remove any tagged user id and hashtag info from the comments
* to remove any URL link in the comments
* to separate digit from the comments' words
* to remove digits from the comments
* to replace contractions found in the comments
* to replace any abbreviation words found in the comments
* to replace emoticons from the comments with words
* to remove punctuation
* to remove repeated character in a word with a max allowable repetition of 2 from the comments
* to remove single character word found in the comments
* to remove common stop words from the comments
* to remove any character that is not a letter, digit, or whitespace
* to remove additional white spaces
* to conduct spell checking
* to remove any rows that do not contain single word in the comments
* to remove any rows that do not contain any english words
* to conduct word lemmatization on the comment

In [212]:
# To convert the text into lowercase
def text_to_lower(text):
    if isinstance (text, str):
        lower_text = str(text.lower())
    else:
        lower_text = str(text)
    return lower_text

# To remove any tagged user id and hashtag info
def remove_words_start_with(text, starts_with_char):
    urls = re.finditer(starts_with_char + r'[A-Za-z0-9\w]*', text)
    for i in urls:
        text = re.sub(i.group().strip(), '', text)
    return text.strip()

# To remove any URL link
def remove_words_ending_with_com(text):
    # Define a regular expression pattern to match words ending with ".com"
    pattern = r'\b\w+\.com\b'
    # Use the re.sub function to replace all matches with an empty string
    text = re.sub(pattern, '', text)
    return text

# To separate digit from text
def separate_digit_text(text):
    regex_patter = re.compile(r'([\d]+)([a-zA-Z]+)')
    clean_text = regex_patter.sub(r'\1 \2', text)
    return clean_text

# To remove digits from the text
def removal_of_digits(text):
    regex_pattern = re.compile(r'[0-9]')
    text = regex_pattern.sub('', text)
    return text

# To replace contraction
def appos_look_up(text):
    words = text.split()
    new_text = []
    for word in words:
        word_s = word.lower()
        if word_s in appos_dict:
            new_text.append(appos_dict[word_s])
        else:
            new_text.append(word)
    apposed = " ".join(new_text)
    return apposed

# To replace any abbreviation words found in the text
def slang_look_up(text):
    words = text.split()
    new_text = []

    for word in words:
        word_s = word.lower()
        if word_s in slangs_dict:
            new_text.append(slangs_dict[word_s])
        else:
            new_text.append(word)
    slanged = " ".join(new_text)
    return slanged

# To replace emoticons with text
def emoticons_look_up(text):
    words = text.split()
    for word in words:
        if word in emoticons:
            text = text.replace(word, emoticons[word])
    return text

# To remove punctuations
def remove_punctuations(text):
    regex_pattern = re.compile(r'[\,+\:\?\!\"\(\)!\'\.\%\[\]]+')
    clean_text = regex_pattern.sub(r' ', text)
    clean_text = clean_text.replace('-', '')
    return clean_text

# To remove repeated character in a word with a max allowable repetition of 2
def remove_repeated_characters(text):
    """
    Remove repeated characters (>2) in words to max limit of 2
    Example: I am verrry happpyyy today => I am verry happyy today
    Args:
        text (str): text
    Returns:
        clean_text (str): cleaned text with removed repeated chars
    """
    regex_pattern = re.compile(r'(.)\1+')
    text = regex_pattern.sub(r'\1\1', text)
    return text

# To remove a single character word
def remove_single_char_word(text):
    words = text.split()
    filter_words = [word for word in words if len(word) > 1]
    return " ".join(filter_words)

# To remove common stop words
def remove_stop_words(text, stop_words=stopwords_default):
    stop_words = set(stopwords_default)
    split_list = text.split(" ")
    split_list = [word for word in split_list if word not in stop_words]
    return " ".join(split_list)

# To remove any character that is not a letter, digit, or whitespace
def removal_non_letter_digit_whitespaces(text):
    pattern = r'[^a-zA-Z0-9\s]'  
    text = re.sub(pattern, ' ', text)
    text = ' '.join(text.split())
    return text

# To remove additional white space
def remove_extra_space(text):
    clean_text = ' '.join(text.strip().split())
    return clean_text

def check_word_spelling(text):
    # Initialize the spell checker
    spell = SpellChecker()
    # Tokenize the text into words
    words = text.split()
    # Create a dictionary to store misspelled words and their corrected versions
    misspelled_dict = {}

    # Iterate through the words in the text
    for word in words:
        # Check if the word is misspelled
        if word in spell.unknown(words):
            # Get the one `most likely` correction
            corrected_word = spell.correction(word)
            # Store the misspelled word and its correction in the dictionary
            misspelled_dict[word] = corrected_word

    # Replace misspelled words with their corrected versions in the text
    for misspelled, corrected in misspelled_dict.items():
        if corrected is not None:
            text = text.replace(misspelled, corrected)

    return text

# To check if the rows consist any english words
def any_english_word_in_list(text):
    word_list = word_tokenize(text)
    english_words = set(words.words())
    return any(word.lower() in english_words for word in word_list)

In [213]:
# To create a copy of the df to text_formatting
text_formatting = df.copy()

In [214]:
text_formatting['Comment'] = text_formatting['Comment'].apply(lambda x : text_to_lower(x))
starts_with_char_list = ['@', '#']
for char in starts_with_char_list:
    text_formatting['Comment'] = text_formatting['Comment'].apply(lambda x : remove_words_start_with(x,starts_with_char=char))

In [215]:
# List of functions to apply
functions_to_apply = [
    remove_words_ending_with_com,
    separate_digit_text,
    removal_of_digits,
    appos_look_up,
    slang_look_up,
    emoticons_look_up,
    remove_punctuations,
    remove_repeated_characters,
    remove_single_char_word,
    remove_stop_words,
    removal_non_letter_digit_whitespaces,
    remove_extra_space,
]

# Apply functions in a loop
for func in functions_to_apply:
    text_formatting['Comment'] = text_formatting['Comment'].apply(func)

In [216]:
# To remove any rows with empty value in the Comment attribute prior to conduct spell checking
text_formatting = text_formatting[text_formatting['Comment'] != '']
text_formatting['Comment'] = text_formatting['Comment'].apply(lambda x : check_word_spelling(x))


In [217]:
# To conduct word lemmatizer and save the result as a new column
lematizer=WordNetLemmatizer()

def lemmatizer_words(text):
    return " ".join([lematizer.lemmatize(word) for word in text.split()])

result = list(text_formatting['Comment'].apply(lambda x : lemmatizer_words(x)))

index_to_insert = 1
new_column_name = "Lemmatized_Comment"
final_df = text_formatting.copy()
final_df.insert(index_to_insert, new_column_name, result)

In [218]:
# Filter rows with at least one English word in the list
clean_df = final_df[final_df['Lemmatized_Comment'].apply(lambda x: any_english_word_in_list(x))]

In [221]:
column_to_drop = ['Comment', 'Overall_CB_Status']
final_df = clean_df.drop(columns=column_to_drop)
final_df

Unnamed: 0,Lemmatized_Comment,Overall_CB_Status,Sexual_Type1,Sexual_Type2,Physical_Appearance,Race,Intellectual,General_Hate
0,zany,0,0,0,0,0,0,0
1,larry,0,0,0,0,0,0,0
3,get gif cannot have phone,0,0,0,0,0,0,0
4,larry zany sexy niall liam something stupid back,0,0,0,0,0,0,0
7,pretty much,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
8495,daisy one,0,0,0,0,0,0,0
8496,damn really want one booty argentina,0,0,0,0,0,0,0
8497,want blue whale one,0,0,0,0,0,0,0
8498,puzzle,0,0,0,0,0,0,0


In [222]:
# Save the final_df into a .csv file
file_path = "./preprocess_data.csv"
final_df.to_csv(file_path,index=False)

# Data Balancing

In [298]:
file_path = "./preprocess_data.csv"
df =pd.read_csv(file_path)
df.head()

Unnamed: 0,Lemmatized_Comment,Sexual_Type1,Sexual_Type2,Physical_Appearance,Race,Intellectual,General_Hate
0,zany,0,0,0,0,0,0
1,larry,0,0,0,0,0,0
2,get gif cannot have phone,0,0,0,0,0,0
3,larry zany sexy niall liam something stupid back,0,0,0,0,0,0
4,pretty much,0,0,0,0,0,0
...,...,...,...,...,...,...,...
6324,daisy one,0,0,0,0,0,0
6325,damn really want one booty argentina,0,0,0,0,0,0
6326,want blue whale one,0,0,0,0,0,0
6327,puzzle,0,0,0,0,0,0


In [299]:
selected_columns_balancing = ['Sexual_Type1', 'Sexual_Type2', 'Physical_Appearance', 'Race', 'Intellectual', 'General_Hate']
for column in selected_columns_balancing:
    result = df[column].value_counts()
    print(result)

0    6104
1     225
Name: Sexual_Type1, dtype: int64
0    6223
1     106
Name: Sexual_Type2, dtype: int64
0    6177
1     152
Name: Physical_Appearance, dtype: int64
0    6224
1     105
Name: Race, dtype: int64
0    6178
1     151
Name: Intellectual, dtype: int64
0    5803
1     526
Name: General_Hate, dtype: int64


## Data Balancing Type 1

In [300]:
df_no_cb = df[(df['Sexual_Type1'] == 0) & 
              (df['Sexual_Type2'] == 0) & 
              (df['Physical_Appearance'] == 0) & 
              (df['Race'] == 0) & 
              (df['Intellectual'] == 0) &
              (df['General_Hate'] == 0)].reset_index(drop=True)
df_no_cb

Unnamed: 0,Lemmatized_Comment,Sexual_Type1,Sexual_Type2,Physical_Appearance,Race,Intellectual,General_Hate
0,zany,0,0,0,0,0,0
1,larry,0,0,0,0,0,0
2,get gif cannot have phone,0,0,0,0,0,0
3,larry zany sexy niall liam something stupid back,0,0,0,0,0,0
4,pretty much,0,0,0,0,0,0
...,...,...,...,...,...,...,...
5397,daisy one,0,0,0,0,0,0
5398,damn really want one booty argentina,0,0,0,0,0,0
5399,want blue whale one,0,0,0,0,0,0
5400,puzzle,0,0,0,0,0,0


In [307]:
sampled_df_no_cb = df_no_cb.sample(n=3000, random_state=42)

In [308]:
# Separate features and label
selected_columns_balancing = ['Sexual_Type1', 'Sexual_Type2', 'Physical_Appearance', 'Race', 'Intellectual', 'General_Hate']

balanced_df = sampled_df_no_cb.copy()
for column in selected_columns_balancing:
    X = df['Lemmatized_Comment']
    y = df[column]
    # Reshape X to a 2D array
    X = np.array(X).reshape(-1, 1)
    # Reshape y to a 2D array
    y = np.array(y).reshape(-1, 1)
    
    # Set the desired number of oversampled instances
    desired_samples = 600  # Change this to your desired number

    # Calculate the sampling strategy
    minority_class_count = np.sum(y == 1)
    sampling_strategy = {0: len(y) - minority_class_count, 1: desired_samples}

    # Initialize RandomOverSampler with the specified sampling strategy
    ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=42)
    
    # Perform oversampling
    X_resampled, y_resampled = ros.fit_resample(X, y)
    # Convert y_resampled back to a 1D array
    y_resampled = y_resampled.flatten()
    # Convert X_resampled back to a 1D array of comments
    X_resampled = X_resampled.flatten()

    # Create a new dictionary for oversampled data
    oversampled_data = {
        'Lemmatized_Comment': list(X_resampled),
        column: list(y_resampled),
    }

    oversampled_df = pd.DataFrame(oversampled_data)
    oversampled_df = oversampled_df[oversampled_df[column] == 1]
    balanced_df = pd.concat([balanced_df, oversampled_df], ignore_index=True)
    balanced_df = balanced_df.fillna(0)

In [309]:
selected_columns_balancing = ['Sexual_Type1', 'Sexual_Type2', 'Physical_Appearance', 'Race', 'Intellectual', 'General_Hate']
for column in selected_columns_balancing:
    balanced_df.drop_duplicates(subset='Lemmatized_Comment')
    result = balanced_df[column].value_counts()
    print(result)

0.0    6000
1.0     600
Name: Sexual_Type1, dtype: int64
0.0    6000
1.0     600
Name: Sexual_Type2, dtype: int64
0.0    6000
1.0     600
Name: Physical_Appearance, dtype: int64
0.0    6000
1.0     600
Name: Race, dtype: int64
0.0    6000
1.0     600
Name: Intellectual, dtype: int64
0.0    6000
1.0     600
Name: General_Hate, dtype: int64


In [310]:
selected_columns_balancing = ['Sexual_Type1', 'Sexual_Type2', 'Physical_Appearance', 'Race', 'Intellectual', 'General_Hate']

# Performing data type conversion according to their attributes
for column_name in selected_columns_balancing:
    balanced_df[column_name] = pd.to_numeric(balanced_df[column_name], errors='coerce').fillna(0)
    balanced_df[column_name] = balanced_df[column_name].astype('int64')
print(balanced_df.dtypes)

Lemmatized_Comment     object
Sexual_Type1            int64
Sexual_Type2            int64
Physical_Appearance     int64
Race                    int64
Intellectual            int64
General_Hate            int64
dtype: object


In [311]:
file_path = "./balanced_data_type1.csv"
balanced_df.to_csv(file_path)

## Data Balancing Type 2

In [312]:
# Separate features and label
selected_columns_balancing = ['Sexual_Type1', 'Sexual_Type2', 'Physical_Appearance', 'Race', 'Intellectual', 'General_Hate']

balanced_df = df.copy()
for column in selected_columns_balancing:
    X = df['Lemmatized_Comment']
    y = df[column]
    # Reshape X to a 2D array
    X = np.array(X).reshape(-1, 1)
    # Reshape y to a 2D array
    y = np.array(y).reshape(-1, 1)
    
    # Initialize RandomOverSampler
    ros = RandomOverSampler(random_state=42)

    # Perform oversampling
    X_resampled, y_resampled = ros.fit_resample(X, y)


    # Initialize RandomOverSampler
    ros = RandomOverSampler(random_state=42)
    
    # Perform oversampling
    X_resampled, y_resampled = ros.fit_resample(X, y)
    # Convert y_resampled back to a 1D array
    y_resampled = y_resampled.flatten()
    # Convert X_resampled back to a 1D array of comments
    X_resampled = X_resampled.flatten()

    # Create a new dictionary for oversampled data
    oversampled_data = {
        'Lemmatized_Comment': list(X_resampled),
        column: list(y_resampled),
    }

    oversampled_df = pd.DataFrame(oversampled_data)
    oversampled_df = oversampled_df[oversampled_df[column].index > 6328]
    balanced_df = pd.concat([balanced_df, oversampled_df], ignore_index=True)
    balanced_df = balanced_df.fillna(0)

In [314]:
selected_columns_balancing = ['Sexual_Type1', 'Sexual_Type2', 'Physical_Appearance', 'Race', 'Intellectual', 'General_Hate']
for column in selected_columns_balancing:
    balanced_df.drop_duplicates(subset='Lemmatized_Comment')
    result = balanced_df[column].value_counts()
    print(result)

0.0    35669
1.0     6104
Name: Sexual_Type1, dtype: int64
0.0    35550
1.0     6223
Name: Sexual_Type2, dtype: int64
0.0    35596
1.0     6177
Name: Physical_Appearance, dtype: int64
0.0    35549
1.0     6224
Name: Race, dtype: int64
0.0    35595
1.0     6178
Name: Intellectual, dtype: int64
0.0    35970
1.0     5803
Name: General_Hate, dtype: int64


In [315]:
file_path = "./balanced_data_type2.csv"
balanced_df.to_csv(file_path)