## <span style="color: black;"><b>Data Cleaning for EDA</b></span>


### Import

In [2]:
import pandas as pd

### Load data from all_data.csv into `merged`

In [3]:
# Read the combined dataset from disk into the working frame `merged`.
merged = pd.read_csv(filepath_or_buffer='data/all_data.csv')

In [4]:
# Report the size of the freshly loaded frame (shape is (rows, columns)).
print(f'No of Columns : {merged.shape[1]}')
print(f'No of Rows : {merged.shape[0]}')

No of Columns : 46
No of Rows : 1999516


In [5]:
# Count the missing values in every column.
merged.isna().sum()

id                                           0
comment_text                                 4
split                                        0
created_date                                 0
publication_id                               0
parent_id                               864807
article_id                                   0
rating                                       0
funny                                        0
wow                                          0
sad                                          0
likes                                        0
disagree                                     0
toxicity                                     0
severe_toxicity                              0
obscene                                      0
sexual_explicit                              0
identity_attack                              0
insult                                       0
threat                                       0
male                                   1551516
female       

### Data Cleaning

In [6]:
# Step 1 - Derive a binary `toxic` column from `toxicity`:
# toxic = 1 when toxicity >= 0.5, otherwise toxic = 0.

merged['toxic'] = merged['toxicity'].ge(0.5).astype(int)

In [7]:
# Sanity check: list the distinct values of the new `toxic` column.
pd.unique(merged['toxic'])

array([0, 1])

In [8]:
# Inspect the distinct raw `toxicity` scores that the `toxic` label was derived from.
pd.unique(merged['toxicity'])

array([0.37313433, 0.60526316, 0.66666667, ..., 0.1       , 0.2       ,
       0.4       ])

In [9]:
# Step 2 - Drop rows in which EVERY column listed in `columns_to_check` is NaN.
# Rows with at least one non-null value among these columns are kept
# (how='all'); a single NaN is not enough to drop a row.

columns_to_check = ['asian', 'atheist', 'bisexual', 'black', 'buddhist', 'christian',
                    'female', 'heterosexual', 'hindu', 'homosexual_gay_or_lesbian',
                    'intellectual_or_learning_disability', 'jewish', 'latino', 'male',
                    'muslim', 'other_disability', 'other_gender', 'other_race_or_ethnicity',
                    'other_religion', 'other_sexual_orientation', 'physical_disability',
                    'psychiatric_or_mental_illness', 'transgender', 'white']

# Reassign rather than use inplace=True: the result is the same set of rows,
# but the frame is not mutated behind the back of earlier cells' outputs and
# the statement matches the `merged = merged....` style of the later steps.
merged = merged.dropna(subset=columns_to_check, how='all')



In [10]:
# Step 3 - Remove the parent_id, publication_id and article_id columns.

columns_to_drop = ['parent_id', 'publication_id', 'article_id']
merged = merged.drop(columns_to_drop, axis='columns')

In [11]:
# Step 4 - Remove the rows whose comment_text is NaN.
#
# Author's note for this stage: comment_text had 2 NaN rows.

merged = merged.dropna(axis=0, how='any', subset=['comment_text'])


In [12]:
# Step 5 - Write the cleaned `merged` frame to a .csv file (index column omitted).

merged.to_csv(path_or_buf='data/merged_data.csv', index=False)

### Checking how the data is balanced

In [13]:
# Class balance of the `toxic` label: absolute counts per class first,
# then each class as a percentage of all rows, rounded to one decimal place.
toxic_counts = merged['toxic'].value_counts()
print(toxic_counts)

toxic_percentages = toxic_counts.div(len(merged)).mul(100).round(1)
print(toxic_percentages)

toxic
0    397204
1     50794
Name: count, dtype: int64
toxic
0    88.7
1    11.3
Name: count, dtype: float64


<span style="color: Black;"><b>The data is imbalanced: about 88.7% of the comments are non-toxic and 11.3% are toxic. Address this imbalance after EDA.</b></span>

### No of rows and columns after Data Cleaning

In [14]:
# Report the size of the frame after cleaning (shape is (rows, columns)).
print(f'No of Columns : {merged.shape[1]}')
print(f'No of Rows : {merged.shape[0]}')

No of Columns : 44
No of Rows : 447998
