# Data Cleaning

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn

In [2]:
# Disabled alternative loader (tab-separated file), left here for reference:
#data=pd.read_csv("https://dataverse.lib.virginia.edu/api/access/datafile/1530", sep="\t")
# Dataset home page: https://github.com/allisonhorst/palmerpenguins
# Load the "penguins" dataset through seaborn's load_dataset helper.
data = sns.load_dataset("penguins")

In [3]:
# Preview the first two rows of the loaded frame.
data.head(n=2)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female


## Checking zero variance columns

In [4]:
def count_unique_values(data):
    """Count the distinct values held by every column of ``data``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    pd.DataFrame
        One row per column of ``data`` (indexed by column name) and a single
        column, "Number of unique values", sorted from the most to the fewest
        distinct values. ``nunique`` is called with its defaults, so missing
        values are not counted.
    """
    count_label = "Number of unique values"
    per_column_counts = data.nunique().to_frame(name=count_label)
    return per_column_counts.sort_values(by=[count_label], ascending=False)

In [5]:
# Tally the distinct values per column and display the resulting table.
unique_values = count_unique_values(data)
unique_values

Unnamed: 0,Number of unique values
bill_length_mm,164
body_mass_g,94
bill_depth_mm,80
flipper_length_mm,55
species,3
island,3
sex,2


In [6]:
def delete_unique_valued_columns(data:pd.DataFrame, unique_values:pd.DataFrame):
    """Remove from ``data``, in place, the columns reported as single-valued.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to modify; the selected columns are dropped from this object.
    unique_values : pd.DataFrame
        Table indexed by column name of ``data``. A row is selected only when
        every cell in that row equals 1.

    Returns
    -------
    None
        ``data`` is mutated directly; nothing is returned.
    """
    # A row qualifies only if all of its counts are exactly 1.
    is_single_valued = unique_values.eq(1).all(axis=1)
    single_valued_columns = unique_values.index[is_single_valued.to_numpy()]
    data.drop(columns=single_valued_columns, inplace=True)

In [7]:
# Drop, directly on `data`, every column whose unique-value count is 1.
delete_unique_valued_columns(data=data, unique_values=unique_values)

## Renaming columns

In [8]:
# Print the current column names, one per line.
for column_name in data.columns:
    print(column_name)

species
island
bill_length_mm
bill_depth_mm
flipper_length_mm
body_mass_g
sex


In [9]:
# Mapping from each current column name to its replacement.
new_name = {
    'species': 'Species',
    'island': 'Island',
    'bill_length_mm': 'Bill_length',
    'bill_depth_mm': 'Bill_depth',
    'flipper_length_mm': 'Flipper_length',
    'body_mass_g': 'Body_mass',
    'sex': 'Sex',
}

# Rename the columns on the frame itself, then preview the first rows.
data.rename(columns=new_name, inplace=True)
data.head()

Unnamed: 0,Species,Island,Bill_length,Bill_depth,Flipper_length,Body_mass,Sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


## Looking for duplicate data

In [10]:
# Boolean mask: True for every row that repeats an earlier row exactly.
duplicated_rows = data.duplicated(keep="first")

In [11]:
# Show the number of duplicated rows (each True in the boolean mask counts as 1).
duplicated_rows.sum()

0

In [12]:
# Display the rows flagged as duplicates by the boolean mask.
data.loc[duplicated_rows]

Unnamed: 0,Species,Island,Bill_length,Bill_depth,Flipper_length,Body_mass,Sex


In [13]:
# Remove duplicated rows from `data` in place, keeping the first occurrence.
data.drop_duplicates(keep="first", inplace=True)

## Doing text pre-processing

In [14]:
# Names of the columns stored with object or categorical dtype (the text columns).
textColumns = data.select_dtypes(include=['object', 'category']).columns.tolist()

In [15]:
# Normalise every text column: lower-case the values and strip surrounding
# white space, in a single pass per column.
#
# Missing values are deliberately skipped (na_action="ignore"). Passing them
# through str() would turn NaN into the literal string "nan", which hides the
# missing entries from isna()/dropna() and adds a bogus category.
# Series.map is used because DataFrame.applymap is deprecated since pandas 2.1.
for text_column in textColumns:
    # Cast to object first so categorical columns take the same code path.
    data[text_column] = data[text_column].astype(object).map(
        lambda value: str(value).lower().strip(), na_action="ignore"
    )