# Data Cleaning

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
#Load the 'titanic' dataset through seaborn and preview its first five rows
data = sns.load_dataset("titanic")
data.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## Renaming columns

In [4]:
#Template cell: replace the placeholder mapping with the real {old: new} pairs.
#Labels that are not columns of the frame are ignored by rename's default
#errors="ignore", so the placeholder mapping leaves the data unchanged.
#The result is rebound instead of using inplace=True so the step stays chainable.
data = data.rename(columns={'old_name': 'new_name'})

## Checking zero variance columns

In [4]:
def count_unique_values(data):
    """Count the distinct values held by each column of ``data``.

    Returns a one-column DataFrame indexed by the column names of ``data``.
    Its single column, "Number of unique values", holds ``data.nunique()``,
    and the rows are ordered from the highest count to the lowest.
    """
    counts_label = "Number of unique values"
    per_column_counts = pd.DataFrame({counts_label: data.nunique()})
    return per_column_counts.sort_values(by=[counts_label], ascending=False)

In [5]:
#Tally the distinct values of every column, then display the table (largest count first)
unique_values = count_unique_values(data)
unique_values

Unnamed: 0,Number of unique values
bill_length_mm,164
body_mass_g,94
bill_depth_mm,80
flipper_length_mm,55
species,3
island,3
sex,2


In [6]:
def delete_unique_valued_columns(data:pd.DataFrame, unique_values:pd.DataFrame) -> None:
    """Drop, in place, every column of ``data`` that holds a single distinct value.

    Parameters
    ----------
    data : pd.DataFrame
        Frame to prune; it is modified in place.
    unique_values : pd.DataFrame
        Distinct-value counts indexed by column name, e.g. the table built by
        ``count_unique_values``. A Series of counts is accepted as well.

    Returns
    -------
    None

    Notes
    -----
    Only a count of exactly 1 selects a column; labels whose count is 0 are
    left untouched. Labels in ``unique_values`` that are not columns of
    ``data`` are skipped instead of raising ``KeyError``, so a counts table
    computed before other columns were removed can still be used.
    """
    is_single_valued = unique_values == 1
    #A DataFrame of counts gives a 2-D mask: require every count in the row to be 1
    if is_single_valued.ndim > 1:
        is_single_valued = is_single_valued.all(axis=1)
    zero_variance_columns = is_single_valued[is_single_valued].index
    #Keep only labels that are still columns of data, so drop cannot raise KeyError
    present_columns = [column for column in zero_variance_columns if column in data.columns]
    data.drop(columns=present_columns, inplace=True)

In [7]:
#Remove, in place, the columns whose unique-value count is exactly 1
delete_unique_valued_columns(data=data, unique_values=unique_values)

## Dropping columns

In [None]:
columns_to_drop = ['deck']
#errors="ignore" skips names that are not in the dataset, so neither a
#per-column loop nor an explicit membership check is needed to avoid a KeyError
data = data.drop(columns=columns_to_drop, errors="ignore")

# Basic pre-processing

In [7]:
#Applying a function to a column doesn't always require apply or a lambda:
#the vectorized .str accessor takes the first character of every value directly
data['embark_town'].fillna(" ").str[0].head(3)

0    S
1    C
2    S
Name: embark_town, dtype: object

## Looking for duplicate data

In [10]:
#Boolean mask: True for every row that repeats an earlier row in all columns
duplicated_rows = data.duplicated(keep="first")

In [11]:
#Show the number of duplicated rows: summing the boolean mask counts its True entries
duplicated_rows.sum()

0

In [12]:
#Display the rows flagged as duplicates by the boolean mask
data.loc[duplicated_rows]

Unnamed: 0,Species,Island,Bill_length,Bill_depth,Flipper_length,Body_mass,Sex


In [13]:
#Duplicates are deleted in place; the first occurrence of each repeated row is kept
data.drop_duplicates(keep="first", inplace=True)

## Doing text pre-processing

In [14]:
#Collect the names of the columns whose dtype is object or category (the text columns)
textColumns = data.select_dtypes(include=['object', 'category']).columns.tolist()

In [15]:
def _normalize_text(value):
    """Return ``value`` as lower-cased, stripped text; missing values pass through unchanged."""
    #Without this guard str() would turn a missing entry into the literal text "nan"
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return value
    return str(value).lower().strip()

#All text is lowered and surrounding white spaces are deleted in a single pass.
#Series.map is applied column by column instead of DataFrame.applymap, which is
#deprecated since pandas 2.1; the cast to object makes categorical columns be
#mapped value by value like the other text columns.
data[textColumns] = data[textColumns].astype(object).apply(lambda column: column.map(_normalize_text))