# Data Preparation - Basics

## Import Libraries

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder

## Import Data

In [None]:
# Load seaborn's 'penguins' example dataset; every later cell works on this `df`
df = sns.load_dataset('penguins')
# Preview the first rows
df.head()

In [None]:
# Show how many rows belong to each distinct value of the 'island' column
print("Count of distinct values in column 'island':\n")
df['island'].value_counts()

In [None]:
# Get a concise summary of the dataframe (columns, non-null counts, dtypes).
# DataFrame.info() writes the report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

In [None]:
# Get descriptive statistics of the dataframe (count, mean, std, min, quartiles, max);
# with default arguments describe() summarizes only the numeric columns
print(df.describe())

## Drop Useless Information

In [None]:
# Remove the 'bill_depth_mm' column (no later cell shown here uses it) and preview the result
df = df.drop(columns=['bill_depth_mm'])
df.head()

## Rebuild Missing Data

In [None]:
# First report which columns hold at least one missing value,
# then how many missing values each column holds
print("Columns containing 'na' values:\n")
print(df.isna().any())
print('\n----------------------------------------------------\n')
print("Count of 'na' values per column:\n")
print(df.isna().sum())

In [None]:
# Fill the missing 'bill_length_mm' entries with the mean of that column
# (Series.mean() skips NaN by default), then re-check what is still missing
df['bill_length_mm'] = df['bill_length_mm'].fillna(df['bill_length_mm'].mean())
print("Count of 'na' values per column:\n")
print(df.isna().sum())

In [None]:
def remove_missing_values(df):
    """Return a copy of ``df`` without the rows that contain any missing value.

    Also prints how many missing cells (summed over all columns) the input
    held; note this is a count of cells, not of dropped rows.

    Args:
        df: DataFrame to clean; it is not modified.

    Returns:
        The DataFrame produced by ``df.dropna()``.
    """
    # Count before dropping, otherwise the total would always be zero
    total_missing = df.isnull().sum().sum()
    cleaned = df.dropna()
    print("Removed {} missing values".format(total_missing))
    return cleaned

In [None]:
# Remove the remaining missing values: every row that still has at least one NaN is dropped
# (only 'bill_length_mm' was imputed earlier)
df = remove_missing_values(df)

## Outliers Handling

In [None]:
def outlier_handling(df, column_with_outliers, lower_bound=0.25, upper_bound=0.75):
    """Drop the rows of ``df`` that lie outside the IQR fences of the given column(s).

    The fences are ``q1 - 1.5 * iqr`` and ``q3 + 1.5 * iqr``, where ``q1`` and
    ``q3`` are the ``lower_bound`` and ``upper_bound`` quantiles and
    ``iqr = q3 - q1``. Both fences are exclusive. A NaN value never satisfies
    the comparisons, so rows with NaN in a checked column are dropped as well.

    Args:
        df: DataFrame to filter; it is not modified.
        column_with_outliers: A single column label, or a list of labels. With
            a list, a row is kept only if it is inside the fences of every
            listed column.
        lower_bound: Quantile used as ``q1``.
        upper_bound: Quantile used as ``q3``.

    Returns:
        A DataFrame holding the rows of ``df`` strictly inside the fences, with
        all original columns preserved.
    """
    values = df[column_with_outliers]
    q1 = values.quantile(lower_bound)
    q3 = values.quantile(upper_bound)
    iqr = q3 - q1
    inside = (values > (q1 - 1.5 * iqr)) & (values < (q3 + 1.5 * iqr))
    # A list of labels yields a 2-D mask (one column per label). Indexing df
    # with a DataFrame mask goes through DataFrame.where (cell masking) rather
    # than row selection, so collapse it to a single keep/drop flag per row.
    if inside.ndim == 2:
        inside = inside.all(axis=1)
    return df[inside]

In [None]:
# Print the row count before and after filtering to show how many rows were dropped
print("# rows before outliers handling: {} \n...".format(df.shape[0]))
# Pass the column as a plain string label so the quartiles are scalars and the
# comparison yields one boolean per row, i.e. a row filter.
df = outlier_handling(df, column_with_outliers='flipper_length_mm', lower_bound=0.25, upper_bound=0.75)
print("# rows after outliers handling: {} \n...".format(df.shape[0]))

## Standardize Data

In [None]:
###

## Normalize Data

In [None]:
###

## Removing Duplicates

In [None]:
# duplicated() marks every repeat of an earlier row as True; summing the mask counts them
print("Detected {} duplicated row(s):\n".format(df.duplicated().sum()))
# Show the duplicated rows themselves
df[df.duplicated()]

In [None]:
# Keep the first occurrence of each duplicated row and renumber the index 0..n-1 in one step
df = df.drop_duplicates(ignore_index=True)
print("Duplicated row(s) after cleaning: {}".format(df.duplicated().sum()))

## Data Enrichment

In [None]:
# Swap each original island name for its replacement label (old value -> new value)
df['island'] = df['island'].replace({
    'Biscoe': 'Sicilia',
    'Dream': 'Sardegna',
    'Torgersen': 'Corsica',
})
df['island'].value_counts()

In [None]:
# Rename 'island' to 'italian_island', then upper-case every column label
df = df.rename(columns={"island": "italian_island"}).rename(columns=str.upper)

In [None]:
# Encoding Categorical Variables:
# replace 'ITALIAN_ISLAND' with one indicator column per distinct value
# (get_dummies names them 'ITALIAN_ISLAND_<value>' by default)
df = pd.get_dummies(df, columns=['ITALIAN_ISLAND'])
df.head()

In [None]:
# Encode into a single column: LabelEncoder maps each distinct 'SEX' value to an
# integer label; the original 'SEX' column is kept next to the new 'SEX_ENCODED' one
le = LabelEncoder()
df["SEX_ENCODED"] = le.fit_transform(df["SEX"])
df.head()

In [None]:
# Binning: qcut with q=3 cuts 'BODY_MASS_G' into three quantile-based intervals,
# so each bin receives roughly the same number of rows
df['BODY_MASS_G_bin'] = pd.qcut(df['BODY_MASS_G'], q=3)
# Show how many rows landed in each interval
print(df['BODY_MASS_G_bin'].value_counts())

In [None]:
# Compare the raw body mass with the interval it was assigned to
df[['BODY_MASS_G', 'BODY_MASS_G_bin']].head()