## Data Overview, Basic Checks, Conversion, and EDA

### Imports and Loading Data

In [None]:
# Imports for manipulating data
import pandas as pd
import numpy as np

In [None]:
# Load data: read Data.csv (path is relative to the current working directory)
# into the DataFrame `ds` that the rest of the notebook works on
ds = pd.read_csv("Data.csv")

### Overview and Basic Checks

In [None]:
# First and last 10 entries, shown as a single table: a notebook cell only
# renders its last expression, so a separate ds.head(10) line would be discarded
# (if ds has fewer than 20 rows, the overlapping rows appear twice)
pd.concat([ds.head(10), ds.tail(10)])

In [None]:
# Determine dimensions of data as a (number of rows, number of columns) tuple
ds.shape

In [None]:
# Number of duplicated rows (every repeat after the first occurrence).
# duplicated() gives one boolean per row, so its sum is the row count; calling
# count() on the duplicated rows would instead return per-column non-null counts
ds.duplicated().sum()

In [None]:
# Count the missing (NaN/None) values in each column
ds.isna().sum()

In [None]:
# Check whether the id column holds only unique values; evaluates to None
# (nothing is displayed) instead of raising KeyError when there is no id column
ds['id'].is_unique if 'id' in ds.columns else None

### Type Conversion

In [None]:
# Summary of the frame: column names, non-null count and dtype per column,
# plus the index range and memory usage
ds.info()

In [None]:
# Cast the listed columns to the pandas categorical dtype
# (replace the placeholder names with the real categorical columns)
cols = ['cat1', 'cat2', 'cat3']
ds[cols] = ds[cols].astype({name: 'category' for name in cols})

In [None]:
# Summary statistics of the numerical attributes, one row per attribute
ds.describe().transpose()

In [None]:
# Describe categorical attributes: count, number of unique values, most
# frequent value (top) and its frequency (freq) for each category-typed column
ds.describe(include=["category"])

## Data Cleaning

### Drop NaN and duplicates

In [None]:
# Remove every row that contains a missing value, then every repeated row
ds = ds.dropna().drop_duplicates()

### Imputation

In [None]:
# Imputation: fill numeric columns with their mean, then fill whatever is still
# missing (e.g. categorical columns) with the column mode.
# Swap mean() for median() when a numeric column is heavily skewed.
# numeric_only=True keeps mean() from failing on non-numeric columns.
ds = ds.fillna(ds.mean(numeric_only=True))
# mode() returns a DataFrame (ties produce several rows); take its first row so
# fillna matches one value per column instead of aligning on the row index,
# which would only fill the rows whose labels appear in the mode frame
ds = ds.fillna(ds.mode().iloc[0])

### Treatment of Outliers

In [None]:
# One way to treat potential outliers is to clip them to the Tukey fences:
# values below Q1 - k*IQR or above Q3 + k*IQR are replaced by the nearest fence
# (values are capped, no rows are removed)
def IQR_clipping(x, k=1.5):
    """Clip the values of ``x`` to the range [Q1 - k*IQR, Q3 + k*IQR].

    Q1 and Q3 are the 25% and 75% quantiles of ``x`` and IQR = Q3 - Q1.

    Args:
        x: Object exposing ``quantile`` and ``clip``, e.g. a numeric pandas
            Series.
        k: Multiple of the IQR that sets how far the fences sit from the
            quartiles. The default of 1.5 reproduces the previous hard-coded
            behaviour; ``k=0`` clips to the interquartile range itself.

    Returns:
        The result of ``x.clip`` with the two fences; ``x`` itself is not
        modified.
    """
    q1 = x.quantile(.25)
    q3 = x.quantile(.75)
    iqr = q3 - q1

    lower_fence = q1 - k * iqr
    upper_fence = q3 + k * iqr
    return x.clip(lower_fence, upper_fence)

## EDA

### Imports for Data Visualization

In [None]:
# seaborn and matplotlib for data visualization
# Reference list of seaborn plots: https://seaborn.pydata.org/api.html, gallery: https://seaborn.pydata.org/examples/index.html
import seaborn as sns
import matplotlib.pyplot as plt

### Plotting univariate distributions

In [None]:
# Histogram of one numerical column
sns.histplot(data=ds, x="num1")

In [None]:
# Figure-level distribution plot of one column (a histogram by default)
sns.displot(data=ds, x="num")

In [None]:
# Boxplot for numerical data, one box per category
# (assumes `num` is a numerical column and `cat` a categorical one)
sns.boxplot(data=ds, x="num", y="cat")

In [None]:
# Strip plot of a single column: one point per observation, jittered so that
# overlapping points spread out (add y="cat" to split the points by category)
sns.stripplot(data=ds, x='num', jitter=True);

In [None]:
# Barplot and pointplot for numerical data grouped by a categorical attribute.
# Each plot gets its own Axes: both functions draw on the current Axes by
# default, so calling them back to back would stack them on top of each other
fig, (bar_ax, point_ax) = plt.subplots(1, 2, figsize=(12, 4))
sns.barplot(data=ds, x="cat", y="num", ax=bar_ax)
sns.pointplot(data=ds, x="cat", y="num", ax=point_ax)

### Plotting bivariate distributions

In [None]:
# Joint distribution of two numerical columns, drawn once per representation:
# scatter (the jointplot default), hexagonal bins and kernel density estimate
for joint_kind in ("scatter", "hex", "kde"):
    sns.jointplot(data=ds, x='num1', y='num2', kind=joint_kind)

In [None]:
# Plotting pairwise bivariate distributions for the selected columns:
# one panel per pair of columns, univariate distributions on the diagonal
sns.pairplot(ds[['num1', 'num2', 'num3']]);

In [None]:
# Linear relationship between two variables: scatter plot of num2 against num1
# with a fitted regression line
sns.lmplot(data=ds, x="num1", y="num2")

In [None]:
# Heatmap of the pairwise correlations between the numeric columns.
# Non-numeric columns (e.g. the ones converted to category) are excluded first,
# because corr() cannot handle them and raises on recent pandas versions
ds_corr = ds.select_dtypes(include="number").corr()
sns.heatmap(ds_corr, annot=True)