# Exploratory Data Analysis

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/darenasc/eda/blob/main/notebooks/eda_dkuk_sds.ipynb)

Notebook prepared for the DataKindUK SDS meeting on 11.12.2023.

In [None]:
# Uncomment and run the following line if you are using Colab
# !pip install pandas ydata-profiling sweetviz pygwalker openml

In [None]:
import pandas as pd
import pygwalker as pyg
import sweetviz as sv
from openml.datasets import get_dataset
from ydata_profiling import ProfileReport

In [None]:
# Dataset page on OpenML: https://www.openml.org/d/40945
titanic_dataset_id = 40945
dataset = get_dataset(titanic_dataset_id)

In [None]:
# Show the name and URL of the dataset that was fetched.
dataset_summary = f"{dataset.name}, {dataset.url}"
print(dataset_summary)

In [None]:
# get_data() returns several values; only the first one is kept as the DataFrame.
df_titanic = dataset.get_data()[0]

## pandas

In [None]:
# Preview the first five rows (five is also head()'s default).
df_titanic.head(n=5)

In [None]:
# Preview the last five rows (five is also tail()'s default).
df_titanic.tail(n=5)

In [None]:
# Dimensions of the table as a (rows, columns) tuple.
df_titanic.shape

In [None]:
# Print a concise summary of the frame: each column's name, non-null count and
# dtype, plus the memory usage.
df_titanic.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max). With the default
# arguments a frame that has numeric columns is summarised on those columns only.
df_titanic.describe()

In [None]:
# Draw the histograms of the frame's columns on a 10 x 10 inch figure.
figure_size = (10, 10)
df_titanic.hist(figsize=figure_size)

In [None]:
# Box plot of a single column. Other views to try:
#   df_titanic.boxplot()            -> the whole frame
#   df_titanic[["fare"]].boxplot()  -> fare only
age_only = df_titanic[["age"]]
age_only.boxplot()

### pandas html

In [None]:
# Collect data from the HTML tables of a web page using pandas.
# NOTE(review): the macOS steps below look like the remedy for SSL certificate
# errors raised while downloading the page - confirm.
#   1. Press "command + space" (or open Spotlight).
#   2. Type "Install Certificates.command".
url = "https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)"
dfs = pd.read_html(url)

# Preview the first table that was parsed (swap in .tail() to see its last rows).
first_table = dfs[0]
first_table.head()

In [None]:
# Plot the first and the last `n_countries` entries of the population table.
# Quick alternative for the first chart: dfs[0].head().plot.bar(x='Country / Area')
n_countries = 5
population_table = dfs[0]

# Horizontal bars for the first rows, in the order the table was parsed.
top_rows = population_table.head(n_countries)
top_rows.plot(
    kind="barh",
    x="Country / Area",
    title=f"Top {n_countries} countries by population",
)

# Leave out the final row, order by the 1 July 2023 population (largest first)
# and keep the last rows, i.e. the smallest populations.
# NOTE(review): the dropped row is presumably not a country - confirm against the table.
ranked = population_table[:-1].sort_values(
    by=["Population (1 July 2023)"], ascending=False
)
ranked.tail(n_countries).plot(
    kind="bar",
    x="Country / Area",
    title=f"Bottom {n_countries} countries by population",
)

In [None]:
# Export to CSV.
# to_csv() does not create missing folders, so make sure the target directory
# exists first; otherwise the export fails in a fresh environment where
# ../data has not been created yet.
from pathlib import Path

csv_path = Path("../data/titanic.csv")
csv_path.parent.mkdir(parents=True, exist_ok=True)
df_titanic.to_csv(csv_path, index=False)

## SandDance (vscode extension)

Right-click a CSV file in VS Code and select "View in SandDance".

## ydata-profiling

In [None]:
# Build the ydata-profiling report for the Titanic frame and embed it in the notebook.
report_title = "Profiling Report"
profile = ProfileReport(df_titanic, title=report_title)
profile.to_notebook_iframe()

## sweetviz

In [None]:
# Analyse every column of the Titanic frame with sweetviz.
my_report = sv.analyze(df_titanic)
# Render the report as HTML; with the default arguments it is written to
# "SWEETVIZ_REPORT.html".
my_report.show_html()

In [None]:
# Create a boolean target column: True where "survived" equals the string "1".
# A single vectorised comparison replaces the row-by-row iterrows() loop. It
# gives the same truth values (anything other than "1", including missing
# values, becomes False) and stores them with a real bool dtype instead of an
# object column filled one cell at a time.
df_titanic["survived_bool"] = df_titanic["survived"] == "1"

In [None]:
# Repeat the sweetviz analysis, this time relating every feature to the boolean
# survival column, and render the report as HTML.
target_column = "survived_bool"
my_report = sv.analyze(df_titanic, target_feat=target_column)
my_report.show_html()

## pygwalker

In [None]:
# Open the pygwalker interactive exploration UI on the Titanic frame; the
# returned object is kept in `walker`.
walker = pyg.walk(df_titanic)