# Exploratory Data Analysis (EDA) and Visualization

In [1]:
import warnings

import numpy as np
import pandas as pd
import polars as pl
import polars.selectors as cs
from rich.console import Console
from rich.theme import Theme

# Named colour styles; each key can be passed as `style=` to `console.print`.
custom_theme = Theme(
    styles=dict(
        white="#FFFFFF",  # bright white
        info="#00FF00",  # bright green
        warning="#FFD700",  # bright gold
        error="#FF1493",  # deep pink
        success="#00FFFF",  # cyan
        highlight="#FF4500",  # orange-red
    )
)
# Shared console used by the cells below for themed status messages.
console = Console(theme=custom_theme)

# Visualization
# import matplotlib.pyplot as plt

# NumPy: print array values with 4 digits of precision
np.set_printoptions(precision=4)

# Pandas: raise the display limits so long/wide frames and long cell
# values are shown instead of being truncated
pd.set_option("display.max_rows", 1_000)
pd.set_option("display.max_columns", 1_000)
pd.set_option("display.max_colwidth", 600)

# Polars: widen the string-length, column and row limits used when rendering tables
pl.Config.set_fmt_str_lengths(n=1_000)
pl.Config.set_tbl_cols(1_000)
pl.Config.set_tbl_rows(200)

# Silence every warning for the remainder of the session
warnings.filterwarnings(action="ignore")

# Black code formatter (Optional)
%load_ext lab_black

# auto reload imports
%load_ext autoreload
%autoreload 2

In [2]:
def go_up_from_current_directory(*, go_up: int = 1) -> None:
    """Put an ancestor of the current working directory at the front of ``sys.path``.

    Params:
    -------
    go_up: int, default=1
        Number of directory levels to climb from the current working directory.
        A value of 0 (or less) resolves to the current working directory itself.

    Returns:
    --------
    None
        The resolved absolute path is printed and placed at ``sys.path[0]``.
    """
    import os
    import sys

    # Anchor on the working directory explicitly rather than deriving it from
    # `__name__`, which is a module name and not a file-system path.
    parents = [os.pardir] * go_up
    target_dir = os.path.abspath(os.path.join(os.getcwd(), *parents))

    # Re-running the cell must not pile up duplicate entries: drop any earlier
    # occurrence, then give the directory top priority for imports.
    sys.path[:] = [entry for entry in sys.path if entry != target_dir]
    sys.path.insert(0, target_dir)
    print(target_dir)

In [3]:
# Location of the parquet file holding the bike data. The path is relative to the
# directory the kernel runs in and climbs four levels up before descending again.
# NOTE(review): this appears to point outside the repository, so it will not
# resolve on another machine — consider a configurable data directory.
fp: str = "../../../../Documents/data_dump/bike_data/database.parquet"
data: pl.DataFrame = pl.read_parquet(fp)
console.print(f"Shape: {data.shape}", style="info")

# Preview the first rows (rendered as the cell output)
data.head()

datetime,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
str,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,i64,i64,i64
"""2011-01-01 00:00:00""",1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
"""2011-01-01 01:00:00""",1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
"""2011-01-01 02:00:00""",1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
"""2011-01-01 03:00:00""",1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
"""2011-01-01 04:00:00""",1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [4]:
# The parent directory is put on sys.path first; the import of the project-local
# `src` package below relies on that, which is why it sits here rather than in
# the top import cell (hence the E402 suppression).
go_up_from_current_directory(go_up=1)
from src.utilities.exploratory_analysis import ExploratoryDataAnalysis  # noqa: E402

/Users/mac/Desktop/Projects/Bike-Rental-Prediction


### Perform EDA Using a Dummy Dataset

In [5]:
# Synthetic dataset built from a seeded generator so the summary is reproducible
rng = np.random.default_rng(seed=42)
dummy_data = pd.DataFrame(
    data={
        "age": rng.normal(loc=35, scale=10, size=1000),
        "salary": rng.exponential(scale=50000, size=1000),
        "score": rng.uniform(low=0, high=100, size=1000),
        "department": rng.choice(["Sales", "Engineering", "Marketing"], size=1000),
        "experience": rng.choice(
            ["Junior", "Mid", "Senior"], size=1000, p=[0.4, 0.4, 0.2]
        ),
        "target": rng.normal(loc=75, scale=15, size=1000),
    }
)

# Wrap the frame in the EDA helper with `target` as the target column
eda = ExploratoryDataAnalysis(data=dummy_data, target_column="target")

# Show the textual summary
eda.print_summary()

🚀 EXPLORATORY DATA ANALYSIS SUMMARY
* Dataset Shape: (1000, 6)
* Total Rows: 1000
* Total Columns: 6
* Numeric Columns: 4
* Categorical Columns: 2
* Boolean Columns: 0
* Total Missing Values: 0
* Memory Usage: 0.14  MB
* Target Column: target

* Numeric Columns:
  - age
  - salary
  - score
  - target

* Categorical Columns:
  - department
  - experience


 📈 Numeric Statistics:
------------------------


Unnamed: 0,column,unique_values,mean,median,mode,std,variance,range,iqr_value,min,max,skewness,kurtosis,outlier_series_iqr,outlier_count_iqr,outlier_series_zscore,outlier_count_zscore,total_count,missing_values,missing_pct
0,age,1000,34.71,35.06,"[-1.484128252147836, 5.354711621583488, 5.694056241472243, 8.27165566897062, 9.333415590687025]",9.89,97.86,68.27,12.85,-1.484128,66.788537,-0.04,0.09,"[64.13862466007329, 64.05067169240407, 60.97673726595832, 5.354711621583488, 60.493279526070026, 8.27165566897062, 66.78853679367535, 5.694056241472243, -1.484128252147836]",9,"[66.78853679367535, -1.484128252147836]",2,1000,0,0.0
1,salary,1000,50779.11,35946.37,"[3.5613078892469914, 24.211398874060155, 69.04754231018997, 217.9790962482941, 258.1741230171644]",51331.62,2634935000.0,380729.73,52288.49,3.561308,380733.295997,2.19,6.77,"[146705.34178597206, 154204.84161484925, 153740.71609812017, 147904.47759525033, 151917.71375841933, 253167.77770065912, 157269.89928487997, 360298.49674193806, 170999.81743184445, 170698.77331384047, 210331.31450193742, 214676.20195246505, 200758.77227751815, 273001.95058850385, 164759.1984695182, 206805.300670859, 185861.67573001108, 204996.64619461817, 166887.37012708353, 159640.26905626358, 197891.03854546923, 170904.54196516602, 216698.2336669214, 175121.85537360838, 182697.56183844653, 150653.01143792793, 158918.01517343533, 167692.78751424476, 154574.31599516908, 194282.12401564815,...",59,"[253167.77770065912, 360298.49674193806, 210331.31450193742, 214676.20195246505, 273001.95058850385, 206805.300670859, 204996.64619461817, 216698.2336669214, 232429.8406485817, 249012.06535477925, 231430.4333272335, 311165.03347140463, 380733.2959969836, 231695.56192048496, 222590.82524600177, 298551.2089986543, 330203.9373396617, 311933.27316453395, 240566.13634938814, 236585.0466611581]",20,1000,0,0.0
2,score,1000,49.57,49.89,"[0.09899939263861013, 0.27072736757871585, 0.38941463811326127, 0.4857337777279036, 0.49849652944728984]",28.78,828.26,99.88,50.14,0.098999,99.976534,0.02,-1.19,[],0,[],0,1000,0,0.0
3,target,1000,74.83,75.01,"[27.84500546962884, 28.342679023209936, 28.904545937680503, 31.95569962717565, 32.81479509740968]",15.03,226.0,95.77,20.47,27.845005,123.618886,-0.05,0.04,"[28.904545937680503, 32.81479509740968, 115.96933063739327, 27.84500546962884, 117.86209613197059, 28.342679023209936, 31.95569962717565, 119.94358643097215, 123.61888615465179]",9,"[28.904545937680503, 27.84500546962884, 28.342679023209936, 119.94358643097215, 123.61888615465179]",5,1000,0,0.0



 📈 Categorical Statistics:
----------------------------


Unnamed: 0,column,total_count,unique_values,entropy,value_counts,missing_values,missing_pct
0,department,1000,3,0.48,"[[Sales, 367], [Marketing, 319], [Engineering, 314]]",0,0.0
1,experience,1000,3,0.45,"[[Junior, 415], [Mid, 398], [Senior, 187]]",0,0.0


In [6]:
# Render the helper's plots for the dummy dataset
# (the exact set of figures is defined by `display_all_plots` — not visible here)
eda.display_all_plots()

## EDA on Original Dataset

In [7]:
# Names of the columns in the bike data that have a numeric dtype
numerical_cols = data.select(cs.numeric()).columns

# Report the names and how many there are
console.print(f"Numerical columns: {numerical_cols}", style="info")
console.print(f"Total numerical columns: {len(numerical_cols)}", style="info")

In [8]:
# Only columns stored as strings are picked up here; integer-coded fields
# are not included by the string selector
categorical_cols = data.select(cs.string()).columns

# Report the names and how many there are
console.print(f"Categorical columns: {categorical_cols}", style="info")
console.print(f"Total categorical columns: {len(categorical_cols)}", style="info")

In [10]:
# Figure dimensions forwarded to plotting calls via **kwargs
# (units presumably pixels — confirm against the plotting helper)
kwargs = {"height": 900, "width": 1500}
# EDA helper over the data as loaded, with `cnt` as the target column
data_explorer = ExploratoryDataAnalysis(data=data, target_column="cnt")

# Print the textual summary of the dataset
data_explorer.print_summary()

🚀 EXPLORATORY DATA ANALYSIS SUMMARY
* Dataset Shape: (13903, 16)
* Total Rows: 13903
* Total Columns: 16
* Numeric Columns: 15
* Categorical Columns: 1
* Boolean Columns: 0
* Total Missing Values: 0
* Memory Usage: 1.84  MB
* Target Column: cnt

* Numeric Columns:
  - season
  - yr
  - mnth
  - hr
  - holiday
  - weekday
  - workingday
  - weathersit
  - temp
  - atemp
  - hum
  - windspeed
  - casual
  - registered
  - cnt

* Categorical Columns:
  - datetime


 📈 Numeric Statistics:
------------------------


column,unique_values,mean,median,mode,std,variance,range,iqr_value,min,max,skewness,kurtosis,outlier_series_iqr,outlier_count_iqr,outlier_series_zscore,outlier_count_zscore,total_count,missing_values,missing_pct
str,i64,f64,f64,list[f64],f64,f64,f64,f64,f64,f64,f64,f64,list[f64],i64,list[f64],i64,i64,i64,f64
"""season""",4,2.26,2.0,[2.0],1.04,1.07,3.0,2.0,1.0,4.0,0.28,-1.09,[],0,[],0,13903,0,0.0
"""yr""",2,0.38,0.0,[0.0],0.48,0.24,1.0,1.0,0.0,1.0,0.5,-1.75,[],0,[],0,13903,0,0.0
"""mnth""",12,5.65,5.0,"[7.0, 5.0]",3.22,10.36,11.0,5.0,1.0,12.0,0.35,-0.87,[],0,[],0,13903,0,0.0
"""hr""",24,11.55,12.0,"[16.0, 17.0]",6.91,47.78,23.0,12.0,0.0,23.0,-0.01,-1.2,[],0,[],0,13903,0,0.0
"""holiday""",2,0.03,0.0,[0.0],0.16,0.03,1.0,0.0,0.0,1.0,5.79,31.52,"[1.0, 1.0, … 1.0]",381,"[1.0, 1.0, … 1.0]",381,13903,0,0.0
"""weekday""",7,3.0,3.0,[6.0],2.01,4.02,6.0,4.0,0.0,6.0,0.0,-1.26,[],0,[],0,13903,0,0.0
"""workingday""",2,0.68,1.0,[1.0],0.46,0.22,1.0,1.0,0.0,1.0,-0.79,-1.37,[],0,[],0,13903,0,0.0
"""weathersit""",4,1.42,1.0,[1.0],0.64,0.41,3.0,1.0,1.0,4.0,1.27,0.42,"[4.0, 4.0, 4.0]",3,"[4.0, 4.0, 4.0]",3,13903,0,0.0
"""temp""",50,0.5,0.5,[0.62],0.2,0.04,0.98,0.32,0.02,1.0,-0.03,-0.94,[],0,[],0,13903,0,0.0
"""atemp""",65,0.48,0.48,[0.6212],0.18,0.03,1.0,0.29,0.0,1.0,-0.12,-0.83,[],0,[],0,13903,0,0.0



 📈 Categorical Statistics:
----------------------------


column,total_count,unique_values,entropy,value_counts,missing_values,missing_pct
str,i64,i64,f64,list[list[str]],i64,f64
"""datetime""",13903,13903,4.14,"[[""2011-01-01 00:00:00"", ""1""], [""2011-01-01 01:00:00"", ""1""], … [""2012-08-07 11:00:00"", ""1""]]",0,0.0


In [11]:
# Correlation heatmap using the Spearman method, drawn with the height/width from `kwargs`
data_explorer.plot_correlation_heatmap(method="spearman", **kwargs)

In [12]:
# Correlation against the target column (`cnt`, set when `data_explorer` was created)
data_explorer.plot_correlation_with_target()

In [13]:
# Histograms of the numeric columns; at this point the integer-coded fields
# (season, hr, ...) are still numeric and are therefore included
data_explorer.plot_numeric_distribution(plot_type="histogram")

### Convert The Data To The Appropriate Types For Further Analysis

In [14]:
# Figure dimensions reused by the plotting cells
kwargs = dict(height=900, width=1500)

# The integer-coded calendar/weather fields are categories rather than quantities:
# cast them to strings so the EDA helper lists them among the categorical columns.
data_copy: pl.DataFrame = data.clone().with_columns(
    pl.col(
        "season",
        "hr",
        "yr",
        "mnth",
        "holiday",
        "weekday",
        "workingday",
        "weathersit",
    ).cast(pl.Utf8)
)
data_explorer = ExploratoryDataAnalysis(data=data_copy, target_column="cnt")

# Print summary
data_explorer.print_summary()

🚀 EXPLORATORY DATA ANALYSIS SUMMARY
* Dataset Shape: (13903, 16)
* Total Rows: 13903
* Total Columns: 16
* Numeric Columns: 7
* Categorical Columns: 9
* Boolean Columns: 0
* Total Missing Values: 0
* Memory Usage: 1.11  MB
* Target Column: cnt

* Numeric Columns:
  - temp
  - atemp
  - hum
  - windspeed
  - casual
  - registered
  - cnt

* Categorical Columns:
  - datetime
  - season
  - yr
  - mnth
  - hr
  - holiday
  - weekday
  - workingday
  - weathersit


 📈 Numeric Statistics:
------------------------


column,unique_values,mean,median,mode,std,variance,range,iqr_value,min,max,skewness,kurtosis,outlier_series_iqr,outlier_count_iqr,outlier_series_zscore,outlier_count_zscore,total_count,missing_values,missing_pct
str,i64,f64,f64,list[f64],f64,f64,f64,f64,f64,f64,f64,f64,list[f64],i64,list[f64],i64,i64,i64,f64
"""temp""",50,0.5,0.5,[0.62],0.2,0.04,0.98,0.32,0.02,1.0,-0.03,-0.94,[],0,[],0,13903,0,0.0
"""atemp""",65,0.48,0.48,[0.6212],0.18,0.03,1.0,0.29,0.0,1.0,-0.12,-0.83,[],0,[],0,13903,0,0.0
"""hum""",88,0.62,0.62,[0.88],0.2,0.04,1.0,0.32,0.0,1.0,-0.1,-0.85,[],0,"[0.0, 0.0, … 0.0]",22,13903,0,0.0
"""windspeed""",30,0.19,0.19,[0.0],0.12,0.02,0.85,0.18,0.0,0.8507,0.58,0.62,"[0.5821, 0.5821, … 0.6119]",91,"[0.5821, 0.5821, … 0.6119]",91,13903,0,0.0
"""casual""",303,34.04,15.0,[0.0],47.34,2241.19,367.0,42.0,0.0,367.0,2.51,7.73,"[144.0, 149.0, … 112.0]",984,"[219.0, 240.0, … 208.0]",375,13903,0,0.0
"""registered""",701,140.6,107.0,[4.0],137.51,18909.65,796.0,171.0,0.0,796.0,1.56,2.8,"[463.0, 486.0, … 664.0]",561,"[555.0, 567.0, … 664.0]",264,13903,0,0.0
"""cnt""",789,174.64,130.0,[5.0],166.96,27874.11,956.0,221.0,1.0,957.0,1.28,1.46,"[598.0, 611.0, … 705.0]",396,"[712.0, 676.0, … 705.0]",189,13903,0,0.0



 📈 Categorical Statistics:
----------------------------


column,total_count,unique_values,entropy,value_counts,missing_values,missing_pct
str,i64,i64,f64,list[list[str]],i64,f64
"""datetime""",13903,13903,4.14,"[[""2011-01-01 00:00:00"", ""1""], [""2011-01-01 01:00:00"", ""1""], … [""2012-08-07 11:00:00"", ""1""]]",0,0.0
"""season""",13903,4,0.59,"[[""2"", ""4409""], [""1"", ""3980""], … [""4"", ""2134""]]",0,0.0
"""yr""",13903,2,0.29,"[[""0"", ""8645""], [""1"", ""5258""]]",0,0.0
"""mnth""",13903,12,1.06,"[[""5"", ""1488""], [""7"", ""1488""], … [""9"", ""717""]]",0,0.0
"""hr""",13903,24,1.38,"[[""16"", ""584""], [""17"", ""584""], … [""4"", ""554""]]",0,0.0
"""holiday""",13903,2,0.05,"[[""0"", ""13522""], [""1"", ""381""]]",0,0.0
"""weekday""",13903,7,0.85,"[[""6"", ""2008""], [""1"", ""1999""], … [""4"", ""1969""]]",0,0.0
"""workingday""",13903,2,0.27,"[[""1"", ""9516""], [""0"", ""4387""]]",0,0.0
"""weathersit""",13903,4,0.36,"[[""1"", ""9251""], [""2"", ""3468""], … [""4"", ""3""]]",0,0.0


In [15]:
# Histograms again, now for the re-typed data where only the continuous
# measurements and the counts remain numeric
data_explorer.plot_numeric_distribution(plot_type="histogram")

In [16]:
# Same numeric columns drawn as box plots
data_explorer.plot_numeric_distribution(plot_type="box")

In [17]:
# Bar charts for the categorical columns of the re-typed data
data_explorer.plot_categorical_distribution(plot_type="bar")

### Relationship With The Target Variable

In [18]:
# Per-hour summary of `cnt`, ordered from the lowest to the highest median demand.
# `hr` is a string column at this point, so it is cast back to an integer first.
# A single multi-key sort is used: chaining `.sort("hr").sort("cnt_median")` does
# not guarantee that the `hr` order survives the second sort, so `hr` is given
# explicitly as the tie-breaker instead.
data_explorer.group_analysis(groupby="hr", numeric_cols=["cnt"]).with_columns(
    pl.col("hr").cast(pl.Int8)
).sort("cnt_median", "hr")

hr,cnt_count,cnt_mean,cnt_median,cnt_std,cnt_min,cnt_max
i8,u32,f64,f64,f64,i64,i64
4,554,5.77,5.0,3.89,1,23
3,556,11.12,6.0,12.48,1,66
2,571,21.51,11.0,24.79,1,122
5,573,17.7,17.0,11.55,1,59
1,580,30.84,18.0,30.65,1,168
0,581,50.06,38.0,38.93,2,206
6,581,69.8,71.0,50.58,1,213
23,582,82.15,73.0,48.93,2,256
22,582,124.82,119.5,68.85,9,502
10,583,159.26,135.0,94.51,10,494


In [19]:
# Bar chart of `cnt` grouped by hour of day
# (`sortby` presumably controls the bar order — confirm against the helper)
data_explorer.plot_group_analysis(
    groupby="hr",
    numeric_col="cnt",
    sortby="cnt",
    plot_type="bar",
)

In [20]:
# Scatter of `cnt` against `temp`, requesting an "ols" trendline from the helper
data_explorer.plot_scatter(x_col="temp", y_col="cnt", trendline="ols")

In [21]:
# Bar chart of `cnt` grouped by weather situation
# (`sortby` presumably controls the bar order — confirm against the helper)
data_explorer.plot_group_analysis(
    groupby="weathersit",
    numeric_col="cnt",
    sortby="cnt",
    plot_type="bar",
)

<br><hr><hr>

## Observations

<br>

### Data Information
---

- **Datetime**: The data contains the date and time information in an **hourly** fashion.
- **Target variable**: The target variable is `cnt`.
- **Missing records**: The data has no missing records.

### Distributions
---

- `season`, `yr`, `hr`, `mnth`, `holiday`, `weekday`, `workingday`, `weathersit` are actually categorical variables.
- The target variable `cnt` is positively skewed with a very long tail.
- There are more records for `season`=2 and `season`=1 (summer and spring) than for `season`=3 and `season`=4 (fall and winter).
- The ratio of 2011 to 2012 is roughly **3:2**.
- There are fewer records for months `8` to `12` (Aug to Dec) than for the rest of the year.
- There's obviously more non-holidays than holidays.
- There's also more working days (non-weekends and non-holidays) than non-working days.
- There are more records with `weathersit`=1 (clear) than with mist, light rain, or heavy rain (`weathersit`=2, 3, 4).

### Correlation (This is NOT Causation)
---

- `season` and `mnth` are highly correlated (~0.88). They're also mildly correlated with temperature (`temp` and `atemp`).
- There's a negative correlation between `hr` and `hum`. 
- There's also a negative correlation between `holiday` and `workingday`.
- Temperature (`temp` and `atemp`) are ~100% correlated. `windspeed` and `hum` are negatively correlated with temperature.
- The target variable `cnt` is positively correlated with `hr` and temperature (`temp`, `atemp`), and negatively correlated with `hum`.

### Relationships With The Target (`cnt`)
---

- There's a high demand for bikes between hours `15` and `20`. Demand is the highest at `16` and `17`.
- There's also a high demand between `7` to `9` in the morning.
- There's a very low demand in the early hours of the day, between hours `0` and `5`.
- As expected, demand for bikes is higher when `weathersit`=1 or `weathersit`=2, i.e. when the weather is clear or only misty.

<br><hr><hr><br>
