# Exploratory Data Analysis and Cleaning

## Importing packages

In [1]:
import polars as pl
from project_d100.data import load_data, summary, extract_day, denormalize

In [2]:
# Table rendering options for the whole notebook: wide tables, every column
# shown (a negative column count means "all"), and at most 10 rows per frame.
(
    pl.Config
    .set_tbl_width_chars(200)
    .set_tbl_cols(-1)
    .set_tbl_rows(10)
)

polars.config.Config

## EDA: Part 1

In [3]:
# Load the dataset through the project's loader and preview the first five rows
# (five is head()'s default row count).
df = load_data()
df.head()

instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
i64,str,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,i64,i64,i64
1,"""2011-01-01""",1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
2,"""2011-01-01""",1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
3,"""2011-01-01""",1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
4,"""2011-01-01""",1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
5,"""2011-01-01""",1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [4]:
# Print an overview of the frame: its shape, the column dtypes and a statistical summary.
summary(df)


Data Shape (rows, columns)
(17379, 17)

Data Types of Columns
[Int64, String, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Int64, Float64, Float64, Float64, Float64, Int64, Int64, Int64]

Statistical Summary
shape: (9, 18)
┌───────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┬──────────┐
│ statistic ┆ instant  ┆ dteday   ┆ season   ┆ yr       ┆ mnth     ┆ hr       ┆ holiday  ┆ weekday  ┆ workingd ┆ weathers ┆ temp     ┆ atemp    ┆ hum      ┆ windspee ┆ casual   ┆ register ┆ cnt      │
│ ---       ┆ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ---      ┆ ay       ┆ it       ┆ ---      ┆ ---      ┆ ---      ┆ d        ┆ ---      ┆ ed       ┆ ---      │
│ str       ┆ f64      ┆ str      ┆ f64      ┆ f64      ┆ f64      ┆ f64      ┆ f64      ┆ f64      ┆ ---      ┆ ---      ┆ f64      ┆ f64      ┆ f64      ┆ ---      ┆ f

instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
i64,str,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64,i64,i64,i64
1,"""2011-01-01""",1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
2,"""2011-01-01""",1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
3,"""2011-01-01""",1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
4,"""2011-01-01""",1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
5,"""2011-01-01""",1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
17375,"""2012-12-31""",1,1,12,19,0,1,1,2,0.26,0.2576,0.6,0.1642,11,108,119
17376,"""2012-12-31""",1,1,12,20,0,1,1,2,0.26,0.2576,0.6,0.1642,8,81,89
17377,"""2012-12-31""",1,1,12,21,0,1,1,1,0.26,0.2576,0.6,0.1642,7,83,90
17378,"""2012-12-31""",1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61


## TODO from summary(df):
- `season`, `yr`, and `mnth` were already extracted from `dteday` (so I extract the day, then drop `dteday`)
- The following features need to be turned into categoricals: `season`, `yr`, `mnth`, `hr`, `holiday`, `weekday`, `workingday`, `weathersit`
- De-normalise the data for the following features: `temp` (41), `atemp` (50), `hum` (100), `windspeed` (67)
    - According to Readme.txt, the data were normalised by dividing by the max values (in the parentheses above), so multiplying by those values restores the original scale


In [5]:
# Derive the day from the "dteday" date string; per the TODO notes, dteday is then dropped.
# NOTE(review): df is reassigned here, so re-running this cell on the already-processed
# frame presumably fails because "dteday" no longer exists — TODO confirm.
df = extract_day(df, "dteday")

In [6]:
# Scaling factors for the normalised weather features: per the dataset's Readme.txt,
# each column was divided by the maximum value listed here.
normalized_cols = {
    "temp": 41,
    "atemp": 50,
    "hum": 100,
    "windspeed": 67,
}

# Undo the normalisation one column at a time, feeding each result into the next call.
# NOTE(review): df is overwritten, so re-running this cell would presumably rescale an
# already de-normalised frame — TODO confirm and run only once per fresh load.
for col, value in normalized_cols.items():
    df = denormalize(df, col, value)

## EDA: Part 2