# Data Validation

This notebook will be used for data validation.
1. Pre-processing data (e.g., scale and split into train & test)
2. Validate data (creating a schema)

## 1. Pre-processing data
First, we import the file with permission. Then, we study the raw data.

In [None]:
import os

import numpy as np
import pandas as pd
import pandera as pa
from pandera.typing import DataFrame
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
file_path = '../data/ttc-bus-delay-data-2024.csv'

# Fail fast: every later cell reads `ttc`, so a missing or wrongly-typed file
# must stop the notebook here rather than surface later as a NameError.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"Error: File is missing: {file_path}")
if not file_path.endswith('.csv'):
    raise ValueError(f"Error: File is not in CSV format: {file_path}")

try:
    # Load the CSV file and parse dates
    ttc = pd.read_csv(file_path, parse_dates=['Date'])
except Exception as e:
    # Keep the short message for the reader, then re-raise so the run halts.
    print(f"Error loading file: {e}")
    raise

print("File loaded successfully!")

File loaded successfully!


  ttc = pd.read_csv(file_path, parse_dates=['Date'])


In [3]:
# The previous cell already loaded `ttc` from the same CSV with the same
# options, so preview that frame instead of parsing the file a second time.
ttc.head()

  ttc = pd.read_csv('../data/ttc-bus-delay-data-2024.csv', parse_dates=['Date'])


Unnamed: 0,Date,Route,Time,Day,Location,Incident,Min Delay,Min Gap,Direction,Vehicle
0,2024-01-01,89,02:08,Monday,KEELE AND GLENLAKE,Vision,10,20,N,7107
1,2024-01-01,39,02:30,Monday,FINCH STATION,General Delay,20,40,,8914
2,2024-01-01,300,03:13,Monday,BLOOR AND MANNING,General Delay,0,0,,8562
3,2024-01-01,65,03:23,Monday,PARLIAMENT AND BLOOR,Security,0,0,N,8574
4,2024-01-01,113,03:37,Monday,MAIN STATION,Security,0,0,,8541


In [4]:
# Number of rows in the raw data that have no Route value
ttc['Route'].isna().sum()

np.int64(439)

In [5]:
# Distinct values recorded in the Direction column (including NaN)
pd.unique(ttc['Direction'])

array(['N', nan, 'S', 'W', 'E', '\\', 'B', 'G', '9', '0', '2', '7', '8'],
      dtype=object)

In [6]:
ttc1 = ttc.copy()

# Time is stored as "HH:MM" text; an explicit format avoids the
# format-inference warning and makes unexpected values fail loudly.
time_parsed = pd.to_datetime(ttc['Time'], format='%H:%M')

# Date_ is the calendar day at midnight; Month and Hour are derived features.
ttc1['Date_'] = ttc1['Date'].dt.normalize()
ttc1['Month'] = ttc1['Date'].dt.month.astype("int64")
ttc1['Hour'] = time_parsed.dt.hour
ttc1 = ttc1.drop(columns=['Date', 'Time'])

# final dataset
ttc_clean = ttc1.drop(columns=['Direction', 'Vehicle']).dropna()
# .dt.hour is not int64; cast once the NaN rows are gone so the column
# matches the schema's `int` dtype check.
ttc_clean = ttc_clean.astype({'Hour': 'int64'})
ttc_clean.isna().sum()

  ttc1['Time'] = pd.to_datetime(ttc['Time']).dt.time


Route        0
Day          0
Location     0
Incident     0
Min Delay    0
Min Gap      0
Date_        0
Month        0
Hour         0
dtype: int64

In [7]:
# Show column dtypes and non-null counts for the cleaned frame.
ttc_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 44861 entries, 0 to 45299
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Route      44861 non-null  object        
 1   Day        44861 non-null  object        
 2   Location   44861 non-null  object        
 3   Incident   44861 non-null  object        
 4   Min Delay  44861 non-null  int64         
 5   Min Gap    44861 non-null  int64         
 6   Date_      44861 non-null  datetime64[ns]
 7   Month      44861 non-null  int64         
 8   Hour       44861 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(4)
memory usage: 3.4+ MB


## 2. Validation Schema
Now we will create a schema covering all the columns, followed by a few additional validations wrapped in a function.

In [8]:
# validate data
# Allowed category levels, named once so each check and its message agree.
valid_days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
valid_incidents = [
    "Cleaning - Unsanitary", "Collision - TTC", "Mechanical", "Operations - Operator",
    "Diversion", "Emergency Services", "Utilized Off Route", "Investigation",
    "Road Blocked - NON-TTC Collision", "Vision", "General Delay", "Security",
]  # known incident types

schema = pa.DataFrameSchema(
    {
        "Route": pa.Column(str, nullable=True),
        "Day": pa.Column(str, checks=[pa.Check.isin(valid_days, error="Day must be a valid weekday")]),
        "Location": pa.Column(str),
        # The message now states the real rule instead of naming 3 of the 12 types.
        "Incident": pa.Column(str, checks=[pa.Check.isin(
                    valid_incidents,
                    error="Incident must be one of the known incident types")]),
        "Min Delay": pa.Column(int, checks=[pa.Check.ge(0, error="Min Delay must be non-negative")]),
        "Min Gap": pa.Column(int, checks=[pa.Check.ge(0, error="Min Gap must be non-negative")]),
        "Date_": pa.Column(pd.Timestamp, checks=pa.Check(lambda x: x.apply(lambda d: isinstance(d, (str, pd.Timestamp))),error="Date must be string or datetime")), # used ChatGPT to get this datetime check
        "Month": pa.Column(int, checks=[pa.Check.ge(1, error="Month must be >= 1"), pa.Check.le(12, error="Month must be <= 12")]),
        "Hour": pa.Column(int,  checks=[pa.Check.ge(0, error="Hour must be >= 0"), pa.Check.le(23, error="Hour must be <= 23")]),
    },
    checks=[
        # not checking for duplicate rows here
        # `not` rather than `~`: `~x.any()` applies `~` to the scalar result,
        # which is always truthy if that scalar is a plain Python bool.
        pa.Check(lambda df: not df.isna().all(axis=1).any(), error="Empty rows found"),
        pa.Check(lambda df: df["Min Delay"].isna().mean() <= 0.05, error="Min Delay missingness exceeds 5%"),
        pa.Check(lambda df: df["Min Gap"].isna().mean() <= 0.05, error="Min Gap missingness exceeds 5%"),
    ],
)

# lazy=True collects every failure before raising instead of stopping at the first.
schema.validate(ttc_clean, lazy=True)

Unnamed: 0,Route,Day,Location,Incident,Min Delay,Min Gap,Date_,Month,Hour
0,89,Monday,KEELE AND GLENLAKE,Vision,10,20,2024-01-01,1,2
1,39,Monday,FINCH STATION,General Delay,20,40,2024-01-01,1,2
2,300,Monday,BLOOR AND MANNING,General Delay,0,0,2024-01-01,1,3
3,65,Monday,PARLIAMENT AND BLOOR,Security,0,0,2024-01-01,1,3
4,113,Monday,MAIN STATION,Security,0,0,2024-01-01,1,3
...,...,...,...,...,...,...,...,...,...
45295,63,Monday,KING AND DOWLING,Vision,10,20,2024-09-30,9,1
45296,32,Monday,EGLINTON AND OAKWOOD,Vision,13,18,2024-09-30,9,1
45297,63,Monday,OSSINGTON STATION,Vision,10,20,2024-09-30,9,1
45298,31,Monday,COXWELL STATION,Emergency Services,17,34,2024-09-30,9,1


#### Other checks outside of the schema
There are other data validation checks to perform on the dataset, including flagging duplicates, checking correlations, etc.
For this, we will create a function so the checks stay contained and can be reused later.

In [9]:
# additional data validation for the dataframe
def additional_validations(df: pd.DataFrame, required_columns=None, max_minutes=1440) -> None:
    """Run dataset-level sanity checks that sit outside the pandera schema.

    Checks, in order: required columns are present, Min Delay / Min Gap do not
    exceed ``max_minutes``, Day and Incident contain only known levels, more
    than 5% of Min Delay values are positive, duplicate rows (reported, not
    fatal), and the numeric correlation matrix contains no NaN.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to check. It must contain "Min Delay", "Min Gap", "Day" and
        "Incident" in addition to ``required_columns``.
    required_columns : iterable of str, optional
        Column names that must be present. Defaults to the raw-file columns,
        so existing calls are unchanged; pass a smaller set to reuse the
        checks on a frame whose columns were dropped or renamed.
    max_minutes : int or float, default 1440
        Largest Min Delay / Min Gap value accepted before it counts as an outlier.

    Raises
    ------
    AssertionError
        If any check other than the duplicate-row report fails.
    """
    valid_days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    valid_incidents = [
        "Cleaning - Unsanitary", "Collision - TTC", "Mechanical", "Operations - Operator",
        "Diversion", "Emergency Services", "Utilized Off Route", "Investigation",
        "Road Blocked - NON-TTC Collision", "Vision", "General Delay", "Security",
    ]

    # check for correct column names
    if required_columns is None:
        required_columns = {"Date", "Route", "Time", "Day", "Location", "Incident", "Min Delay", "Min Gap", "Direction", "Vehicle"}
    missing_columns = set(required_columns) - set(df.columns)
    assert not missing_columns, f"Missing columns: {missing_columns}"

    # check for outliers (example: Min Delay or Min Gap unreasonably large)
    # Test "> limit" rather than "<= limit": NaN compares False either way, so
    # this form does not report a missing value as an outlier.
    assert not (df["Min Delay"] > max_minutes).any(), "Outlier found in Min Delay"
    assert not (df["Min Gap"] > max_minutes).any(), "Outlier found in Min Gap"

    # check category levels
    assert df["Day"].isin(valid_days).all(), "Invalid Day values"
    assert df["Incident"].isin(valid_incidents).all(), "Invalid Incident values"

    # target variable distribution (example: ensure Min Delay isn't mostly 0)
    assert (df["Min Delay"] > 0).mean() > 0.05, "Min Delay mostly zero, check for data skewness"

    # check for duplicate rows (reported only; duplicates do not fail validation)
    duplicates = df[df.duplicated()]
    if not duplicates.empty:
        print("Duplicate rows found:\n", duplicates)
    else:
        print("No duplicate rows found.")

    # correlation checks: a NaN entry means a correlation could not be computed
    corr_matrix = df.corr(numeric_only=True)
    assert not corr_matrix.isnull().values.any(), "Anomalous correlations detected in numeric features"
    print("All checks passed")


# Run the extra checks on the raw frame `ttc`, which still has the Date, Time,
# Direction and Vehicle columns that the function's required_columns set expects.
additional_validations(ttc)

Duplicate rows found:
             Date Route   Time        Day                Location  \
246   2024-01-02    89  19:54    Tuesday      WILSON AND CLAYSON   
1320  2024-01-09    72  15:29    Tuesday           FRONT AND BAY   
1323  2024-01-09    95  15:33    Tuesday   ELLESMERE AND KENNEDY   
1449  2024-01-10   939  07:29  Wednesday       FINCH AND BAYVIEW   
1688  2024-01-11     8  18:17   Thursday          WARDEN STATION   
...          ...   ...    ...        ...                     ...   
43894 2024-09-22   109  07:38     Sunday   LAWRENCE WEST STATION   
44085 2024-09-23    51  13:10     Monday  LESLIE AND THORNY VINE   
44368 2024-09-25   927  10:37  Wednesday   HIGHWAY 27 AND QUEENS   
45061 2024-09-29   929  15:03     Sunday          WILSON STATION   
45080 2024-09-29    44  16:58     Sunday  KIPLING AND NEW TORONT   

                    Incident  Min Delay  Min Gap Direction  Vehicle  
246               Mechanical         10       20         N     8169  
1320              Me

In [10]:
# Keep only delays strictly between 0 and 30 minutes, then renumber the rows.
ttc_lr = (
    ttc_clean[ttc_clean["Min Delay"].between(0, 30, inclusive="neither")]
    .reset_index(drop=True)
)
ttc_lr

Unnamed: 0,Route,Day,Location,Incident,Min Delay,Min Gap,Date_,Month,Hour
0,89,Monday,KEELE AND GLENLAKE,Vision,10,20,2024-01-01,1,2
1,39,Monday,FINCH STATION,General Delay,20,40,2024-01-01,1,2
2,320,Monday,YONGE AND QUEENSQUAY,Operations - Operator,8,16,2024-01-01,1,3
3,171,Monday,MOUNT DENNIS GARAGE,General Delay,20,20,2024-01-01,1,4
4,12,Monday,VICTORIA PARK AND DANF,Emergency Services,21,42,2024-01-01,1,4
...,...,...,...,...,...,...,...,...,...
35251,63,Monday,KING AND DOWLING,Vision,10,20,2024-09-30,9,1
35252,32,Monday,EGLINTON AND OAKWOOD,Vision,13,18,2024-09-30,9,1
35253,63,Monday,OSSINGTON STATION,Vision,10,20,2024-09-30,9,1
35254,31,Monday,COXWELL STATION,Emergency Services,17,34,2024-09-30,9,1


In [11]:
# Separate the predictor columns (X) from the target column (y)
X = ttc_lr.loc[:, ["Route", "Incident", "Location", "Day", "Hour", "Month"]]
y = ttc_lr.loc[:, 'Min Delay']

In [12]:
# Columns routed to the scaler
numeric_features = [
    "Hour",
    "Month",
]
# Columns routed to the one-hot encoder
categorical_features = [
    "Location",
    "Route",
    "Incident",
    "Day",
]

In [13]:
# Build the preprocessing step: scale numeric columns, one-hot encode categorical ones
column_steps = [
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)
preprocessor

In [14]:
# Chain the preprocessing step and the classifier into a single estimator
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(random_state=123, max_iter=2000)),
]
model_pipeline = Pipeline(pipeline_steps)
model_pipeline

In [15]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)