# 🧪 Exploratory Data Analysis: MTA Ridership Dataset
This notebook performs exploratory data analysis on the Metropolitan Transportation Authority (MTA) dataset to identify key features for ML modeling.

In [None]:
# 📦 Install dependencies (uncomment if needed)
# %pip install pandas matplotlib seaborn

In [None]:
# 📚 Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Use set_theme(): sns.set() is only a legacy alias for it and may be removed
# from seaborn in a future release.
sns.set_theme(style='whitegrid')

In [None]:
# 📥 Load dataset
# Point DATA_PATH at your actual MTA data file before running this cell
DATA_PATH = '../data_ingestion/mta_data/your_mta_file.csv'
df = pd.read_csv(DATA_PATH)
# Preview the first rows (rendered as the cell output)
df.head()

In [None]:
# ℹ️ Basic dataset information
# Prints a per-column summary of the loaded DataFrame (column names, non-null
# counts, dtypes) — a quick way to spot columns that need type conversion.
df.info()

In [None]:
# 📊 Summary statistics
# include='all' describes every column, not only the numeric ones, so
# statistics that do not apply to a column's dtype show up as NaN.
df.describe(include='all')

In [None]:
# ❓ Count the missing values in each column
df.isna().sum()

In [None]:
# 📅 Parse the 'date' column into datetimes, when the dataset has one
# (`in df` tests membership among the DataFrame's column labels)
if 'date' in df:
    df['date'] = pd.to_datetime(df['date'])

In [None]:
# 📈 Monthly ridership trend (if date and ridership columns exist)
# NOTE: 'date' must already be datetime-typed (see the conversion cell above);
# resample() only works on a datetime-like index.
if 'date' in df.columns and 'ridership' in df.columns:
    # Pass a MonthEnd offset object rather than the 'M' string alias: the
    # alias is deprecated since pandas 2.2 (renamed 'ME'), whereas the offset
    # object yields the same month-end bins on old and new pandas alike.
    monthly_ridership = (
        df.set_index('date')
        .resample(pd.offsets.MonthEnd())['ridership']
        .sum()
    )
    # Label the Axes returned by plot() instead of relying on pyplot's
    # implicit "current axes" state.
    ax = monthly_ridership.plot(figsize=(12, 6), title='Monthly Ridership Over Time')
    ax.set_ylabel('Total Ridership')
    ax.set_xlabel('Date')
    plt.show()

In [None]:
# 🏙️ Top stations by ridership (if station column exists)
if 'station' in df.columns and 'ridership' in df.columns:
    top_n = 10  # number of stations to show; the plot title follows this value
    # nlargest() selects the highest totals directly, already in descending order
    top_stations = df.groupby('station')['ridership'].sum().nlargest(top_n)
    ax = top_stations.plot(kind='bar', figsize=(10, 6), title=f'Top {top_n} Stations by Ridership')
    ax.set_ylabel('Total Ridership')
    ax.set_xlabel('Station')
    # Anchor the rotated labels at their right end so each one lines up with
    # its bar (the default centre anchoring shifts them sideways).
    plt.xticks(rotation=45, ha='right')
    plt.show()