In [28]:
import sys
!{sys.executable} -m pip install folium


Collecting folium
  Using cached folium-0.20.0-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting branca>=0.6.0 (from folium)
  Using cached branca-0.8.1-py3-none-any.whl.metadata (1.5 kB)
Using cached folium-0.20.0-py2.py3-none-any.whl (113 kB)
Using cached branca-0.8.1-py3-none-any.whl (26 kB)
Installing collected packages: branca, folium
Successfully installed branca-0.8.1 folium-0.20.0


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import glob
import os
import json
from datetime import datetime, timedelta
import warnings
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import folium
from folium.plugins import HeatMap, MarkerCluster

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Shared plotting defaults: figure size, base font size, seaborn grid style,
# and the six-colour palette made available to later cells.
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})
sns.set_style("whitegrid")
color_palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']

# Run banner: title, rule, timestamp of this execution, project name, rule.
print(
    "EV PERFORMANCE ANALYSIS",
    "=" * 60,
    f"Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M')}",
    "Project: EV Industry Market Intelligence",
    "=" * 60,
    sep="\n",
)

EV PERFORMANCE ANALYSIS
Analysis Date: 2025-08-29 00:28
Project: EV Industry Market Intelligence


In [8]:
def load_ev_datasets(data_dir='data/raw', date_stamp='20250829'):
    """Load the vehicles, stations and sales CSV extracts into DataFrames.

    If the current working directory is named ``notebooks``, the process
    first changes to its parent directory so that the relative ``data_dir``
    resolves from there. This ``os.chdir`` is a kernel-wide side effect.

    Parameters
    ----------
    data_dir : str, default 'data/raw'
        Directory containing the CSV files (relative paths are resolved
        against the working directory after the optional ``chdir``).
    date_stamp : str, default '20250829'
        Suffix embedded in the file names. Glob wildcards are allowed
        (e.g. ``'*'`` or ``'2025*'``); when several files match a pattern,
        the one whose path sorts last is loaded.

    Returns
    -------
    tuple of (dict, dict)
        ``datasets`` maps dataset name to a DataFrame for every file that
        was read successfully and is not empty. ``file_info`` maps dataset
        name to the path that was selected; it is recorded before reading,
        so it also lists files that were empty or failed to load.
    """

    # Navigate to project root
    if os.path.basename(os.getcwd()) == 'notebooks':
        os.chdir('..')

    print(f"Working Directory: {os.getcwd()}")

    # Build the glob pattern for each dataset. The stations file name carries
    # an extra segment between the prefix and the date, hence the wildcard.
    # A plain '/' join keeps the default patterns identical to the previously
    # hard-coded ones.
    file_patterns = {
        'vehicles': f'{data_dir}/epa_vehicles_{date_stamp}.csv',
        'stations': f'{data_dir}/charging_stations_*_{date_stamp}.csv',
        'sales': f'{data_dir}/ev_sales_data_{date_stamp}.csv',
    }

    datasets = {}
    file_info = {}

    for name, pattern in file_patterns.items():
        files = sorted(glob.glob(pattern))
        if not files:
            print(f"No {name} files found matching {pattern}")
            continue

        # Lexicographically last match wins when a pattern hits several files.
        latest_file = files[-1]
        file_info[name] = latest_file

        try:
            df = pd.read_csv(latest_file)
            if df.empty:
                print(f"{name}: Empty dataset")
                continue

            datasets[name] = df
            print(f"{name}: {len(df):,} rows × {len(df.columns)} columns")

        except Exception as e:
            # Best effort: report the failure and carry on with the other
            # datasets; the caller decides what to do about missing keys.
            print(f"{name}: Loading error - {e}")

    return datasets, file_info

# Read the three extracts, then expose each as its own working copy so later
# cells never touch the frames held in `datasets`.
datasets, file_info = load_ev_datasets()
required_datasets = ['vehicles', 'stations', 'sales']
if set(required_datasets).issubset(datasets):
    # Unpacking order follows `required_datasets`: vehicles, stations, sales.
    vehicles_df, stations_df, sales_df = (
        datasets[key].copy() for key in required_datasets
    )
    print("\nAll datasets loaded successfully!")
else:
    print("\nCritical datasets missing. Please run data collection script first.")
    print("Run: python src/data_collection.py")

Working Directory: /Users/evanfu/Documents/Personal Projects/EV Performance Analysis/ev-performance-analysis
vehicles: 48 rows × 17 columns
stations: 19,915 rows × 30 columns
sales: 80 rows × 12 columns

All datasets loaded successfully!


In [10]:
def assess_data_quality(df, name):
    """Print a data-quality report for a single DataFrame.

    The report lists, in order: shape, deep memory footprint in MB, the
    count and percentage of missing values per affected column, the number
    of fully duplicated rows, min/max/mean for every numeric column, and
    the distinct-value count for at most the first five object columns.

    Args:
        df: DataFrame to inspect.
        name: Label for the report header; it is printed upper-cased.

    Returns:
        None. The report is written to stdout.
    """
    print(f"\nDATA QUALITY ASSESSMENT: {name.upper()}")
    print("-" * 40)

    # Size of the frame
    print(f"Shape: {df.shape}")
    total_bytes = df.memory_usage(deep=True).sum()
    print(f"Memory usage: {total_bytes / 1024**2:.2f} MB")

    # Per-column null counts; only columns with at least one gap are listed
    null_counts = df.isnull().sum()
    if null_counts.sum() > 0:
        print("\nMissing values:")
        row_total = len(df)
        for column in null_counts[null_counts > 0].index:
            n_missing = null_counts[column]
            share = (n_missing / row_total) * 100
            print(f"  {column}: {n_missing:,} ({share:.1f}%)")
    else:
        print("No missing values")

    # Rows that repeat an earlier row exactly
    repeated_rows = df.duplicated().sum()
    print(f"Duplicate rows: {repeated_rows:,}")

    # Range and mean of each numeric column
    numeric_names = df.select_dtypes(include=[np.number]).columns
    if len(numeric_names) > 0:
        print(f"\nNumeric columns: {len(numeric_names)}")
        for column in numeric_names:
            values = df[column]
            print(f"  {column}: [{values.min():.1f}, {values.max():.1f}], mean={values.mean():.1f}")

    # Cardinality of object columns; the header shows the full count but
    # only the first five columns are detailed
    object_names = df.select_dtypes(include=['object']).columns
    if len(object_names) > 0:
        print(f"\nCategorical columns: {len(object_names)}")
        for column in object_names[:5]:
            distinct = df[column].nunique()
            print(f"  {column}: {distinct} unique values")

# Assess all datasets
# Runs the quality report on each of the three working frames. The guard
# tests only for `vehicles_df`; the loading cell assigns `vehicles_df`,
# `stations_df` and `sales_df` together, so its presence stands in for all
# three. NOTE(review): on a kernel that was not restarted, the name may be
# left over from an earlier successful run - confirm with Restart & Run All.
if 'vehicles_df' in globals():
    assess_data_quality(vehicles_df, "EPA Vehicles")
    assess_data_quality(stations_df, "Charging Stations") 
    assess_data_quality(sales_df, "EV Sales")


DATA QUALITY ASSESSMENT: EPA VEHICLES
----------------------------------------
Shape: (48, 17)
Memory usage: 0.02 MB
No missing values
Duplicate rows: 0

Numeric columns: 10
  year: [2019.0, 2024.0], mean=2021.5
  city_mpg: [67.0, 150.0], mean=109.3
  highway_mpg: [61.0, 135.0], mean=98.3
  combined_mpg: [64.0, 142.0], mean=103.8
  range_miles: [237.0, 387.0], mean=327.1
  battery_capacity_kwh: [60.0, 135.0], mean=92.6
  charge_time_240v: [8.3, 18.8], mean=12.9
  msrp_base: [39838.0, 60622.0], mean=49766.9
  co2_emissions: [0.0, 0.0], mean=0.0
  ghg_score: [10.0, 10.0], mean=10.0

Categorical columns: 7
  make: 6 unique values
  model: 8 unique values
  drive_type: 3 unique values
  fuel_type: 1 unique values
  vehicle_class: 2 unique values

DATA QUALITY ASSESSMENT: CHARGING STATIONS
----------------------------------------
Shape: (19915, 30)
Memory usage: 23.46 MB

Missing values:
  street_address: 3 (0.0%)
  facility_type: 15,973 (80.2%)
  connector_types: 3 (0.0%)
  pricing: 17,00