# EV Charging Station EDA - Station 1068

This notebook performs exploratory data analysis on EV charging station data from the UrbanEV dataset.

**Station Focus:** Station 1068 (one of the largest stations in the dataset)

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings

# Suppress all warnings so library notices do not clutter the cell outputs
warnings.simplefilter('ignore')

# Plot defaults shared by every figure below: seaborn grid theme, 12x6 inch canvas
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("Libraries imported successfully!")

## 2. Load Data for Station 1068

In [None]:
# Read the station metadata table from the UrbanEV data directory
station_info = pd.read_csv('UrbanEV-main/data/inf.csv')

# Show the first rows, then the number of rows in the table
print("Station Information:", station_info.head(), sep="\n")
print(f"\nTotal stations: {station_info.shape[0]}")

In [None]:
# Select the metadata row(s) whose TAZID equals 1068
station_1068 = station_info[station_info['TAZID'] == 1068]

# Fail early with a clear message: the first-row lookups below (and in the
# summary cell) would otherwise die with an opaque IndexError on an empty frame.
if station_1068.empty:
    raise ValueError("No row with TAZID == 1068 found in the station information table")

print("Station 1068 Details:")
print(station_1068)
print(f"\nNumber of charging piles: {station_1068['charge_count'].iloc[0]}")

In [None]:
# Load the station dataset (described by the notebook as occupancy, weather,
# and prices), parse the 'time' column into timestamps, and index rows by it
station_data = (
    pd.read_csv('station_1068_dataset.csv')
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    .set_index('time')
)

# Report size, covered time span, and the column names that were loaded
print("Loaded complete dataset for Station 1068")
print(f"Data shape: {station_data.shape}")
print(f"Date range: {station_data.index.min()} to {station_data.index.max()}")
print("\nAvailable columns:")
print(station_data.columns.tolist())

## 3. Data Overview and Structure

In [None]:
# Peek at both ends of the time-indexed frame (10 rows each)
print("First 10 rows:", station_data.head(10), sep="\n")
print("\nLast 10 rows:", station_data.tail(10), sep="\n")

In [None]:
# Basic information
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
print("Dataset Info:")
station_data.info()
print(f"\nTotal records: {len(station_data)}")
print(f"Data type: {station_data['occupancy'].dtype}")

## 4. Missing Values Analysis

In [None]:
# Flag rows whose occupancy reading is absent, then derive the count and share
missing_mask = station_data['occupancy'].isna()
missing_count = missing_mask.sum()
missing_pct = (missing_count / len(station_data)) * 100

print(f"Missing values: {missing_count} ({missing_pct:.2f}%)")

# One tick mark per timestamp on a single flat row:
# red where occupancy is missing, faint green where it is present
present_mask = ~missing_mask
fig, ax = plt.subplots(figsize=(14, 3))
ax.scatter(station_data.index[missing_mask], np.ones(missing_count),
           c='red', marker='|', s=100, label='Missing')
ax.scatter(station_data.index[present_mask], np.ones(present_mask.sum()),
           c='green', marker='|', s=100, alpha=0.1, label='Present')
ax.set_yticks([])  # the y position carries no information
ax.set(xlabel='Time', title='Missing Values Distribution Over Time')
ax.legend()
plt.tight_layout()
plt.show()

## 5. Statistical Summary

In [None]:
# describe() summary of the whole frame
print("Descriptive Statistics:")
print(station_data.describe())

# Spread and shape measures for the occupancy column alone
occ_series = station_data['occupancy']
print("\nAdditional Statistics:")
print(f"Range: {occ_series.min()} - {occ_series.max()}")
print(f"Variance: {occ_series.var():.2f}")
print(f"Skewness: {occ_series.skew():.2f}")
print(f"Kurtosis: {occ_series.kurtosis():.2f}")

## 6. Time Series Visualization

In [None]:
# Plot the complete time series: occupancy and temperature on twin y-axes
fig, ax1 = plt.subplots(figsize=(15, 6))

# Occupancy on the left y-axis
color1 = 'steelblue'
ax1.plot(station_data.index, station_data['occupancy'], linewidth=0.8, alpha=0.7, color=color1, label='Occupancy')
ax1.set_xlabel('Time')
ax1.set_ylabel('Occupancy (vehicles)', color=color1)
ax1.tick_params(axis='y', labelcolor=color1)
ax1.grid(True, alpha=0.3)

# Temperature (column 'T') on a second y-axis that shares the time axis
ax2 = ax1.twinx()
color2 = 'coral'
ax2.plot(station_data.index, station_data['T'], linewidth=0.8, alpha=0.7, color=color2, label='Temperature')
ax2.set_ylabel('Temperature (°C)', color=color2)  # proper degree sign in the label
ax2.tick_params(axis='y', labelcolor=color2)

# Title on an explicit axes instead of relying on pyplot's "current axes"
ax1.set_title('Station 1068 - Occupancy and Temperature Over Time')
# A figure-level legend gathers the labelled lines from both axes
fig.legend(loc='upper right', bbox_to_anchor=(0.98, 0.98))
plt.tight_layout()
plt.show()

In [None]:
# Zoom into the first week
# Select by timestamp rather than by row count: a fixed 24*7 rows only equals
# seven days when the data are strictly hourly, gap-free, and already sorted.
week_start = station_data.index.min()
first_week = station_data.loc[
    station_data.index < week_start + pd.Timedelta(days=7)
].sort_index()

fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(first_week.index, first_week['occupancy'], marker='o', linewidth=2)
ax.set_xlabel('Time')
ax.set_ylabel('Occupancy')
ax.set_title('Station 1068 - First Week Occupancy Pattern')
ax.grid(True, alpha=0.3)
ax.tick_params(axis='x', labelrotation=45)  # slanted date labels to avoid overlap
plt.tight_layout()
plt.show()

## 7. Distribution Analysis

In [None]:
# Histogram and box plot of the occupancy values
# Drop missing readings once and reuse the result for both panels.
occupancy_values = station_data['occupancy'].dropna()

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: histogram
axes[0].hist(occupancy_values, bins=50, edgecolor='black', alpha=0.7)
axes[0].set_xlabel('Occupancy')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Occupancy Distribution - Histogram')
axes[0].grid(True, alpha=0.3)

# Right panel: box plot. Vertical is the default orientation, so the
# pending-deprecated `vert=True` argument is not passed.
axes[1].boxplot(occupancy_values)
axes[1].set_ylabel('Occupancy')
axes[1].set_title('Occupancy Distribution - Box Plot')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Tukey fences: values more than 1.5 * IQR outside the quartiles count as outliers
Q1, Q3 = station_data['occupancy'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Rows with missing occupancy compare False on both sides and are therefore not flagged
below_fence = station_data['occupancy'].lt(lower_bound)
above_fence = station_data['occupancy'].gt(upper_bound)
outliers = station_data[below_fence | above_fence]

# Share is taken relative to all rows in the frame
outlier_share = len(outliers) / len(station_data) * 100
print(f"Outliers detected: {len(outliers)} ({outlier_share:.2f}%)")
print(f"Lower bound: {lower_bound:.2f}, Upper bound: {upper_bound:.2f}")

## 8. Temporal Pattern Analysis

In [None]:
# Derive calendar features from the DatetimeIndex; later cells group on
# 'hour' and 'day_of_week'
station_data = station_data.assign(
    hour=station_data.index.hour,
    day_of_week=station_data.index.dayofweek,
    day_name=station_data.index.day_name(),
    month=station_data.index.month,
)

# Mean occupancy for each hour of the day
hourly_avg = station_data.groupby('hour')['occupancy'].mean()

fig, ax = plt.subplots(figsize=(12, 5))
hourly_avg.plot.bar(ax=ax, color='steelblue', edgecolor='black')
ax.set(
    xlabel='Hour of Day',
    ylabel='Average Occupancy',
    title='Average Occupancy by Hour of Day',
)
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

In [None]:
# Daily pattern: mean occupancy per day of week
# 'day_of_week' comes from DatetimeIndex.dayofweek, where Monday is 0, so the
# list position of each name matches its day number.
day_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
daily_avg = station_data.groupby('day_of_week')['occupancy'].mean()

fig, ax = plt.subplots(figsize=(12, 5))
daily_avg.plot(kind='bar', ax=ax, color='coral', edgecolor='black')
# Build the labels from the days actually present: a fixed 7-item list would
# mislabel bars (or fail on a label/tick count mismatch) if a weekday is absent.
ax.set_xticklabels([day_names[day] for day in daily_avg.index], rotation=45)
ax.set_xlabel('Day of Week')
ax.set_ylabel('Average Occupancy')
ax.set_title('Average Occupancy by Day of Week')
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

In [None]:
# Heatmap: mean occupancy for every (hour of day, day of week) combination
pivot_table = station_data.pivot_table(values='occupancy',
                                        index='hour',
                                        columns='day_of_week',
                                        aggfunc='mean')

fig, ax = plt.subplots(figsize=(12, 8))
# xticklabels=True forces one tick per column, so the relabelling below always
# has exactly as many labels as ticks. `fmt` is omitted because it only
# applies when annot=True.
sns.heatmap(pivot_table, cmap='YlOrRd', annot=False, xticklabels=True, ax=ax,
            cbar_kws={'label': 'Occupancy'})
ax.set_xlabel('Day of Week')
ax.set_ylabel('Hour of Day')
# Map each day number present in the pivot to its name instead of assuming all
# seven days occur in the data.
ax.set_xticklabels([day_names[day] for day in pivot_table.columns], rotation=45)
ax.set_title('Occupancy Heatmap: Hour vs Day of Week')
plt.tight_layout()
plt.show()

## 9. Summary and Key Findings

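The cell below prints the key insights from the exploratory analysis: station details, dataset coverage, occupancy statistics, and the peak and lowest usage periods.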

In [None]:
# Summary report: collects values computed by earlier cells (station_1068,
# missing_count / missing_pct, hourly_avg, daily_avg, day_names) into one
# printed overview. Section icons and the squared-metre sign are written as
# proper Unicode characters rather than mis-decoded byte sequences.
print("=" * 60)
print("STATION 1068 - EDA SUMMARY")
print("=" * 60)
print("\n📍 Station Information:")
print(f"   Location: ({station_1068['longitude'].values[0]:.4f}, {station_1068['latitude'].values[0]:.4f})")
print(f"   Charging piles: {station_1068['charge_count'].values[0]}")
print(f"   Area: {station_1068['area'].values[0]:.2f} m²")

print("\n📊 Dataset Overview:")
print(f"   Total records: {len(station_data)}")
print(f"   Date range: {station_data.index.min().strftime('%Y-%m-%d')} to {station_data.index.max().strftime('%Y-%m-%d')}")
print(f"   Missing values: {missing_count} ({missing_pct:.2f}%)")

print("\n📈 Occupancy Statistics:")
print(f"   Mean: {station_data['occupancy'].mean():.2f}")
print(f"   Median: {station_data['occupancy'].median():.2f}")
print(f"   Std Dev: {station_data['occupancy'].std():.2f}")
print(f"   Min: {station_data['occupancy'].min():.2f}")
print(f"   Max: {station_data['occupancy'].max():.2f}")

# idxmax / idxmin return the group label (hour number or day-of-week number)
print("\n⏰ Peak Usage:")
peak_hour = hourly_avg.idxmax()
peak_day = daily_avg.idxmax()
print(f"   Peak hour: {peak_hour}:00 (avg: {hourly_avg[peak_hour]:.2f})")
print(f"   Peak day: {day_names[peak_day]} (avg: {daily_avg[peak_day]:.2f})")

print("\n🔻 Lowest Usage:")
low_hour = hourly_avg.idxmin()
low_day = daily_avg.idxmin()
print(f"   Lowest hour: {low_hour}:00 (avg: {hourly_avg[low_hour]:.2f})")
print(f"   Lowest day: {day_names[low_day]} (avg: {daily_avg[low_day]:.2f})")

print("=" * 60)