## Generate Data Sets

In [2]:
import pandas as pd
import numpy as np

### Waste Collection Points Data Set

In [9]:
# Build a synthetic table of waste collection points and persist it to CSV.
num_points = 50

waste_type_options = ['Organic', 'Recyclable', 'Non-recyclable']
frequency_options = ['Weekly', 'Daily', 'Twice a week']

# Columns are drawn in the same order they appear in the table:
# coordinates (latitude, then longitude, per point), type, volume, frequency.
location_ids = np.arange(1, num_points + 1)
coordinates = [
    (np.random.uniform(-90, 90), np.random.uniform(-180, 180))
    for _ in range(num_points)
]
waste_types = np.random.choice(waste_type_options, num_points)
waste_volumes = np.random.uniform(1, 10, num_points)
collection_frequencies = np.random.choice(frequency_options, num_points)

waste_collection_points = pd.DataFrame({
    'Location ID': location_ids,
    'Coordinates (Lat, Long)': coordinates,
    'Waste Type': waste_types,
    'Waste Volume (cu m)': waste_volumes,
    'Collection Frequency': collection_frequencies,
})

# index=False keeps the pandas row index out of the file.
waste_collection_points.to_csv('waste_collection_points.csv', index=False)


In [10]:
# Read the waste collection points back from disk and print a quick profile.
df = pd.read_csv('waste_collection_points.csv')

categorical_columns = ['Waste Type', 'Collection Frequency']

# Preview of the leading rows
print("First 5 rows of the dataset:")
print(df.head())

# Column names with the dtypes pandas inferred on load
print("\nDataset Structure (Columns and Data Types):")
print(df.dtypes)

# Summary statistics as reported by describe()
print("\nSummary Statistics for Numeric Columns:")
print(df.describe())

# Cardinality of each categorical column
print("\nCount of Unique Values in Categorical Columns:")
for col in categorical_columns:
    distinct_count = df[col].nunique()
    print(f"{col}: {distinct_count} unique values")

# Per-category row counts, one table per categorical column
frequency_sections = [
    ("\nFrequency Distribution for 'Waste Type':", 'Waste Type'),
    ("\nFrequency Distribution for 'Collection Frequency':", 'Collection Frequency'),
]
for heading, column_name in frequency_sections:
    print(heading)
    print(df[column_name].value_counts())


First 5 rows of the dataset:
   Location ID                    Coordinates (Lat, Long)      Waste Type  \
0            1    (-64.61539220596146, 174.7381010662496)         Organic   
1            2  (-49.083046112939314, -20.19466790822341)      Recyclable   
2            3   (81.93321909071958, -141.21443714276276)         Organic   
3            4     (-66.38915312099195, -64.791730564401)  Non-recyclable   
4            5   (89.60542553890991, -155.04595036428202)      Recyclable   

   Waste Volume (cu m) Collection Frequency  
0             1.178636         Twice a week  
1             6.377859         Twice a week  
2             8.859833         Twice a week  
3             4.861165               Weekly  
4             2.254312         Twice a week  

Dataset Structure (Columns and Data Types):
Location ID                  int64
Coordinates (Lat, Long)     object
Waste Type                  object
Waste Volume (cu m)        float64
Collection Frequency        object
dtype: object

### Urban Road Network Data Set

In [5]:
# Number of nodes and edges in the synthetic road network
num_nodes = 100
num_edges = 150

# Generate nodes: IDs run from 1 to num_nodes inclusive
nodes = np.arange(1, num_nodes + 1)


def generate_edges(node_ids, edge_count):
    """Build a random edge table whose endpoints are drawn from ``node_ids``.

    Parameters
    ----------
    node_ids : array-like of int
        Candidate node IDs; every ID is eligible for every node column.
    edge_count : int
        Number of edge rows to generate.

    Returns
    -------
    pandas.DataFrame
        Columns 'Node ID', 'Edge ID' ("E1".."E<edge_count>"),
        'Edge Weight (km)' (uniform in [0.5, 10.0)) and
        'Connectivity (Node IDs)' (a "start,end" string per edge).
    """
    node_column = np.random.choice(node_ids, edge_count)
    edge_labels = ['E' + str(i) for i in range(1, edge_count + 1)]
    edge_weights = np.random.uniform(0.5, 10.0, edge_count)
    # Sample both endpoints from node_ids itself. The previous
    # np.random.randint(1, num_nodes) call has an exclusive upper bound, so the
    # highest node ID could never appear as an endpoint.
    endpoint_pairs = np.random.choice(node_ids, size=(edge_count, 2))
    connectivity = [f"{start},{end}" for start, end in endpoint_pairs]

    return pd.DataFrame({
        'Node ID': node_column,
        'Edge ID': edge_labels,
        'Edge Weight (km)': edge_weights,
        'Connectivity (Node IDs)': connectivity,
    })


# Generate edges
edges = generate_edges(nodes, num_edges)

# Save to CSV
edges.to_csv('urban_road_network.csv', index=False)

In [6]:
# Load the urban road network dataset
df = pd.read_csv('urban_road_network.csv')

# Display the first few rows to get an overview of the data
print("First 5 rows of the dataset:")
print(df.head(), '\n')

# Dataset structure: Column names and data types
print("Dataset Structure (Columns and Data Types):")
print(df.dtypes, '\n')

# Basic summary statistics for the 'Edge Weight (km)' column
print("Summary Statistics for 'Edge Weight (km)':")
print(df['Edge Weight (km)'].describe(), '\n')

# Explore the connectivity patterns
# Each entry is a "start,end" pair of node IDs; only the first rows are shown.
print("Connectivity Patterns (Example Rows):")
print(df['Connectivity (Node IDs)'].head(), '\n')

# Count unique nodes and edges
# Both endpoints of every connectivity pair are included. Taking only the
# second element of the split would drop every start node and under-count.
endpoints = df['Connectivity (Node IDs)'].str.split(',', expand=True).astype(int)
unique_nodes = pd.concat([df['Node ID'], endpoints[0], endpoints[1]]).unique()
print(f"Number of Unique Nodes: {len(unique_nodes)}")
unique_edges = df['Edge ID'].nunique()
print(f"Number of Unique Edges: {unique_edges}", '\n')

# Explore distribution of edge weights
# bins=10 groups the weights into ten equal-width intervals; sort=False keeps
# the result in interval order rather than ordering it by count.
print("Edge Weight Distribution (First 10 Bins):")
print(df['Edge Weight (km)'].value_counts(bins=10, sort=False).head(10))


First 5 rows of the dataset:
   Node ID Edge ID  Edge Weight (km) Connectivity (Node IDs)
0       14      E1          7.794422                   90,84
1       76      E2          5.757704                   12,71
2       42      E3          1.311044                   70,73
3       45      E4          5.526044                   43,32
4       82      E5          2.723888                   81,73 

Dataset Structure (Columns and Data Types):
Node ID                      int64
Edge ID                     object
Edge Weight (km)           float64
Connectivity (Node IDs)     object
dtype: object 

Summary Statistics for 'Edge Weight (km)':
count    150.000000
mean       5.382109
std        2.702472
min        0.520579
25%        3.384199
50%        5.246175
75%        7.747274
max        9.923500
Name: Edge Weight (km), dtype: float64 

Connectivity Patterns (Example Rows):
0    90,84
1    12,71
2    70,73
3    43,32
4    81,73
Name: Connectivity (Node IDs), dtype: object 

Number of Unique No

### Traffic Data Set

In [7]:
# A single timestamp keeps the example simple; add more entries to the list to
# spread observations over several points in time.
timestamps = ['2023-01-01 00:00:00']

# One traffic observation per edge of the road network generated earlier.
# Columns are drawn in table order: timestamp, speed, density.
edge_ids = edges['Edge ID']
sampled_timestamps = np.random.choice(timestamps, num_edges)
traffic_speeds = np.random.uniform(20, 100, num_edges)
traffic_densities = np.random.uniform(10, 200, num_edges)

traffic_data = pd.DataFrame({
    'Edge ID': edge_ids,
    'Time Stamp': sampled_timestamps,
    'Traffic Speed (km/h)': traffic_speeds,
    'Traffic Density (vehicles/km)': traffic_densities,
})

# index=False keeps the pandas row index out of the file.
traffic_data.to_csv('traffic_data.csv', index=False)


In [8]:
# Read the traffic observations back from disk and print a quick profile.
df_traffic = pd.read_csv('traffic_data.csv')

# Pull out the columns that are inspected more than once below
time_stamps = df_traffic['Time Stamp']
speeds = df_traffic['Traffic Speed (km/h)']
densities = df_traffic['Traffic Density (vehicles/km)']

# Preview of the leading rows
print("First 5 rows of the dataset:")
print(df_traffic.head(), '\n')

# Column names with the dtypes pandas inferred on load
print("Dataset Structure (Columns and Data Types):")
print(df_traffic.dtypes, '\n')

# Summary statistics as reported by describe()
print("Summary Statistics for Numeric Columns:")
print(df_traffic.describe(), '\n')

# Earliest and latest values found in the timestamp column
print("Timestamp Range:")
earliest = time_stamps.min()
latest = time_stamps.max()
print(f"Start: {earliest}, End: {latest}", '\n')

# Headline figures for speed and density; plots would suit a deeper analysis
print("Traffic Speed Insights:")
print(f"Average speed: {speeds.mean()} km/h")
print(f"Max speed: {speeds.max()} km/h", '\n')

print("Traffic Density Insights:")
print(f"Average density: {densities.mean()} vehicles/km")
print(f"Max density: {densities.max()} vehicles/km")


First 5 rows of the dataset:
  Edge ID           Time Stamp  Traffic Speed (km/h)  \
0      E1  2023-01-01 00:00:00             69.423827   
1      E2  2023-01-01 00:00:00             48.107733   
2      E3  2023-01-01 00:00:00             90.822712   
3      E4  2023-01-01 00:00:00             40.064544   
4      E5  2023-01-01 00:00:00             66.546414   

   Traffic Density (vehicles/km)  
0                     120.738822  
1                     146.612850  
2                     132.519272  
3                      86.126774  
4                      92.192438   

Dataset Structure (Columns and Data Types):
Edge ID                           object
Time Stamp                        object
Traffic Speed (km/h)             float64
Traffic Density (vehicles/km)    float64
dtype: object 

Summary Statistics for Numeric Columns:
       Traffic Speed (km/h)  Traffic Density (vehicles/km)
count            150.000000                     150.000000
mean              61.526312             