<a href="https://colab.research.google.com/github/cosmf/-QR-Factorization-optimization/blob/main/Veveritele_salbatice_aad.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setup & Data Exploration

In [3]:
# Step 0: Setup - Mount Google Drive and Configure Kaggle API
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

# Set up Kaggle API credentials
!mkdir -p ~/.kaggle
!cp '/content/drive/My Drive/kaggle/kaggle.json' ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json

# Step 1: Download the Urban Traffic Density dataset
!kaggle datasets download -d tanishqdublish/urban-traffic-density-in-cities -p /content/urban-traffic-data

# Step 2: Unzip the downloaded files
!unzip -q /content/urban-traffic-data/urban-traffic-density-in-cities.zip -d /content/urban-traffic-data

# Step 3: Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 4: Load the dataset
# Update the filename below as needed – check the contents of /content/urban-traffic-data/ for the correct CSV file name.
df = pd.read_csv("/content/urban-traffic-data/Urban Traffic Density in Cities.csv")


# Quick exploration of the dataset
print("First 5 records:")
print(df.head())

print("\nDataframe Info:")
print(df.info())

print("\nDescriptive Statistics:")
print(df.describe())

Mounted at /content/drive
Dataset URL: https://www.kaggle.com/datasets/tanishqdublish/urban-traffic-density-in-cities
License(s): MIT
urban-traffic-density-in-cities.zip: Skipping, found more recently modified local copy (use --force to force download)
replace /content/urban-traffic-data/futuristic_city_traffic.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

FileNotFoundError: [Errno 2] No such file or directory: '/content/urban-traffic-data/Urban Traffic Density in Cities.csv'

Data Cleaning & Preprocessing

In [None]:
# Identify missing values in each column
print("\nMissing values in each column:")
print(df.isnull().sum())

# Example: Fill missing numeric values (e.g., 'speed') with the median if needed
if 'speed' in df.columns:
    median_speed = df['speed'].median()
    df['speed'].fillna(median_speed, inplace=True)

# Check for duplicates and remove them
num_duplicates = df.duplicated().sum()
print("\nNumber of duplicate records:", num_duplicates)
df.drop_duplicates(inplace=True)

# Convert date/time column to datetime type (assuming the column is named 'time')
if 'time' in df.columns:
    df['time'] = pd.to_datetime(df['time'], errors='coerce')

# Convert numeric columns explicitly (e.g., ensure 'speed' is numeric)
if 'speed' in df.columns:
    df['speed'] = pd.to_numeric(df['speed'], errors='coerce')

Dealing with Outliers

In [None]:
# Removing outliers for the 'speed' column using the IQR method
if 'speed' in df.columns:
    Q1 = df['speed'].quantile(0.25)
    Q3 = df['speed'].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    print(f"\nSpeed lower bound: {lower_bound}, upper bound: {upper_bound}")

    # Filter the dataset to only include values within the bounds
    df = df[(df['speed'] >= lower_bound) & (df['speed'] <= upper_bound)]


Filtering & Sorting Data

In [None]:
# Filtering example: records for rainy weather (make sure the 'weather' column exists)
if 'weather' in df.columns:
    df_rainy = df[df['weather'].str.lower() == 'rainy']
    print("\nFirst few records for rainy weather:")
    print(df_rainy.head())

# Filtering example: records where speed is greater than 50 (adjust threshold if needed)
if 'speed' in df.columns:
    df_speeding = df[df['speed'] > 50]
    print("\nRecords with speed > 50:")
    print(df_speeding.head())

# Sorting the dataset by 'speed' in descending order
if 'speed' in df.columns:
    df_sorted = df.sort_values(by='speed', ascending=False)
    print("\nTop records sorted by speed (descending):")
    print(df_sorted.head())


Grouping & Aggregation

In [None]:
# Grouping by weather condition to calculate the average speed
if 'weather' in df.columns and 'speed' in df.columns:
    avg_speed_weather = df.groupby('weather')['speed'].mean()
    print("\nAverage speed by weather condition:")
    print(avg_speed_weather)

# Grouping by hour to analyze traffic volume (requires that 'time' is a datetime column)
if 'time' in df.columns:
    traffic_volume_time = df.groupby(df['time'].dt.hour).size()
    print("\nTraffic volume by hour:")
    print(traffic_volume_time)

# More detailed aggregation: min, max, mean speeds by weather condition
if 'weather' in df.columns and 'speed' in df.columns:
    summary = df.groupby('weather').agg({'speed': ['mean', 'max', 'min']})
    print("\nSpeed summary (mean, max, min) by weather condition:")
    print(summary)


Exploratory Data Analysis (EDA) & Visualization

In [None]:
# Visualization 1: Distribution of speeds
if 'speed' in df.columns:
    plt.figure(figsize=(8, 5))
    df['speed'].hist(bins=30)
    plt.title("Speed Distribution")
    plt.xlabel("Speed")
    plt.ylabel("Frequency")
    plt.show()

# Visualization 2: Boxplot of speed by weather condition
if 'weather' in df.columns and 'speed' in df.columns:
    plt.figure(figsize=(8, 5))
    sns.boxplot(x='weather', y='speed', data=df)
    plt.title("Speed by Weather Condition")
    plt.show()

# Visualization 3: Traffic volume by hour (time-series analysis)
if 'time' in df.columns:
    traffic_hour = df.groupby(df['time'].dt.hour).size()
    plt.figure(figsize=(8, 5))
    traffic_hour.plot(kind='line')
    plt.title("Traffic Volume by Hour")
    plt.xlabel("Hour")
    plt.ylabel("Number of Records")
    plt.show()

# Visualization 4: Correlation heatmap (only numerical features)
numeric_cols = df.select_dtypes(include=[np.number])
plt.figure(figsize=(8, 5))
correlation = numeric_cols.corr()
sns.heatmap(correlation, annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()
