In [None]:
import pandas as pd

# Read the alerts CSV into the notebook-wide DataFrame. `dataset_path` is kept
# as its own variable because a later cell reloads the file from it.
dataset_path = 'unnati_phase1_data_revised.csv'
df = pd.read_csv(filepath_or_buffer=dataset_path)

# Preview the first five rows as the cell output.
df.head(n=5)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid look for the figures drawn from here on.
sns.set(style="whitegrid")

# One wide figure holding two side-by-side panels.
geo_fig, (scatter_ax, density_ax) = plt.subplots(1, 2, figsize=(20, 10))

# Left panel: one marker per row at (Long, Lat), colored by alert type.
sns.scatterplot(
    data=df, x='Long', y='Lat', hue='Alert',
    alpha=0.6, edgecolor=None, ax=scatter_ax,
)
scatter_ax.set_title('Geographical Distribution of Alerts')
scatter_ax.set_xlabel('Longitude')
scatter_ax.set_ylabel('Latitude')

# Right panel: filled 2-D kernel density estimate of the same coordinates,
# again split by alert type.
sns.kdeplot(data=df, x='Long', y='Lat', hue='Alert', fill=True, ax=density_ax)
density_ax.set_title('Density Plot of Geographical Distribution of Alerts')
density_ax.set_xlabel('Longitude')
density_ax.set_ylabel('Latitude')


geo_fig.tight_layout()
plt.show()


Geospatial Analysis Summary
Geographical Distribution of Alerts: The scatter plot on the left shows the geographical distribution of alerts based on latitude and longitude. Different colors represent different types of alerts.

Density Plot: The density plot on the right provides a heat map-like representation to show concentrations of alerts. Areas with darker shades indicate a higher concentration of alerts.

# Anomaly Detection


In [None]:
# Import required libraries for statistical analysis
from scipy import stats
import numpy as np

# Absolute z-score of every Speed value, stored as a new column on `df`.
# `nan_policy='omit'` makes scipy compute the mean and standard deviation from
# the non-missing speeds only. With the default ('propagate'), a single NaN in
# Speed turns *every* z-score into NaN, and the filter below would then return
# no rows at all, which looks exactly like "no outliers".
df['Speed_zscore'] = np.abs(stats.zscore(df['Speed'], nan_policy='omit'))

# Rows whose speed lies more than `threshold` standard deviations from the
# mean are treated as outliers (rows with a missing Speed never qualify,
# because NaN > threshold is False).
threshold = 3
speed_outliers = df[df['Speed_zscore'] > threshold]

# Boxplot of Speed as a visual cross-check. Its fliers follow the 1.5 * IQR
# rule, so the points it marks need not coincide with the z-score outliers.
plt.figure(figsize=(12, 6))
sns.boxplot(x=df['Speed'])
plt.title('Boxplot of Speed to Identify Outliers')
plt.xlabel('Speed')

plt.tight_layout()
plt.show()

# Show a few of the z-score outliers as the cell output.
speed_outliers.head()


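No outliers were found: in the run recorded here, no row had an absolute Speed z-score above the threshold of 3.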

Anomaly Detection: Alert Frequency

In [None]:
# Tabulate how many rows each alert type has, as a two-column table
# (Alert, Frequency) ordered from most to least frequent.
alert_frequency = (
    df['Alert']
    .value_counts()
    .rename_axis('Alert')
    .reset_index(name='Frequency')
)

# Alert types seen fewer than `frequency_threshold` times are flagged as
# potential anomalies.
frequency_threshold = 5
is_rare_alert = alert_frequency['Frequency'] < frequency_threshold
alert_anomalies = alert_frequency.loc[is_rare_alert]

# Bar chart of the per-type counts.
alert_fig, alert_ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=alert_frequency, x='Alert', y='Frequency', palette='viridis', ax=alert_ax)
alert_ax.set_title('Frequency of Different Types of Alerts')
alert_ax.set_xlabel('Alert Type')
alert_ax.set_ylabel('Frequency')
plt.xticks(rotation=45)  # slant the alert names so they do not overlap

alert_fig.tight_layout()
plt.show()

# Display the rare alert types (if any) as the cell output.
alert_anomalies


Anomaly Detection: Vehicle-Specific Anomalies

In [None]:
# Tally alerts per vehicle as a two-column table: Vehicle, Frequency.
vehicle_frequency = (
    df['Vehicle']
    .value_counts()
    .rename_axis('Vehicle')
    .reset_index(name='Frequency')
)

# Vehicles below the fixed low cut-off, or above the 95th percentile of the
# per-vehicle counts, are flagged as potential anomalies.
low_frequency_threshold = 5
high_frequency_threshold = vehicle_frequency['Frequency'].quantile(0.95)

alerts_per_vehicle = vehicle_frequency['Frequency']
low_frequency_anomalies = vehicle_frequency[alerts_per_vehicle < low_frequency_threshold]
high_frequency_anomalies = vehicle_frequency[alerts_per_vehicle > high_frequency_threshold]

# Histogram of the per-vehicle alert counts with both thresholds marked.
vehicle_fig, vehicle_ax = plt.subplots(figsize=(15, 6))
sns.histplot(alerts_per_vehicle, bins=30, kde=False, ax=vehicle_ax)
vehicle_ax.set_title('Frequency of Alerts by Vehicle')
vehicle_ax.set_xlabel('Number of Alerts')
vehicle_ax.set_ylabel('Number of Vehicles')
low_label = f'Low Frequency Threshold ({low_frequency_threshold})'
high_label = f'High Frequency Threshold ({int(high_frequency_threshold)})'
vehicle_ax.axvline(low_frequency_threshold, color='r', linestyle='--', label=low_label)
vehicle_ax.axvline(high_frequency_threshold, color='g', linestyle='--', label=high_label)
vehicle_ax.legend()

vehicle_fig.tight_layout()
plt.show()

# Display both anomaly tables (as a tuple) in the cell output.
low_frequency_anomalies, high_frequency_anomalies


Anomaly Detection Summary: Vehicle-Specific Anomalies
Frequency of Alerts by Vehicle: The histogram shows the distribution of the number of alerts generated by each vehicle. The red and green dashed lines represent the low and high frequency thresholds, respectively.

Low-Frequency Anomalies: Based on the low frequency threshold of 5, we didn't find any vehicles that generate alerts fewer than this number, indicating no anomalies in this aspect.

High-Frequency Anomalies: Using the 95th percentile as the high frequency threshold, we identified one vehicle (Vehicle ID 805) that generated a significantly high number of alerts (6875), which could be considered anomalous.

In [None]:
# Number of rows (alerts) whose Vehicle value equals 1995.
is_vehicle_1995 = df['Vehicle'] == 1995
vehicle_1995_frequency = len(df[is_vehicle_1995])

vehicle_1995_frequency

Vehicle ID 1995 has generated a total of 18 alerts, which is above the low-frequency threshold of 5 that we defined earlier. Therefore, it does not qualify as a low-frequency anomaly based on this criterion.

# Time Series Decomposition
Time series decomposition involves breaking a time series down into its constituent components, which typically include the trend, seasonality, and residual (noise). This helps us understand the underlying patterns in the data.

To perform time series decomposition, we'll:

1. Aggregate the data by date to get the total number of alerts per day.
2. Decompose the aggregated time series into trend, seasonal, and residual components.

In [None]:
# Parse the 'Date' column into pandas datetimes (overwrites the column on `df`).
df['Date'] = pd.to_datetime(df['Date'])

# Count the rows recorded for each distinct Date value, then turn the result
# into a two-column table: Date, Count.
alerts_per_date = df.groupby('Date').size()
time_series_data = alerts_per_date.reset_index(name='Count')

# Preview the aggregated table.
time_series_data.head()


In [None]:
! pip install statsmodels
from statsmodels.tsa.seasonal import seasonal_decompose


In [None]:
# seasonal_decompose with period=7 treats every 7th observation as the same
# weekday, which only holds when the series has exactly one value per calendar
# day. The per-date table skips days without any alert and carries a plain
# positional index, so first re-sample it onto a continuous daily
# DatetimeIndex: days with no alerts become 0, and the plots get real dates on
# the x-axis.
daily_counts = (
    time_series_data
    .set_index('Date')['Count']
    .resample('D')
    .sum()
)

# Additive decomposition with a weekly (7-day) seasonal period.
decomposition = seasonal_decompose(daily_counts, model='additive', period=7)

# Stack the four components in one figure with a shared date axis.
decomposition_fig, component_axes = plt.subplots(4, 1, figsize=(16, 8), sharex=True)
components = [
    ('Observed', decomposition.observed),
    ('Trend', decomposition.trend),
    ('Seasonal', decomposition.seasonal),
    ('Residual', decomposition.resid),
]
for component_ax, (component_title, component_series) in zip(component_axes, components):
    component_ax.plot(component_series)
    component_ax.set_title(component_title)

decomposition_fig.tight_layout()
plt.show()


Time Series Decomposition Summary
Observed: The first plot shows the original time series data, which is the total number of alerts per day.

Trend: The second plot shows the underlying trend in the data. It appears that there might be a general upward trend, although this should be confirmed with a longer time series for more accurate results.

Seasonal: The third plot represents the seasonal component, which indicates that there's a weekly pattern in the data. The seasonality suggests that certain days of the week may have higher or lower numbers of alerts.

Residual: The fourth plot shows the residual component, which is what remains after the trend and seasonal components have been removed. This can be useful for identifying anomalies or other patterns not captured by the trend and seasonal components.

Components of Time Series Decomposition
Trend: The trend represents the overall direction in which the data is moving. In the context of the alert data, the trend could tell us whether the number of alerts is generally increasing, decreasing, or staying the same over time.

Seasonal: This component captures the regular pattern of peaks and troughs in the time series data at fixed intervals (like daily, weekly, monthly, etc.). For instance, in our analysis, we noticed some weekly seasonality, meaning the number of alerts varies in a regular pattern over the course of a week.

Residual: The residual is what's left after the trend and seasonal components have been removed. It represents the "noise" or random variation in the data. Analyzing the residuals can sometimes reveal additional patterns or anomalies that are not captured by the trend and seasonal components.

Our Findings in the Context of the Dataset
Observed: We started by looking at the raw time series data, which was the total count of alerts for each day.

Trend: We observed a possible upward trend, suggesting that the number of alerts might be increasing over time. However, this would need to be confirmed with a longer dataset for more robust insights.

Seasonal: Our analysis indicated a weekly seasonality in the data. This could mean that certain days of the week have higher or lower numbers of alerts, which could be influenced by factors like workdays vs. weekends, traffic patterns, etc.

Residual: The residual plot showed what's left after removing the trend and seasonal components. This can be useful for spotting anomalies or other patterns not captured by the other two components.

# Speed Analysis

In [None]:
# Box-and-whisker summary of Speed, drawn as one box per alert type.
speed_fig, speed_ax = plt.subplots(figsize=(16, 8))
sns.boxplot(data=df, x='Alert', y='Speed', ax=speed_ax)
speed_ax.set_title('Distribution of Speed for Each Type of Alert')
speed_ax.set_xlabel('Alert Type')
speed_ax.set_ylabel('Speed')

speed_fig.tight_layout()
plt.show()


Speed Analysis Summary: Distribution of Speed for Each Type of Alert
The boxplot shows the distribution of speed for each type of alert. The central line in each box represents the median speed, while the top and bottom of the boxes represent the third and first quartiles, respectively. Each whisker extends to the most extreme data point that lies within 1.5 times the interquartile range of the box edge; points beyond the whiskers are drawn individually and could be considered outliers.

From the plot, we can observe:

- Variability in speed distribution across different types of alerts.
- Some types of alerts occur more frequently at higher speeds, while others seem to be more common at lower speeds.

In [None]:
# Pairwise correlation (pandas default method) between the numeric columns.
numeric_columns = ['Lat', 'Long', 'Vehicle', 'Speed']
correlation_matrix = df[numeric_columns].corr()

# Annotated heatmap of the matrix, values printed with two decimals.
correlation_fig, correlation_ax = plt.subplots(figsize=(10, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=correlation_ax)
correlation_ax.set_title('Correlation Matrix of Numerical Features')

correlation_fig.tight_layout()
plt.show()

### Correlation Analysis Summary
The heatmap shows the correlation matrix of the numerical features:

Latitude (Lat) and Longitude (Long): These two features have a correlation coefficient of 0.25, indicating a weak positive correlation. This is expected as latitude and longitude are geographical coordinates and usually don't have a strong correlation in a well-distributed dataset.

Vehicle ID (Vehicle) and Speed (Speed): The correlation between these two features is close to zero, indicating no significant relationship.

Latitude (Lat) and Speed (Speed): The correlation between these two features is also close to zero, suggesting no significant relationship.

Longitude (Long) and Speed (Speed): These two features have a correlation coefficient of -0.06, indicating a very weak negative correlation.

Overall, the numerical features in the dataset do not show strong correlations with each other, suggesting there are no strong linear relationships among them (a low correlation coefficient alone does not establish independence).

## Advanced Visualizations

In [None]:
import folium
from folium.plugins import HeatMap

# Keep only rows with usable coordinates. Missing values have to be excluded
# explicitly: `NaN != 0` evaluates to True, so the zero check alone lets NaN
# rows through, and folium's HeatMap refuses data that contains NaN.
has_valid_coords = (
    df['Lat'].notna() & df['Long'].notna()
    & (df['Lat'] != 0) & (df['Long'] != 0)
)
geo_data = df.loc[has_valid_coords, ['Lat', 'Long', 'Alert']]

# Base map centered on the mean latitude/longitude of the retained rows.
m = folium.Map(location=[geo_data['Lat'].mean(), geo_data['Long'].mean()], zoom_start=10)

# HeatMap takes a list of [lat, lon] pairs; build it in one vectorized step
# instead of iterating over the frame row by row.
heat_data = geo_data[['Lat', 'Long']].values.tolist()
HeatMap(heat_data).add_to(m)

# Write the interactive map to a standalone HTML file.
heatmap_file_path = 'heatmap.html'
m.save(heatmap_file_path)

heatmap_file_path


Open the saved `heatmap.html` file in a web browser to interact with the heatmap and explore the geospatial distribution of alerts.



## Interactive Plot for Time Series Data

In [None]:
! pip install plotly

In [None]:

import plotly.express as px

# Start again from a fresh read of the CSV (this discards columns that earlier
# cells added to `df`) and parse 'Date' into datetimes in the same step.
df = pd.read_csv(dataset_path).assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)

# Number of rows per distinct Date value, as a two-column table: Date, Count.
alerts_per_date = df.groupby('Date').size()
time_series_data = alerts_per_date.reset_index(name='Count')

# Interactive line chart of the per-date counts.
fig = px.line(
    time_series_data,
    x='Date',
    y='Count',
    title='Interactive Time Series Plot of Daily Alerts',
)

# Export the chart as a standalone HTML file and show the path as cell output.
interactive_plot_file_path = 'interactive_time_series_plot.html'
fig.write_html(interactive_plot_file_path)

interactive_plot_file_path


For the time series data, an interactive plot could allow you to:

- Zoom in and out to explore different time scales.
- Hover over data points to see exact values.
- Turn on/off different data series for better visualization.

In [None]:
# Per-type alert counts as a two-column table (Alert, Frequency).
alert_frequency_data = (
    df['Alert']
    .value_counts()
    .rename_axis('Alert')
    .reset_index(name='Frequency')
)

# Interactive bar chart: one bar per alert type.
fig = px.bar(
    alert_frequency_data,
    x='Alert',
    y='Frequency',
    title='Interactive Bar Plot of Alert Frequencies',
)

# Export the chart as a standalone HTML file and show the path as cell output.
interactive_bar_plot_file_path = 'interactive_bar_plot.html'
fig.write_html(interactive_bar_plot_file_path)

interactive_bar_plot_file_path

In [None]:
# Speed (x) against latitude (y), one marker per row, colored by alert type;
# longitude and vehicle id are added to the hover tooltip.
scatter_labels = {'Lat': 'Latitude', 'Speed': 'Speed', 'Alert': 'Alert Type'}
fig = px.scatter(
    df,
    x='Speed',
    y='Lat',
    color='Alert',
    hover_data=['Long', 'Vehicle'],
    title='Comprehensive Interactive Scatter Plot',
    labels=scatter_labels,
    color_continuous_scale='Viridis',
)

# Hovering reports the single point closest to the cursor.
fig.update_layout(hovermode='closest')

# Export the chart as a standalone HTML file and show the path as cell output.
interactive_scatter_plot_file_path = 'interactive_scatter_plot.html'
fig.write_html(interactive_scatter_plot_file_path)

interactive_scatter_plot_file_path


## 3D Scatter Plot

In [None]:
# 3-D scatter with latitude, longitude and speed on the x, y and z axes,
# colored by alert type.
axis_labels = {'Lat': 'Latitude', 'Long': 'Longitude', 'Speed': 'Speed', 'Alert': 'Alert Type'}
fig = px.scatter_3d(
    df,
    x='Lat',
    y='Long',
    z='Speed',
    color='Alert',
    title='Comprehensive 3D Scatter Plot: Speed vs Latitude vs Longitude',
    labels=axis_labels,
    color_continuous_scale='Viridis',
)

# Export the chart as a standalone HTML file and show the path as cell output.
interactive_3d_scatter_plot_file_path = 'interactive_3d_scatter_plot.html'
fig.write_html(interactive_3d_scatter_plot_file_path)

interactive_3d_scatter_plot_file_path


## Pair Plots

In [None]:
# Plot a reproducible 10% sample (fixed seed) so the grid stays quick to draw.
sample_data = df.sample(frac=0.1, random_state=1)

# Scatter panels for every pair of Speed / Lat / Long, KDE curves on the
# diagonal, everything colored by alert type.
pairplot_options = dict(
    hue='Alert',
    vars=['Speed', 'Lat', 'Long'],
    diag_kind='kde',
    plot_kws={'alpha': 0.5},
)
sns.pairplot(sample_data, **pairplot_options)
# y > 1 places the figure title just above the grid of panels.
plt.suptitle('Pair Plots: Pairwise Relationships Between Numerical Variables', y=1.02)
plt.show()


The pair plot shows pairwise relationships between the numerical variables Speed, Lat, and Long, colored by the type of Alert. Each small plot is a scatter plot of two variables, allowing us to see how they interact. The diagonal shows the Kernel Density Estimation (KDE) for each variable, broken down by alert type.

Please note that we used a random subset of the data for efficiency, so the plot is an approximation. Nonetheless, it provides a broad view of the relationships among these variables and how they differ for various alert types.

In [None]:
# Add the English weekday name of each row's Date as a new 'Weekday' column.
df['Weekday'] = df['Date'].dt.day_name()

# Without an explicit order seaborn lays the hue levels out in the order they
# are first encountered in the data, which scrambles the week. Use calendar
# order instead, restricted to the weekdays that actually occur so that no
# empty violin slot is drawn.
calendar_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekdays_present = set(df['Weekday'].dropna())
weekday_order = [day for day in calendar_order if day in weekdays_present]

# Violin plots of Speed per alert type, split by weekday; the inner lines mark
# the quartiles of each distribution.
plt.figure(figsize=(18, 10))
sns.violinplot(x='Alert', y='Speed', hue='Weekday', hue_order=weekday_order,
               data=df, inner='quartile', palette='pastel')
plt.title('Violin Plots: Speed Distribution by Alert Type and Weekday')
plt.xlabel('Alert Type')
plt.ylabel('Speed')

plt.tight_layout()
plt.show()


The violin plot provides an advanced visualization of the distribution of speed for each type of alert, broken down by weekdays. Each "violin" represents the distribution of speeds for one alert type on one weekday; the violins belonging to the same alert type are drawn side by side, and each color identifies a different weekday.

Here's how to interpret the plot:

Width of the Violin: A wider section represents a higher density of data points, meaning that speeds in that range are more common for that alert type and weekday.

Internal Lines: These represent quartiles of the data, providing a more detailed view of the distribution.

Colors: Each color represents a different weekday, allowing us to see if the distribution of speeds varies based on the day of the week.

In [None]:
# For every (Lat, Long, Alert) combination compute the mean Speed and the
# number of non-null Date entries, named Avg_Speed and Frequency respectively.
bubble_data = (
    df.groupby(['Lat', 'Long', 'Alert'])
    .agg(Avg_Speed=('Speed', 'mean'), Frequency=('Date', 'count'))
    .reset_index()
)

# Bubble chart: marker position from the coordinates, marker color from the
# average speed, marker size from the frequency; alert type shown on hover.
bubble_labels = {'Lat': 'Latitude', 'Long': 'Longitude', 'Avg_Speed': 'Average Speed', 'Frequency': 'Frequency'}
fig = px.scatter(
    bubble_data,
    x='Lat',
    y='Long',
    color='Avg_Speed',
    size='Frequency',
    hover_data=['Alert'],
    labels=bubble_labels,
    title='Interactive Bubble Plot: Frequency and Average Speed of Alerts by Location',
)

# Export the chart as a standalone HTML file and show the path as cell output.
interactive_bubble_plot_file_path = 'interactive_bubble_plot.html'
fig.write_html(interactive_bubble_plot_file_path)

interactive_bubble_plot_file_path


In [None]:
# Number of alerts for every (Date, Alert) combination; groupby returns the
# rows sorted by Date, so the animation frames play in chronological order.
animated_data = df.groupby(['Date', 'Alert']).size().reset_index(name='Count')

# Use plain strings as frame labels, the same conversion the animated map cell
# applies before passing 'Date' to animation_frame.
animated_data['Date'] = animated_data['Date'].astype(str)

# Plotly does not rescale the axes between frames, so pin both of them:
# a fixed y-range wide enough for the largest count, and a fixed ordering of
# the alert types on the x-axis. Otherwise points in later frames can fall
# outside the view chosen for the first frame.
count_axis_range = [0, animated_data['Count'].max() * 1.1]
alert_axis_order = sorted(animated_data['Alert'].unique())

fig = px.scatter(animated_data, x='Alert', y='Count', animation_frame='Date', size='Count',
                 range_y=count_axis_range,
                 category_orders={'Alert': alert_axis_order},
                 labels={'Alert': 'Alert Type', 'Count': 'Number of Alerts', 'Date': 'Date'},
                 title='Animated Scatter Plot: Change in Number of Alerts Over Time')

# Export the animation as a standalone HTML file and show the path as cell output.
animated_scatter_plot_file_path = 'animated_scatter_plot.html'
fig.write_html(animated_scatter_plot_file_path)

animated_scatter_plot_file_path


In [None]:
# Import necessary libraries for Plotly Express
import plotly.express as px

# Keep rows whose latitude and longitude are both non-zero, and turn 'Date'
# into strings so it can serve as the animation frame label. `assign` returns
# a new frame, so `df` itself is left untouched.
has_nonzero_coords = (df['Lat'] != 0) & (df['Long'] != 0)
map_data = df.loc[has_nonzero_coords].assign(
    Date=lambda frame: frame['Date'].astype(str)
)

# Animated geographic scatter: one frame per Date value, markers colored by
# alert type, drawn on a Natural Earth projection.
map_labels = {'Lat': 'Latitude', 'Long': 'Longitude', 'Alert': 'Alert Type'}
fig = px.scatter_geo(
    map_data,
    lat='Lat',
    lon='Long',
    color='Alert',
    animation_frame='Date',
    title='Animated Map: Alert Locations Over Time',
    labels=map_labels,
    color_continuous_scale=px.colors.sequential.Plasma,
    projection='natural earth',
)

# Export the animation as a standalone HTML file and show the path as cell output.
animated_map_file_path = 'animated_map.html'
fig.write_html(animated_map_file_path)

animated_map_file_path

In [None]:
# Sunburst chart of the hierarchy Vehicle -> Date -> Alert.
# A reproducible 5% sample (fixed seed) keeps the chart responsive; 'Date' is
# converted to strings so that it can be used as a hierarchy level.
sunburst_data = (
    df.sample(frac=0.05, random_state=1)
    .assign(Date=lambda frame: frame['Date'].astype(str))
)

# Sector sizes come from `values='Speed'`, which Plotly adds up over the rows
# in each sector. The label therefore says "Total Speed"; calling it an
# average (as before) misdescribed the number shown on hover.
fig = px.sunburst(sunburst_data, path=['Vehicle', 'Date', 'Alert'], values='Speed',
                  title='Sunburst Chart: Breakdown of Alert Types by Vehicle and Date',
                  labels={'Speed': 'Total Speed', 'Vehicle': 'Vehicle', 'Date': 'Date', 'Alert': 'Alert Type'})

# Export the chart as a standalone HTML file and show the path as cell output.
sunburst_chart_file_path = 'sunburst_chart.html'
fig.write_html(sunburst_chart_file_path)

sunburst_chart_file_path


In [None]:
# A density heatmap is used instead of a choropleth map because Plotly's
# choropleth requires GeoJSON boundaries. Each point is weighted by its Speed
# (z='Speed'), so the map highlights where alerts cluster and how fast the
# vehicles were.

# Drop rows without usable coordinates (missing or zero), as the other map
# cells do; otherwise placeholder points at 0 would be plotted and would also
# drag the map center away from the real data.
has_valid_coords = (
    df['Lat'].notna() & df['Long'].notna()
    & (df['Lat'] != 0) & (df['Long'] != 0)
)
density_data = df[has_valid_coords]

# "open-street-map" is a base map that needs no access token. The previous
# "stamen-terrain" tiles are now served by Stadia Maps and require an account,
# so the background no longer renders out of the box.
fig = px.density_mapbox(density_data, lat='Lat', lon='Long', z='Speed', radius=10,
                         center=dict(lat=density_data['Lat'].mean(), lon=density_data['Long'].mean()), zoom=3,
                         mapbox_style="open-street-map", title='Density Heatmap of Alerts')

# Export the map as a standalone HTML file and show the path as cell output.
density_heatmap_file_path = 'density_heatmap.html'
fig.write_html(density_heatmap_file_path)

density_heatmap_file_path


In [None]:
# Import Plotly Graph Objects
import plotly.graph_objects as go

# `surface_data_pivot` is not created anywhere earlier in the notebook, so the
# original cell fails with a NameError on a fresh kernel. Build it here: the
# mean Speed on a regular Latitude x Longitude grid.
# NOTE(review): the grid resolution below is a presentation choice, not
# something recovered from the original analysis; adjust as needed.
SURFACE_GRID_BINS = 50

# Use only rows with a speed and usable (non-missing, non-zero) coordinates.
surface_source = df.dropna(subset=['Lat', 'Long', 'Speed'])
surface_source = surface_source[(surface_source['Lat'] != 0) & (surface_source['Long'] != 0)]


def _bin_midpoints(values, bins):
    """Replace each value by the midpoint of the equal-width bin it falls into."""
    return pd.cut(values, bins=bins).map(lambda interval: interval.mid).astype(float)


# Rows = latitude bins, columns = longitude bins, cells = mean Speed.
# Grid cells without any observation stay NaN and show up as gaps in the surface.
surface_data_pivot = (
    surface_source
    .assign(Lat=_bin_midpoints(surface_source['Lat'], SURFACE_GRID_BINS),
            Long=_bin_midpoints(surface_source['Long'], SURFACE_GRID_BINS))
    .pivot_table(index='Lat', columns='Long', values='Speed', aggfunc='mean')
)

# go.Surface reads z row-wise along y and column-wise along x. With latitude on
# x and longitude on y, the pivot (latitude rows, longitude columns) therefore
# has to be transposed.
fig = go.Figure(data=[go.Surface(z=surface_data_pivot.values.T,
                                 x=surface_data_pivot.index,
                                 y=surface_data_pivot.columns)])

# Title and axis captions.
fig.update_layout(title='3D Surface Plot: Average Speed Across Different Locations',
                  scene=dict(xaxis_title='Latitude',
                             yaxis_title='Longitude',
                             zaxis_title='Average Speed'))

# Export the plot as a standalone HTML file and show the path as cell output.
surface_plot_file_path = 'surface_plot_with_go.html'
fig.write_html(surface_plot_file_path)

surface_plot_file_path
