# Install & Load Libraries

In [2]:
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 597, done.[K
remote: Counting objects: 100% (163/163), done.[K
remote: Compressing objects: 100% (81/81), done.[K
remote: Total 597 (delta 128), reused 82 (delta 82), pack-reused 434 (from 3)[K
Receiving objects: 100% (597/597), 196.59 KiB | 1.07 MiB/s, done.
Resolving deltas: 100% (302/302), done.
Installing RAPIDS remaining 25.04 libraries
Using Python 3.11.13 environment at: /usr
Resolved 175 packages in 1.75s
Downloading rmm-cu12 (1.5MiB)
Downloading ucx-py-cu12 (2.2MiB)
Downloading libcuspatial-cu12 (31.1MiB)
Downloading libcuvs-cu12 (1.1GiB)
Downloading pylibcudf-cu12 (26.4MiB)
Downloading nvidia-nvcomp-cu12 (44.1MiB)
Downloading librmm-cu12 (2.9MiB)
Downloading bokeh (6.6MiB)
Downloading dask (1.3MiB)
Downloading datashader (17.5MiB)
Downloading shapely (2.4MiB)
Downloading libcudf-cu12 (538.8MiB)
Downloading cucim-cu12 (5.6MiB)
Downloading libcugraph-cu12 (1.4GiB)
Downloading cuspatial-cu12 (4.1MiB)
Downloadi

In [3]:
! pip install kaggle --quiet

In [None]:
from google.colab import drive
drive.mount("/gdrive")

In [None]:
import cudf
import cuml
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


# Kaggle Authentication

In [10]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

mkdir: cannot create directory ‘/root/.kaggle’: File exists


# Data Extraction

In [11]:
!kaggle datasets download -d sobhanmoosavi/us-accidents

Dataset URL: https://www.kaggle.com/datasets/sobhanmoosavi/us-accidents
License(s): CC-BY-NC-SA-4.0
Downloading us-accidents.zip to /content
 93% 608M/653M [00:03<00:00, 216MB/s]
100% 653M/653M [00:06<00:00, 101MB/s]


In [12]:
!unzip us-accidents.zip

Archive:  us-accidents.zip
  inflating: US_Accidents_March23.csv  


In [13]:
import cudf

df = cudf.read_csv("/content/US_Accidents_March23.csv")

df.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 7728394 entries, 0 to 7728393
Data columns (total 46 columns):
 #   Column                 Dtype
---  ------                 -----
 0   ID                     object
 1   Source                 object
 2   Severity               int64
 3   Start_Time             object
 4   End_Time               object
 5   Start_Lat              float64
 6   Start_Lng              float64
 7   End_Lat                float64
 8   End_Lng                float64
 9   Distance(mi)           float64
 10  Description            object
 11  Street                 object
 12  City                   object
 13  County                 object
 14  State                  object
 15  Zipcode                object
 16  Country                object
 17  Timezone               object
 18  Airport_Code           object
 19  Weather_Timestamp      object
 20  Temperature(F)         float64
 21  Wind_Chill(F)          float64
 22  Humidity(%)            float64
 23  

In [None]:
import dask.dataframe as dd

df = dd.read_csv("/content/US_Accidents_March23.csv")

# df.info()
# df.isna().sum().compute()


In [None]:
from google.colab import drive
drive.mount("/content/drive")

# Sampling

In [None]:
# random sampling
import numpy as np
random_indexes = np.random.randint(0, len(df), size=4000)
df.iloc[:,random_indexes ].to_csv("accicent_sample.csv", index=False)

In [None]:
import cuml  # scikit-learn

# Data Cleaning

In [14]:
# this columns has same value  --->  -800 MB

np.unique(df["Country"])

df.drop(columns=["ID", "Country", "Description"], inplace=True)

In [15]:
# there is bool value so i chanege (True, False) to (1, 0) --->  - MB

for col in ["Junction", "No_Exit", "Railway", "Roundabout", "Station", "Stop", "Traffic_Calming", "Traffic_Signal", "Turning_Loop", "Amenity", "Severity", "Bump", "Crossing", "Give_Way"]:
  # df[col].unique()

  df[col] = df[col].astype(np.uint8)

In [16]:
# there is 2  value (day, night) so I change theme to (0 ,1)  ---> - MB


for col in ["Sunrise_Sunset", "Civil_Twilight", "Nautical_Twilight", "Astronomical_Twilight"]:
  # df[col].unique()

  df[col] = df[col].map({"Night": 0, "Day": 1}).astype(np.uint8)

In [17]:
df["Source"].unique()

# there is 3 value so I change str to a number--> - MB

df["Source"] = df["Source"].map({"Source1": 1, "Source2": 2, "Source3": 3}).astype(np.uint8)


In [18]:
# df["Wind_Chill(F)"].unique()
# df["Temperature(F)"].unique()
# df["Pressure(in)"].unique()
# df["Wind_Speed(mph)"].unique()

# these are coiumns that doesn"t necessary to have exact number

for col in ["Visibility(mi)", "Humidity(%)", "Wind_Chill(F)", "Temperature(F)", "Pressure(in)", "Wind_Speed(mph)", "Precipitation(in)"]:
    df[col] = df[col].astype(np.uint8)


# Scaling & Normalization

Test The Changes

In [None]:
df.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 7728394 entries, 0 to 7728393
Data columns (total 43 columns):
 #   Column                 Dtype
---  ------                 -----
 0   Source                 uint8
 1   Severity               uint8
 2   Start_Time             object
 3   End_Time               object
 4   Start_Lat              float64
 5   Start_Lng              float64
 6   End_Lat                float64
 7   End_Lng                float64
 8   Distance(mi)           float64
 9   Street                 object
 10  City                   object
 11  County                 object
 12  State                  object
 13  Zipcode                object
 14  Timezone               object
 15  Airport_Code           object
 16  Weather_Timestamp      object
 17  Temperature(F)         uint8
 18  Wind_Chill(F)          uint8
 19  Humidity(%)            uint8
 20  Pressure(in)           uint8
 21  Visibility(mi)         uint8
 22  Wind_Direction         object
 23  Wind_Spee

In [None]:
df


Unnamed: 0,Source,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,Distance(mi),Street,...,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight,percipitation
0,2,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,0.00000,0.00000,0.010,I-70 E,...,0,0,0,0,0,0,0,0,0,0
1,2,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,0.00000,0.00000,0.010,Brice Rd,...,0,0,0,0,0,0,0,0,1,0
2,2,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,0.00000,0.00000,0.010,State Route 32,...,0,0,0,1,0,0,0,1,1,0
3,2,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,0.00000,0.00000,0.010,I-75 S,...,0,0,0,0,0,0,1,1,1,0
4,2,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,0.00000,0.00000,0.010,Miamisburg Centerville Rd,...,0,0,0,1,0,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7728389,1,2,2019-08-23 18:03:25,2019-08-23 18:32:01,34.002480,-117.379360,33.99888,-117.37094,0.543,Pomona Fwy E,...,0,0,0,0,0,1,1,1,1,0
7728390,1,2,2019-08-23 19:11:30,2019-08-23 19:38:23,32.766960,-117.148060,32.76555,-117.15363,0.338,I-8 W,...,0,0,0,0,0,1,1,1,1,0
7728391,1,2,2019-08-23 19:00:21,2019-08-23 19:28:49,33.775450,-117.847790,33.77740,-117.85727,0.561,Garden Grove Fwy,...,0,0,0,0,0,1,1,1,1,0
7728392,1,2,2019-08-23 19:00:21,2019-08-23 19:29:42,33.992460,-118.403020,33.98311,-118.39565,0.772,San Diego Fwy S,...,0,0,0,0,0,1,1,1,1,0


In [None]:
import cupy as cp

df = df.fillna(0)

In [None]:
df.info()

<class 'cudf.core.dataframe.DataFrame'>
RangeIndex: 7728394 entries, 0 to 7728393
Data columns (total 43 columns):
 #   Column                 Dtype
---  ------                 -----
 0   Source                 uint8
 1   Severity               uint8
 2   Start_Time             object
 3   End_Time               object
 4   Start_Lat              float64
 5   Start_Lng              float64
 6   End_Lat                float64
 7   End_Lng                float64
 8   Distance(mi)           float64
 9   Street                 object
 10  City                   object
 11  County                 object
 12  State                  object
 13  Zipcode                object
 14  Timezone               object
 15  Airport_Code           object
 16  Weather_Timestamp      object
 17  Temperature(F)         uint8
 18  Wind_Chill(F)          uint8
 19  Humidity(%)            uint8
 20  Pressure(in)           uint8
 21  Visibility(mi)         uint8
 22  Wind_Direction         object
 23  Wind_Spee

In [None]:
# Group by City and calculate mean and standard deviation for Start_Lat and Start_Lng
city_stats = df.groupby("City")[["Start_Lat", "Start_Lng"]].agg(["mean", "std"])

display(city_stats)

Unnamed: 0_level_0,Start_Lat,Start_Lat,Start_Lng,Start_Lng
Unnamed: 0_level_1,mean,std,mean,std
City,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Inola,36.160964,0.023708146,-95.551356,0.06294939
Hainesport,39.979754,0.006270321,-74.831356,0.016804722
Poplar Branch,36.245804,0.058668243,-75.890691,0.036185105
Boone,39.487399,2.28977691,-96.013074,8.067883583
Hathaway,46.272897,0.002356101,-106.208783,0.010980705
...,...,...,...,...
Chattaroy,47.922424,0.031798158,-117.344154,0.029775513
La Vista,41.183684,0.005890104,-96.076117,0.03236607
Nowata,36.689689,0.038277218,-95.646259,0.02931287
Linden,40.450438,1.786726721,-85.847359,17.32118585


In [None]:
df["z_score_lat"] = (df["Start_Lat"] - df["Start_Lat"].mean()) / df["Start_Lat"].std()
df["z_score_lng"] = (df["Start_Lng"] - df["Start_Lng"].mean()) / df["Start_Lng"].std()

# threshold
threshold = 2.3


# find outliners
outliers_df = df[((df["z_score_lat"] > threshold) | (df["z_score_lat"] < -threshold) | (df["z_score_lng"] > threshold) | (df["z_score_lng"] < -threshold))]


# Group by City to see outliers per city
outliers_by_city = outliers_df.groupby("City").size().reset_index(name="outlier_count")

outliers_by_city

Unnamed: 0,City,outlier_count
0,Crary,5
1,Fort Benton,2
2,Athol,21
3,Mill Creek,3
4,Cut Bank,32
...,...,...
297,Metaline Falls,1
298,Chewelah,2
299,Wolf Point,47
300,Ray,8


# Show Plot

In [None]:

# Sort the cities by outlier count in descending order
outliers_by_city_sorted = outliers_by_city.sort_values(by="outlier_count", ascending=False)

top_n = 100
fig = px.bar(outliers_by_city_sorted.head(top_n), x="City", y="outlier_count",
             title=f"Top {top_n} Cities with the Most Outliers")
fig.show()

In [None]:

if not outliers_df.empty:
    fig = px.scatter_mapbox(outliers_df,
                            lat="Start_Lat",
                            lon="Start_Lng",
                            hover_name="City",
                            hover_data=["Severity", "Start_Time"],
                            title="Outlier Accident Locations",
                            zoom=3,
                            height=500)

    fig.update_layout(mapbox_style="open-street-map")
    fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
    fig.show()
else:
    print("No outliers to display on the map.")