## Loading Data

In [69]:
from pathlib import Path
import csv

p = Path("../../data/traffic-violations.csv")

rows = []
data_section = False

with open(p, "r", encoding="utf-8", errors="ignore") as f:
    for line in f:
        line = line.strip()

        
        if not line:
            continue

        
        if line.startswith("%"):
            continue

        
        if line.lower().startswith("@data"):
            data_section = True
            continue

        
        if not data_section:
            continue

        
        rows.append(line)

print("Total data rows:", len(rows))
print("Example rows:", rows[:5])


Total data rows: 70340
Example rows: ["'DISPLAYING EXPIRED REGISTRATION PLATE ISSUED BY ANY STATE',No,No,No,No,No,NC,'02 - Automobile',2013,HYUNDAI,SONATA,GRAY,13411f,No,WHITE,F,ASHEVILLE,NC,NC,'A - Marked Patrol',Citation", "'DRIVER FAIL TO STOP AT RED TRAFFIC SIGNAL BEFORE RIGHT TURN',No,No,No,No,No,MD,'02 - Automobile',2015,FORD,FUSION,SILVER,21202i1,No,OTHER,M,'SILVER SPRING',MD,MD,'A - Marked Patrol',Citation", "'DRIVING UNDER THE INFLUENCE OF ALCOHOL PER SE',No,No,No,No,No,MD,'02 - Automobile',2000,TOYOTA,CAMRY,BLACK,21902a2,No,BLACK,M,'SILVER SPRING',MD,MD,'B - Unmarked Patrol',Citation", "'PERSON DRIVING MOTOR VEHICLE ON HIGHWAY OR PUBLIC USE PROPERTY ON SUSPENDED LICENSE AND PRIVILEGE',No,No,No,No,No,MD,'02 - Automobile',2012,HOND,CROSSTOUR,BLACK,16303c,No,BLACK,M,COLUMBIA,MD,MD,'A - Marked Patrol',Citation", "'DISPLAYING EXPIRED REGISTRATION PLATE ISSUED BY ANY STATE',No,No,No,Yes,No,MD,'02 - Automobile',2010,FORD,F250,BLACK,13411f,No,WHITE,M,'MOUNT AIRY',MD,MD,'A - Marked Pa

In [70]:
parsed = [row.split(",") for row in rows]

print("Columns per row:", len(parsed[0]))
print("First parsed row:", parsed[0])


Columns per row: 21
First parsed row: ["'DISPLAYING EXPIRED REGISTRATION PLATE ISSUED BY ANY STATE'", 'No', 'No', 'No', 'No', 'No', 'NC', "'02 - Automobile'", '2013', 'HYUNDAI', 'SONATA', 'GRAY', '13411f', 'No', 'WHITE', 'F', 'ASHEVILLE', 'NC', 'NC', "'A - Marked Patrol'", 'Citation']


In [71]:
import pandas as pd

n_cols = len(parsed[0])
colnames = [f"col_{i}" for i in range(n_cols)]

df = pd.DataFrame(parsed, columns=colnames)

print(df.shape)
df.head()


(70340, 21)


Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20
0,'DISPLAYING EXPIRED REGISTRATION PLATE ISSUED ...,No,No,No,No,No,NC,'02 - Automobile',2013,HYUNDAI,...,GRAY,13411f,No,WHITE,F,ASHEVILLE,NC,NC,'A - Marked Patrol',Citation
1,'DRIVER FAIL TO STOP AT RED TRAFFIC SIGNAL BEF...,No,No,No,No,No,MD,'02 - Automobile',2015,FORD,...,SILVER,21202i1,No,OTHER,M,'SILVER SPRING',MD,MD,'A - Marked Patrol',Citation
2,'DRIVING UNDER THE INFLUENCE OF ALCOHOL PER SE',No,No,No,No,No,MD,'02 - Automobile',2000,TOYOTA,...,BLACK,21902a2,No,BLACK,M,'SILVER SPRING',MD,MD,'B - Unmarked Patrol',Citation
3,'PERSON DRIVING MOTOR VEHICLE ON HIGHWAY OR PU...,No,No,No,No,No,MD,'02 - Automobile',2012,HOND,...,BLACK,16303c,No,BLACK,M,COLUMBIA,MD,MD,'A - Marked Patrol',Citation
4,'DISPLAYING EXPIRED REGISTRATION PLATE ISSUED ...,No,No,No,Yes,No,MD,'02 - Automobile',2010,FORD,...,BLACK,13411f,No,WHITE,M,'MOUNT AIRY',MD,MD,'A - Marked Patrol',Citation


In [72]:
df.columns = [
    "Description",          
    "Belts",         
    "Personal.Injury",    
    "Property.Damage",       
    "Commercial.License",        
    "Commercial.Vehicle",
    "State",          
    "VehicleType",         
    "Year",    
    "Make",       
    "Model",        
    "Color",
    "Charge",
    "Contributed.To.Accident ",
    "Race",
    "Gender",      
    "Driver.City",
    "Driver.State",
    "DL.State",
    "Arrest.Type",
    "Violation.Type"
]
print(df.head())

                                         Description Belts Personal.Injury  \
0  'DISPLAYING EXPIRED REGISTRATION PLATE ISSUED ...    No              No   
1  'DRIVER FAIL TO STOP AT RED TRAFFIC SIGNAL BEF...    No              No   
2    'DRIVING UNDER THE INFLUENCE OF ALCOHOL PER SE'    No              No   
3  'PERSON DRIVING MOTOR VEHICLE ON HIGHWAY OR PU...    No              No   
4  'DISPLAYING EXPIRED REGISTRATION PLATE ISSUED ...    No              No   

  Property.Damage Commercial.License Commercial.Vehicle State  \
0              No                 No                 No    NC   
1              No                 No                 No    MD   
2              No                 No                 No    MD   
3              No                 No                 No    MD   
4              No                Yes                 No    MD   

         VehicleType  Year     Make  ...   Color   Charge  \
0  '02 - Automobile'  2013  HYUNDAI  ...    GRAY   13411f   
1  '02 - Automobil

In [73]:
# convert numeric columns
df["Year"] = pd.to_numeric(df["Year"], errors="coerce")
df['ID'] = range(len(df))

df["Year"] = df["Year"].fillna(0)


In [74]:
df.dtypes

Description                  object
Belts                        object
Personal.Injury              object
Property.Damage              object
Commercial.License           object
Commercial.Vehicle           object
State                        object
VehicleType                  object
Year                        float64
Make                         object
Model                        object
Color                        object
Charge                       object
Contributed.To.Accident      object
Race                         object
Gender                       object
Driver.City                  object
Driver.State                 object
DL.State                     object
Arrest.Type                  object
Violation.Type               object
ID                            int64
dtype: object

In [75]:
cat_cols = df.select_dtypes(include="object").columns
num_cols = df.select_dtypes(include=["int64","float64"]).columns

cat_cols, num_cols

(Index(['Description', 'Belts', 'Personal.Injury', 'Property.Damage',
        'Commercial.License', 'Commercial.Vehicle', 'State', 'VehicleType',
        'Make', 'Model', 'Color', 'Charge', 'Contributed.To.Accident ', 'Race',
        'Gender', 'Driver.City', 'Driver.State', 'DL.State', 'Arrest.Type',
        'Violation.Type'],
       dtype='object'),
 Index(['Year', 'ID'], dtype='object'))