## Loading Data

In [291]:
from pathlib import Path
import csv
import numpy as np

p = Path("../../data/traffic-violations.csv")

rows = []
data_section = False

with open(p, "r", encoding="utf-8", errors="ignore") as f:
    for line in f:
        line = line.strip()

        
        if not line:
            continue

        
        if line.startswith("%"):
            continue

        
        if line.lower().startswith("@data"):
            data_section = True
            continue

        
        if not data_section:
            continue

        
        rows.append(line)

print("Total data rows:", len(rows))
print("Example rows:", rows[:5])


Total data rows: 70340
Example rows: ["'DISPLAYING EXPIRED REGISTRATION PLATE ISSUED BY ANY STATE',No,No,No,No,No,NC,'02 - Automobile',2013,HYUNDAI,SONATA,GRAY,13411f,No,WHITE,F,ASHEVILLE,NC,NC,'A - Marked Patrol',Citation", "'DRIVER FAIL TO STOP AT RED TRAFFIC SIGNAL BEFORE RIGHT TURN',No,No,No,No,No,MD,'02 - Automobile',2015,FORD,FUSION,SILVER,21202i1,No,OTHER,M,'SILVER SPRING',MD,MD,'A - Marked Patrol',Citation", "'DRIVING UNDER THE INFLUENCE OF ALCOHOL PER SE',No,No,No,No,No,MD,'02 - Automobile',2000,TOYOTA,CAMRY,BLACK,21902a2,No,BLACK,M,'SILVER SPRING',MD,MD,'B - Unmarked Patrol',Citation", "'PERSON DRIVING MOTOR VEHICLE ON HIGHWAY OR PUBLIC USE PROPERTY ON SUSPENDED LICENSE AND PRIVILEGE',No,No,No,No,No,MD,'02 - Automobile',2012,HOND,CROSSTOUR,BLACK,16303c,No,BLACK,M,COLUMBIA,MD,MD,'A - Marked Patrol',Citation", "'DISPLAYING EXPIRED REGISTRATION PLATE ISSUED BY ANY STATE',No,No,No,Yes,No,MD,'02 - Automobile',2010,FORD,F250,BLACK,13411f,No,WHITE,M,'MOUNT AIRY',MD,MD,'A - Marked Pa

In [292]:
parsed = [row.split(",") for row in rows]

print("Columns per row:", len(parsed[0]))
print("First parsed row:", parsed[0])


Columns per row: 21
First parsed row: ["'DISPLAYING EXPIRED REGISTRATION PLATE ISSUED BY ANY STATE'", 'No', 'No', 'No', 'No', 'No', 'NC', "'02 - Automobile'", '2013', 'HYUNDAI', 'SONATA', 'GRAY', '13411f', 'No', 'WHITE', 'F', 'ASHEVILLE', 'NC', 'NC', "'A - Marked Patrol'", 'Citation']


In [293]:
import pandas as pd

n_cols = len(parsed[0])
colnames = [f"col_{i}" for i in range(n_cols)]

df = pd.DataFrame(parsed, columns=colnames)

print(df.shape)
df.head()


(70340, 21)


Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,...,col_11,col_12,col_13,col_14,col_15,col_16,col_17,col_18,col_19,col_20
0,'DISPLAYING EXPIRED REGISTRATION PLATE ISSUED ...,No,No,No,No,No,NC,'02 - Automobile',2013,HYUNDAI,...,GRAY,13411f,No,WHITE,F,ASHEVILLE,NC,NC,'A - Marked Patrol',Citation
1,'DRIVER FAIL TO STOP AT RED TRAFFIC SIGNAL BEF...,No,No,No,No,No,MD,'02 - Automobile',2015,FORD,...,SILVER,21202i1,No,OTHER,M,'SILVER SPRING',MD,MD,'A - Marked Patrol',Citation
2,'DRIVING UNDER THE INFLUENCE OF ALCOHOL PER SE',No,No,No,No,No,MD,'02 - Automobile',2000,TOYOTA,...,BLACK,21902a2,No,BLACK,M,'SILVER SPRING',MD,MD,'B - Unmarked Patrol',Citation
3,'PERSON DRIVING MOTOR VEHICLE ON HIGHWAY OR PU...,No,No,No,No,No,MD,'02 - Automobile',2012,HOND,...,BLACK,16303c,No,BLACK,M,COLUMBIA,MD,MD,'A - Marked Patrol',Citation
4,'DISPLAYING EXPIRED REGISTRATION PLATE ISSUED ...,No,No,No,Yes,No,MD,'02 - Automobile',2010,FORD,...,BLACK,13411f,No,WHITE,M,'MOUNT AIRY',MD,MD,'A - Marked Patrol',Citation


In [294]:
df.columns = [
    "Description",          
    "Belts",         
    "Personal.Injury",    
    "Property.Damage",       
    "Commercial.License",        
    "Commercial.Vehicle",
    "State",          
    "VehicleType",         
    "Year",    
    "Make",       
    "Model",        
    "Color",
    "Charge",
    "Contributed.To.Accident ",
    "Race",
    "Gender",      
    "Driver.City",
    "Driver.State",
    "DL.State",
    "Arrest.Type",
    "Violation.Type"
]
print(df.head())

                                         Description Belts Personal.Injury  \
0  'DISPLAYING EXPIRED REGISTRATION PLATE ISSUED ...    No              No   
1  'DRIVER FAIL TO STOP AT RED TRAFFIC SIGNAL BEF...    No              No   
2    'DRIVING UNDER THE INFLUENCE OF ALCOHOL PER SE'    No              No   
3  'PERSON DRIVING MOTOR VEHICLE ON HIGHWAY OR PU...    No              No   
4  'DISPLAYING EXPIRED REGISTRATION PLATE ISSUED ...    No              No   

  Property.Damage Commercial.License Commercial.Vehicle State  \
0              No                 No                 No    NC   
1              No                 No                 No    MD   
2              No                 No                 No    MD   
3              No                 No                 No    MD   
4              No                Yes                 No    MD   

         VehicleType  Year     Make  ...   Color   Charge  \
0  '02 - Automobile'  2013  HYUNDAI  ...    GRAY   13411f   
1  '02 - Automobil

In [295]:
# convert numeric columns
df["Year"] = pd.to_numeric(df["Year"], errors="coerce")
df['ID'] = range(len(df))

df["Year"] = df["Year"].fillna(0)


In [296]:
df.isna().sum()

Description                 0
Belts                       0
Personal.Injury             0
Property.Damage             0
Commercial.License          0
Commercial.Vehicle          0
State                       0
VehicleType                 0
Year                        0
Make                        0
Model                       0
Color                       0
Charge                      0
Contributed.To.Accident     0
Race                        0
Gender                      0
Driver.City                 0
Driver.State                0
DL.State                    0
Arrest.Type                 0
Violation.Type              0
ID                          0
dtype: int64

In [297]:
(df == "?").sum()

Description                   0
Belts                         0
Personal.Injury               0
Property.Damage               0
Commercial.License            0
Commercial.Vehicle            0
State                         3
VehicleType                   0
Year                          0
Make                        448
Model                       455
Color                       888
Charge                        0
Contributed.To.Accident       0
Race                          0
Gender                        0
Driver.City                   8
Driver.State                  0
DL.State                     52
Arrest.Type                   0
Violation.Type                0
ID                            0
dtype: int64

In [298]:
df = df.replace("?", np.nan)

In [299]:
df.dtypes

Description                  object
Belts                        object
Personal.Injury              object
Property.Damage              object
Commercial.License           object
Commercial.Vehicle           object
State                        object
VehicleType                  object
Year                        float64
Make                         object
Model                        object
Color                        object
Charge                       object
Contributed.To.Accident      object
Race                         object
Gender                       object
Driver.City                  object
Driver.State                 object
DL.State                     object
Arrest.Type                  object
Violation.Type               object
ID                            int64
dtype: object

In [300]:
from sklearn.preprocessing import LabelEncoder

# Define target column
target_name = "Violation.Type"

# Encode the target labels
le = LabelEncoder()
y = le.fit_transform(df[target_name])

print("Encoded target classes:")
for idx, cls in enumerate(le.classes_):
    print(f"{cls} → {idx}")

# Define feature matrix X
X = df.drop(columns=[target_name])

print(f"\nX shape: {X.shape}, y shape: {y.shape}")
X.head()

Encoded target classes:
Citation → 0
SERO → 1

X shape: (70340, 21), y shape: (70340,)


Unnamed: 0,Description,Belts,Personal.Injury,Property.Damage,Commercial.License,Commercial.Vehicle,State,VehicleType,Year,Make,...,Color,Charge,Contributed.To.Accident,Race,Gender,Driver.City,Driver.State,DL.State,Arrest.Type,ID
0,'DISPLAYING EXPIRED REGISTRATION PLATE ISSUED ...,No,No,No,No,No,NC,'02 - Automobile',2013.0,HYUNDAI,...,GRAY,13411f,No,WHITE,F,ASHEVILLE,NC,NC,'A - Marked Patrol',0
1,'DRIVER FAIL TO STOP AT RED TRAFFIC SIGNAL BEF...,No,No,No,No,No,MD,'02 - Automobile',2015.0,FORD,...,SILVER,21202i1,No,OTHER,M,'SILVER SPRING',MD,MD,'A - Marked Patrol',1
2,'DRIVING UNDER THE INFLUENCE OF ALCOHOL PER SE',No,No,No,No,No,MD,'02 - Automobile',2000.0,TOYOTA,...,BLACK,21902a2,No,BLACK,M,'SILVER SPRING',MD,MD,'B - Unmarked Patrol',2
3,'PERSON DRIVING MOTOR VEHICLE ON HIGHWAY OR PU...,No,No,No,No,No,MD,'02 - Automobile',2012.0,HOND,...,BLACK,16303c,No,BLACK,M,COLUMBIA,MD,MD,'A - Marked Patrol',3
4,'DISPLAYING EXPIRED REGISTRATION PLATE ISSUED ...,No,No,No,Yes,No,MD,'02 - Automobile',2010.0,FORD,...,BLACK,13411f,No,WHITE,M,'MOUNT AIRY',MD,MD,'A - Marked Patrol',4


In [301]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- Create the Solution Set ---
# This matches the 'loan-10k.sol.ex.csv' format
df_sol = pd.DataFrame({
    'ID': X_test['ID'],
    'Violation.Type': y_test
})

# We'll keep X_train and X_test as DataFrames for now
# This makes it easy to keep the 'ID' column separate
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape:  {X_test.shape}")
print(f"df_sol shape:  {df_sol.shape}")

X_train shape: (56272, 21)
X_test shape:  (14068, 21)
df_sol shape:  (14068, 2)


## Preprocessing

In [302]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd

class IQRWinsorizer(BaseEstimator, TransformerMixin):
    def __init__(self, factor=1.5):
        self.factor = factor
        self.caps_ = {}

    def fit(self, X, y=None):
        X = pd.DataFrame(X).copy()
        for col in X.columns:
            if pd.api.types.is_numeric_dtype(X[col]):
                q1 = X[col].quantile(0.25)
                q3 = X[col].quantile(0.75)
                iqr = q3 - q1
                lower = q1 - self.factor * iqr
                upper = q3 + self.factor * iqr
                self.caps_[col] = (lower, upper)
        return self

    def transform(self, X):
        X = pd.DataFrame(X).copy()
        for col, (lower, upper) in self.caps_.items():
            if col in X.columns:
                X[col] = np.clip(X[col], lower, upper)
        return X

In [303]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler # Changed to RobustScaler
from sklearn.impute import SimpleImputer


# Identify categorical and numerical columns, excluding 'ID'
cat_cols = X_train.select_dtypes(include="object").columns.tolist()
num_cols = X_train.select_dtypes(include=np.number).columns.drop("ID").tolist()

print(f"Categorical columns: {cat_cols}")
print(f"Numeric columns: {num_cols}")

# --- Define the pipelines (removed Imputers, as we used fillna(0)) ---
numeric_transformer = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("winsor", IQRWinsorizer(factor=1.5)),
    ("scaler", RobustScaler())
    
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="Unknown")),
    ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

# --- Create the main preprocessor ---
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols)
], remainder="drop")

Categorical columns: ['Description', 'Belts', 'Personal.Injury', 'Property.Damage', 'Commercial.License', 'Commercial.Vehicle', 'State', 'VehicleType', 'Make', 'Model', 'Color', 'Charge', 'Contributed.To.Accident ', 'Race', 'Gender', 'Driver.City', 'Driver.State', 'DL.State', 'Arrest.Type']
Numeric columns: ['Year']


In [304]:
# Separate the 'ID' columns before processing
X_train_ids = X_train['ID']
X_test_ids = X_test['ID']

# Drop 'ID' from the feature sets so it's not processed
X_train_features = X_train.drop(columns=['ID'])
X_test_features = X_test.drop(columns=['ID'])

# Fit on X_train and transform X_train
print("Fitting preprocessor on training data...")
X_train_processed = preprocessor.fit_transform(X_train_features)

# Only transform X_test (uses stats from X_train)
print("Transforming test data...")
X_test_processed = preprocessor.transform(X_test_features)

# --- Debug: Check the shapes ---
print(f"Processed X_train shape: {X_train_processed.shape}")
print(f"Processed X_test shape: {X_test_processed.shape}")

Fitting preprocessor on training data...
Transforming test data...
Processed X_train shape: (56272, 8579)
Processed X_test shape: (14068, 8579)


In [305]:
# Get the new feature names from the preprocessor
cat_feature_names = preprocessor.named_transformers_['cat']['encoder'].get_feature_names_out(cat_cols)
all_feature_names = num_cols + list(cat_feature_names) # num_cols is already a list

print(f"Total features names generated: {len(all_feature_names)}")

# --- Create the preprocessed Train DataFrame ---
# No .toarray() needed because sparse_output=False
train_set = pd.DataFrame(X_train_processed, columns=all_feature_names)

# Add the ID and Target columns
# .values is critical here to align the new 0-N index
train_set['ID'] = X_train_ids.values
train_set['Violation.Type'] = y_train

# Reorder to put ID first and Target last
train_set = train_set[['ID'] + [col for col in train_set.columns if col not in ['ID', 'Violation.Type']] + ['Violation.Type']]


# --- Create the preprocessed Test DataFrame ---
test_set = pd.DataFrame(X_test_processed, columns=all_feature_names)

# Add the ID column
test_set['ID'] = X_test_ids.values

# Reorder to put ID first
test_set = test_set[['ID'] + [col for col in test_set.columns if col != 'ID']]


print("\nPreprocessed Training Set:")
print(train_set.head())
print("\nPreprocessed Test Set:")
print(test_set.head())

Total features names generated: 8579

Preprocessed Training Set:
      ID      Year  Description_'1 0F 3 REAR CLEARANCE LIGHTS INOPERATIVE'  \
0  35171 -0.555556                                                0.0        
1  37687 -0.888889                                                0.0        
2  44917  0.777778                                                0.0        
3  69723 -0.333333                                                0.0        
4   1543 -0.777778                                                0.0        

   Description_'1 OF 2 REVERSE LIGHTS2 OF 3 REAR ID LIGHTS INOPERATIVE'  \
0                                                0.0                      
1                                                0.0                      
2                                                0.0                      
3                                                0.0                      
4                                                0.0                      

   Description_

In [307]:
# Define output paths
output_path_train = "../../data/processed/traffic-violations-preprocessed-train.csv"
output_path_test= "../../data/processed/traffic-violations-preprocessed-test.csv"
output_path_sol= "../../data/processed/traffic-violations-preprocessed-sol.csv"

# Save the files
train_set.to_csv(output_path_train, index=False)
test_set.to_csv(output_path_test, index=False)
df_sol.to_csv(output_path_sol, index=False)

print(f"\nSuccessfully saved preprocessed files:")
print(f"Train: {output_path_train}")
print(f"Test:  {output_path_test}")
print(f"Sol:   {output_path_sol}")


Successfully saved preprocessed files:
Train: ../../data/processed/traffic-violations-preprocessed-train.csv
Test:  ../../data/processed/traffic-violations-preprocessed-test.csv
Sol:   ../../data/processed/traffic-violations-preprocessed-sol.csv
