In [7]:
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sqlalchemy import create_engine

# Connection settings for the PostgreSQL database that holds the accident data.
# Each value can be overridden with an environment variable so the notebook can
# run on another machine without editing this cell; the defaults are the values
# this notebook was originally written against.
db_user = os.environ.get("CRASH_DB_USER", "deepakachyutha")
db_host = os.environ.get("CRASH_DB_HOST", "localhost")
db_port = os.environ.get("CRASH_DB_PORT", "5432")
db_name = os.environ.get("CRASH_DB_NAME", "uscrashdata")

# The URL carries no password.
# NOTE(review): presumably the local server authenticates this user without one
# (e.g. trust/peer auth or ~/.pgpass) -- confirm before pointing at a remote host.
connection_str = f"postgresql://{db_user}@{db_host}:{db_port}/{db_name}"
engine = create_engine(connection_str)


In [None]:
from sklearn.preprocessing import LabelEncoder


# Small demonstration of label encoding: pull a 10,000-row slice of the weather
# column and map every distinct condition to an integer code.
dfw = pd.read_sql(
    'SELECT "Weather_Condition" FROM processed_accidents LIMIT 10000;',
    engine,
)

# Values are cast to str before fitting so the encoder only ever sees strings.
le = LabelEncoder()
dfw['Weather_Encoded'] = le.fit_transform(dfw['Weather_Condition'].astype(str))

# Show the original label next to its code for the first few rows.
print(dfw.loc[:, ['Weather_Condition', 'Weather_Encoded']].head())

  Weather_Condition  Weather_Encoded
0     Partly Cloudy               19
1     Mostly Cloudy               17
2              Fair                3
3              Fair                3
4     Mostly Cloudy               17


In [22]:
#Considering 3 random samples from the database and evaluating the model performance on each sample
def get_data(condition, limit = 100000):
    """Fetch a random sample of accident rows and prepare them for modelling.

    Parameters
    ----------
    condition : str
        SQL boolean expression placed verbatim after ``WHERE``.
    limit : int, default 100000
        Value placed in the ``LIMIT`` clause (maximum number of rows returned).

    Returns
    -------
    pandas.DataFrame
        Columns ``Severity``, ``TemperatureF``, ``Visibilitymi``,
        ``Weather_Condition``, ``Hour``, ``Month`` and ``Weekday``.
        Missing temperature/visibility values are replaced by the median of
        the fetched sample, and ``Weather_Condition`` holds integer label
        codes. An empty frame is returned untouched when no row matches.

    Notes
    -----
    * Rows are drawn with ``ORDER BY random()``, so every call returns a
      different sample.
    * The imputer and the label encoder are fitted on each fetched sample, so
      weather codes are not comparable between two calls.
    """
    print(f"Fetching data with condition: {condition} and limit: {limit}")
    # NOTE: `condition` and `limit` are interpolated straight into the SQL text,
    # so they must come from trusted notebook code, never from external input.
    query = f"""
    SELECT 
        "Severity", 
        "TemperatureF", 
        "Visibilitymi", 
        "Weather_Condition",
        EXTRACT(HOUR FROM "Start_Time"::timestamp) as "Hour",
        EXTRACT(MONTH FROM "Start_Time"::timestamp) as "Month",
        EXTRACT(ISODOW FROM "Start_Time"::timestamp) as "Weekday"

    from processed_accidents
    WHERE {condition}
    order by random()
    LIMIT {limit};
    """

    df = pd.read_sql_query(query, engine)

    if df.empty:
        # Nothing matched: the imputer cannot be fitted on zero rows, so hand
        # the empty frame back and let the caller decide what to do with it.
        return df

    # Fill gaps in the numeric weather readings with the sample median.
    imputer = SimpleImputer(strategy='median')
    df[['TemperatureF', 'Visibilitymi']] = imputer.fit_transform(df[['TemperatureF', 'Visibilitymi']])

    # Cast to str BEFORE encoding: a missing condition then becomes an ordinary
    # category instead of a non-string value the encoder cannot sort, and the
    # column ends up holding integer codes rather than stringified integers.
    label_encoder = LabelEncoder()
    df['Weather_Condition'] = label_encoder.fit_transform(df['Weather_Condition'].astype(str))

    return df

def train_and_evaluate(df, min_rows=10000, test_size=0.2, random_state=42,
                       n_estimators=100, max_depth=10):
    """Train a random forest on ``df`` and return its hold-out accuracy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Severity`` column (the target); every other column is
        used as a feature.
    min_rows : int, default 10000
        Minimum number of rows required before a model is trained.
    test_size : float, default 0.2
        Fraction of rows held out for evaluation.
    random_state : int, default 42
        Seed used for both the train/test split and the forest.
    n_estimators : int, default 100
        Number of trees in the forest.
    max_depth : int, default 10
        Maximum depth of each tree.

    Returns
    -------
    float or int
        Accuracy on the held-out rows, or ``0`` when ``df`` has fewer than
        ``min_rows`` rows (a message is printed and no model is trained).
    """
    if len(df) < min_rows:
        print("Not enough data to train the model.")
        # Sentinel kept for backward compatibility: callers format and
        # aggregate the return value as a number.
        return 0

    features = df.drop(columns='Severity')
    target = df['Severity']

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    model = RandomForestClassifier(
        n_estimators=n_estimators, max_depth=max_depth, random_state=random_state
    )
    model.fit(X_train, y_train)

    # Accuracy = share of held-out rows whose predicted severity is correct.
    return accuracy_score(y_test, model.predict(X_test))


In [23]:
print("Experimenting with different data samples...")
print("Experiment 1: Random Samples stability test")
print("Hypothesis: Model accuracy should be stable across different random samples of the same size. If accuracy has high variance, it indicates sensitivity to data selection, suggesting the current dataset may not be representative enough.")

# Draw three independent random samples of 100,000 rows (the condition "1=1"
# matches every row) and record the hold-out accuracy obtained on each.
results_random = []
for i in (1, 2, 3):
    df = get_data(condition="1=1", limit=100000)
    accuracy = train_and_evaluate(df)
    results_random += [accuracy]

# A variance below 0.001 across the three runs is treated as "stable".
variance = np.var(results_random)
print(f"Random Samples Accuracies: {results_random}")
print(f"Random Samples Accuracy Variance: {variance:.6f}\n")
print(
    "Result: PASS - Model accuracy is stable across random samples.\n"
    if variance < 0.001
    else "Result: FAIL - Model accuracy varies significantly across random samples.\n"
)



Experimenting with different data samples...
Experiment 1: Random Samples stability test
Hypothesis: Model accuracy should be stable across different random samples of the same size. If accuracy has high variance, it indicates sensitivity to data selection, suggesting the current dataset may not be representative enough.
Fetching data with condition: 1=1 and limit: 100000
Fetching data with condition: 1=1 and limit: 100000
Fetching data with condition: 1=1 and limit: 100000
Random Samples Accuracies: [0.7863, 0.78495, 0.79045]
Random Samples Accuracy Variance: 0.000005

Result: PASS - Model accuracy is stable across random samples.



Experiment 1 conclusion: After conducting random sampling three times, the variance in accuracy between the random samples is negligible.

Accuracy variance observed across runs, expressed as a percentage: [0.0015%, 0.0005%]


In [None]:
# The COVID-19 pandemic and its lockdowns fall inside the 2016-2023 span of the
# data, so accident patterns may have changed over time. Test the temporal
# stability of the model by training on pre-, during- and post-COVID samples.
print("Experiment 2: Temporal Stability Test")
print("Hypothesis: Model accuracy should be consistent across different time periods. Significant accuracy differences may indicate temporal shifts in accident patterns, suggesting the need for time-aware modeling approaches.")

# Pre-COVID: accidents that started before 2020.
df_precovid = get_data(condition="extract(year from \"Start_Time\"::timestamp) < 2020")
accuracy_precovid = train_and_evaluate(df_precovid)
print(f"Pre-COVID Accuracy: {accuracy_precovid:.4f}")

# During COVID: accidents that started in 2020.
df_duringcovid = get_data(condition="extract(year from \"Start_Time\"::timestamp) = 2020")
accuracy_duringcovid = train_and_evaluate(df_duringcovid)
print(f"During-COVID Accuracy: {accuracy_duringcovid:.4f}")

# Post-COVID: accidents that started after 2020.
df_postcovid = get_data(condition="extract(year from \"Start_Time\"::timestamp) > 2020")
accuracy_postcovid = train_and_evaluate(df_postcovid)
print(f"Post-COVID Accuracy: {accuracy_postcovid:.4f}\n")

# Summarise the spread of accuracy across the three periods.
accuracies = [accuracy_precovid, accuracy_duringcovid, accuracy_postcovid]
print(f"Temporal Stability Accuracies: {accuracies}")
print(f"Temporal Stability Accuracy Variance: {np.var(accuracies):.6f}\n")

Experiment 2: Temporal Stability Test
Hypothesis: Model accuracy should be consistent across different time periods. Significant accuracy differences may indicate temporal shifts in accident patterns, suggesting the need for time-aware modeling approaches.
Fetching data with condition: extract(year from "Start_Time"::timestamp) < 2020 and limit: 100000
Pre-COVID Accuracy: 0.6725
Fetching data with condition: extract(year from "Start_Time"::timestamp) = 2020 and limit: 100000
During-COVID Accuracy: 0.7796
Fetching data with condition: extract(year from "Start_Time"::timestamp) > 2020 and limit: 100000
Post-COVID Accuracy: 0.9025

Temporal Stability Accuracies: [0.6725, 0.77965, 0.90255]
Temporal Stability Accuracy Variance: 0.008834



Experiment 2 conclusion: The variance in accuracy across the different time periods [pre-COVID, during COVID, post-COVID] is high, and this difference might cause inconsistencies in model training.
Accuracy variance observed across runs, expressed as a percentage: [0.8834%, 0.8557%]
From experiment 2, it is concluded that the model is more consistent and accurate on the post-COVID data than on the data from before 2020.

First iteration of experiment 2:
Pre-COVID Accuracy: 67.41%
During-COVID Accuracy: 78.36%
Post-COVID Accuracy: 90.06%

Second iteration of experiment 2:
Pre-COVID Accuracy: 67.25%
During-COVID Accuracy: 77.96%
Post-COVID Accuracy: 90.25%

Number of records per year in the current data:
2016	409289
2017	714235
2018	888980
2019	949227
2020	1060660
2021	1439731
2022	1622535
2023	231177

The data after 2020 (i.e., 2021, 2022 and 2023) is predicted more accurately, making it more suitable for model creation and training; we also use the pre-COVID and during-COVID data for model evaluation and comparison.