In [25]:
import pandas as pd
import numpy as np

### Importing the Data

In [26]:
# Load the population table; it feeds the per-capita calculations and the
# population-proportional (representative) sampling further down.
population_raw = pd.read_excel("ignorefolder/population.xlsx")

# The labels to rename are looked up on the frame exactly as it was read:
# position 0 and position 6. Dropping "Unnamed: 1" shifts the later columns
# left, which is why the second column is picked at position 5 below.
area_label = population_raw.columns[0]
count_label = population_raw.columns[6]

population = population_raw.drop(columns="Unnamed: 1")
population = population.iloc[:, [0, 5]]
population = population.rename(
    columns={area_label: "Geographic Area", count_label: "Population"}
)
population = population.dropna().reset_index(drop=True)

# Normalise the area names: strip leading dots, then lower-case.
area_names = population["Geographic Area"].str.lstrip(".")
population["Geographic Area"] = area_names.str.lower()

# Rows removed by position. The original note says this drops Puerto Rico
# (presumably row 56); rows 1-4 are presumably other non-state aggregate rows.
# TODO confirm both against the spreadsheet.
rows_to_discard = [56, 1, 2, 3, 4]
population = population.drop(index=rows_to_discard).reset_index(drop=True)

# Share of each row relative to row 0 (presumably the overall total - verify),
# turned into a per-area row quota for a sample of `total_data` rows.
total_data = 1000000
reference_population = population["Population"].iloc[0]
population["Proportion"] = population["Population"] / reference_population
population["Sample Counts"] = np.floor(population["Proportion"] * total_data)

# Row 0 only served as the denominator, so it is removed afterwards.
population = population.drop(index=0).reset_index(drop=True)

# Lookup table: lower-case area name -> two-letter abbreviation.
state_to_abbrev = {
    "alabama": "AL", "alaska": "AK", "arizona": "AZ", "arkansas": "AR",
    "american samoa": "AS", "california": "CA", "colorado": "CO", "connecticut": "CT",
    "delaware": "DE", "district of columbia": "DC", "florida": "FL", "georgia": "GA",
    "guam": "GU", "hawaii": "HI", "idaho": "ID", "illinois": "IL",
    "indiana": "IN", "iowa": "IA", "kansas": "KS", "kentucky": "KY",
    "louisiana": "LA", "maine": "ME", "maryland": "MD", "massachusetts": "MA",
    "michigan": "MI", "minnesota": "MN", "mississippi": "MS", "missouri": "MO",
    "montana": "MT", "nebraska": "NE", "nevada": "NV", "new hampshire": "NH",
    "new jersey": "NJ", "new mexico": "NM", "new york": "NY", "north carolina": "NC",
    "north dakota": "ND", "northern mariana islands": "MP", "ohio": "OH", "oklahoma": "OK",
    "oregon": "OR", "pennsylvania": "PA", "puerto rico": "PR", "rhode island": "RI",
    "south carolina": "SC", "south dakota": "SD", "tennessee": "TN", "texas": "TX",
    "trust territories": "TT", "utah": "UT", "vermont": "VT", "virginia": "VA",
    "virgin islands": "VI", "washington": "WA", "west virginia": "WV", "wisconsin": "WI",
    "wyoming": "WY",
}
# Attach the abbreviation for every area name; a name that is not a key of the
# lookup table ends up as NaN.
area_names = population["Geographic Area"]
population["State"] = area_names.map(state_to_abbrev)


In [27]:
# Columns of the accidents file that the rest of the notebook works with.
cols_to_keep = ['Severity', 'State', 'Temperature(F)', 'Wind_Chill(F)', 
                'Humidity(%)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Direction', 
                'Wind_Speed(mph)', 'Precipitation(in)', 'Weather_Condition', 'Amenity', 
                'Bump', 'Crossing', 'Give_Way', 'Junction', 'No_Exit',
                'Railway', 'Roundabout', 'Station', 'Stop', 'Traffic_Calming',
                'Traffic_Signal', 'Turning_Loop']


# Parse only the columns that are kept: usecols makes the reader skip every
# other column instead of loading the full file and discarding most of it.
df = pd.read_csv('ignorefolder/US_Accidents_March23.csv', usecols=cols_to_keep)
# usecols does not determine column order, so index by the list to keep the
# layout identical to cols_to_keep.
df = df[cols_to_keep]
# NOTE(review): every missing value becomes 0, including those in the text
# columns (Wind_Direction, Weather_Condition, State), where 0 then acts as its
# own category - confirm this is intended.
df = df.fillna(0)

### Representative Sampling

In [28]:
# Give every accident row the sampling quota of its state. A left join keeps
# all accident rows; a state without a match in `population` gets NaN.
state_quota = population[["State", "Sample Counts"]]
df = pd.merge(df, state_quota, how="left", on="State")

# One group per state, to be sampled independently.
grouped_by_state = df.groupby("State")


In [29]:
def sample_group(g, random_state=None):
    """Draw a group's quota of rows without replacement.

    The quota is read from the first row of the group's "Sample Counts"
    column and capped at the group size, so a group smaller than its quota
    is returned in full (in shuffled order).

    Parameters
    ----------
    g : pandas.DataFrame
        One group of rows; must contain a "Sample Counts" column.
    random_state : int, numpy random generator/state or None, optional
        Forwarded to ``DataFrame.sample``. The default ``None`` keeps the
        previous behaviour of drawing from NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        The sampled rows. An empty frame with the same columns is returned
        when the group is empty or when its quota is missing (NaN).
    """
    # Nothing to read a quota from; .iloc[0] would raise IndexError.
    if g.empty:
        return g

    quota = g["Sample Counts"].iloc[0]
    # int(NaN) raises ValueError; a missing quota (e.g. a key that found no
    # match in a left join) is treated as "no rows requested".
    if pd.isna(quota):
        return g.iloc[0:0]

    n = min(len(g), int(quota))
    return g.sample(n=n, replace=False, random_state=random_state)

# Seed NumPy's global random state so the per-state draws are the same on
# every run; DataFrame.sample falls back to that global state when it is not
# given a random_state of its own.
SAMPLING_SEED = 42
np.random.seed(SAMPLING_SEED)

sampled_dataset = grouped_by_state.apply(sample_group)
print(sampled_dataset.shape)

(988706, 25)


### Removing the group by index

In [30]:
# Swap the index left over from the grouped sampling for a fresh 0..n-1 index;
# drop=True discards the old index instead of re-inserting it as columns.
flat_rows = sampled_dataset.reset_index(drop=True)
sampled_dataset = flat_rows

### One Hot Encoding

### Saving the Dataset

In [31]:
# Write the sampled rows as they are at this point, i.e. before one-hot encoding.
# NOTE(review): the file therefore still contains the "Sample Counts" helper
# column and, because to_csv writes the index by default, an index column -
# confirm that this is the intended content of the saved file.
sampled_dataset.to_csv("cleaned_US_Accidents.csv")

# One-hot encode the two categorical weather columns.
sampled_dataset = pd.get_dummies(sampled_dataset, columns=["Wind_Direction", "Weather_Condition"])
# drop() returns a new frame rather than modifying in place; without the
# assignment the helper column was never actually removed.
sampled_dataset = sampled_dataset.drop(columns=["Sample Counts"])
print(sampled_dataset.shape)