In [None]:
import pandas as pd
import numpy as np
import json

# Load the source dataset of car fines from the JSON file.
df = pd.read_json("../data/auto.json")
# Render floats with a thousands separator and two decimals in all output.
pd.set_option("display.float_format", "{:,.2f}".format)
# Quick look at the first rows (only rendered when this is the last
# expression of a notebook cell).
df.head()
## Sampling: 200 new observations
# Draw 200 rows from the existing data with replacement; the fixed
# random_state makes the draw reproducible.
sample = df.sample(
    n=200,
    replace=True,
    random_state=21,
).copy()
# Append the sampled rows below the original ones and renumber the index.
concat_rows = pd.concat(
    [df, sample],
    ignore_index=True,
)
## Adding the Year column
# Seed NumPy's global generator so the same years are drawn on every run.
np.random.seed(21)
# One random year per row; `high` is exclusive, so values lie in 1980..2019.
years = np.random.randint(low=1980, high=2020, size=len(concat_rows))
concat_rows["Year"] = years
# Keep an independent copy of the table under the name `fines`.
fines = concat_rows.copy()
## Owners dataframe
# The surname file is a JSON array of rows: the first row is used as the
# column names and the remaining rows as the data.
# JSON text is UTF-8 by specification, so the encoding is passed explicitly
# instead of relying on the platform default (which is not UTF-8 on every
# system and would break or garble non-ASCII surnames).
with open("../data/surname.json", "r", encoding="utf-8") as f:
    raw_data = json.load(f)

columns = raw_data[0]
data = raw_data[1:]
surnames_df = pd.DataFrame(data, columns=columns)
# Remove every character that is neither a word character nor whitespace
# (i.e. punctuation) from the NAME column.
surnames_clean = surnames_df["NAME"].str.replace(r"[^\w\s]", "", regex=True)
# Each distinct car number in the fines data gets exactly one owner row.
unique_cars = concat_rows["CarNumber"].unique()
# Re-seed so the surname draw is reproducible; sampling is done with
# replacement, so the same surname may be assigned to several cars.
np.random.seed(21)
owner_names = np.random.choice(
    surnames_clean,
    size=len(unique_cars),
    replace=True,
)
# Two-column owners table: car number -> surname.
owners = pd.DataFrame(
    data={"CarNumber": unique_cars, "SURNAME": owner_names},
)
## Adding 5 hand-written observations
# Each tuple is one record; the values follow the order given in `columns`.
custom_rows = pd.DataFrame(
    [
        ("AB1234RUS", 2.0, 9000, "Lada", "Vesta", 2010),
        ("CD5678RUS", 1.0, 1200, "Toyota", "Yaris", 2005),
        ("EF9012RUS", 0.0, 3000, "Ford", "Fusion", 2015),
        ("GH3456RUS", 2.0, 7400, "Volkswagen", "Tiguan", 2011),
        ("IJ7890RUS", 1.0, 600, "Hyundai", "Accent", 2012),
    ],
    columns=["CarNumber", "Refund", "Fines", "Make", "Model", "Year"],
)
# Rebuild `fines` as the enlarged table plus the five manual rows, with a
# fresh 0..n-1 index.
fines = pd.concat(
    [concat_rows, custom_rows],
    ignore_index=True,
)
## Removing 20 lines from owners and adding 3 new ones
# Drop the last 20 owner rows (presumably so that some cars in `fines` end
# up without an owner — confirm against the exercise statement).
owners = owners.iloc[:-20].copy()
# Three additional owners, given as (car number, surname) pairs.
extra_owners = pd.DataFrame(
    [
        ("ZZ0001RUS", "Walker"),
        ("YY0002RUS", "Johnson"),
        ("XX0003RUS", "Brown"),
    ],
    columns=["CarNumber", "SURNAME"],
)
owners = pd.concat(
    [owners, extra_owners],
    ignore_index=True,
)
## Union
# Inner join: only rows whose CarNumber is present in both tables.
merged_inner = fines.merge(owners, how="inner", on="CarNumber")
# Left join from fines: every fine is kept, with SURNAME missing where no
# owner row matches.
merged_fines = fines.merge(owners, how="left", on="CarNumber")
# Left join from owners: every owner is kept, with the fine columns missing
# where no fine row matches.
merged_owners = owners.merge(fines, how="left", on="CarNumber")
# Total fines per (Make, Model) for each Year; combinations with no data
# are filled with 0.
pivot = fines.pivot_table(
    values="Fines",
    index=["Make", "Model"],
    columns="Year",
    aggfunc="sum",
    fill_value=0,
)
# Persist both tables as CSV files, leaving the row index out of the output.
fines.to_csv(path_or_buf="../data/fines.csv", index=False)
owners.to_csv(path_or_buf="../data/owners.csv", index=False)
