In [534]:
# Importing dependencies that will be used for the code.
import json
import pandas as pd
from pprint import pprint
import datetime
import numpy as np
# Importing json data.
json_data = json.load(open("dogbites.json"))

In [535]:
# Creating a data specific DataFrame.
dog_data = pd.DataFrame(json_data["data"], columns=[
  "drop_code1",
  "drop_code2",
  "drop_code3",
  "drop_code4",
  "drop_code5",
  "drop_code6",
  "drop_code7",
  "drop_code8",
  "unique_id",
  "incident_date",
  "species",
  "breed",
  "age",
  "gender",
  "spay_neuter_status",
  "borough",
  "zip_code"
])

In [536]:
# Specifying columns to drop.
drop_columns = [
  "drop_code1",
  "drop_code2",
  "drop_code3",
  "drop_code4",
  "drop_code5",
  "drop_code6",
  "drop_code7",
  "drop_code8",
  "species",
  "unique_id"
]

In [537]:
# Cleaning column headers.
dog_data_clean = dog_data.drop(columns=drop_columns)
dog_data_clean.index.names = ["unique_id"]
dt_format = None
dog_data_clean['age'] = dog_data_clean["age"].replace("Y", "", regex=True)
dog_data_clean['incident_date'] = pd.to_datetime(dog_data_clean["incident_date"], format=dt_format)
dog_data_clean['breed'] = dog_data_clean["breed"].str.lower()
dog_data_clean = dog_data_clean[dog_data_clean["borough"] != 'Other']

In [538]:
# Sorting and printing data.
dog_data_clean = dog_data_clean.sort_values("incident_date")
dog_data_clean = dog_data_clean.reset_index(drop=True)
dog_data_clean

Unnamed: 0,incident_date,breed,age,gender,spay_neuter_status,borough,zip_code
0,2015-01-01,pit bull,,M,False,Staten Island,10301
1,2015-01-01,american pit bull terrier/pit bull,,U,False,Brooklyn,11220
2,2015-01-01,bull dog,3,F,False,Queens,
3,2015-01-01,american pit bull terrier/pit bull,6,M,False,Brooklyn,11221
4,2015-01-01,mixed/other,10,M,True,Manhattan,10065
...,...,...,...,...,...,...,...
24982,2022-12-31,unknown,,U,False,Bronx,10463
24983,2022-12-31,yorkshire terrier crossbreed,1,M,False,Bronx,10452
24984,2022-12-31,unknown,,U,False,Bronx,10469
24985,2022-12-31,maltese,6,F,True,Bronx,10456


In [539]:
dog_data_clean['age'].value_counts()

age
2            1813
3            1676
1            1585
4            1269
5            1109
             ... 
2RS (MALE       1
7m              1
6MTH            1
3MTH            1
17W             1
Name: count, Length: 211, dtype: int64

In [540]:
# This was to drop the gross data that was uploaded to 'age'.

dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("-") == False]
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("/") == False]
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("&") == False]
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("mo") == False]
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("RS") == False]
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("WK") == False]
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("MTHS") == False]
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("mths") == False]
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("MTH") == False]
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("MONS") == False]
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("MOS") == False]
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("MO") == False]
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("yrs") == False]
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("wks") == False]
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("10+") == False]
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("Q") == False]

dog_data_clean['age'].str.strip()

dog_data_clean

Unnamed: 0,incident_date,breed,age,gender,spay_neuter_status,borough,zip_code
2,2015-01-01,bull dog,3,F,False,Queens,
3,2015-01-01,american pit bull terrier/pit bull,6,M,False,Brooklyn,11221
5,2015-01-01,"dachshund, long haired",8,M,False,Manhattan,10026
8,2015-01-02,shih tzu,5,F,False,Queens,11434
9,2015-01-02,american pit bull terrier/pit bull,3,M,True,Manhattan,10128
...,...,...,...,...,...,...,...
24978,2022-12-31,german shepherd,2,F,False,Queens,11413
24980,2022-12-31,shih tzu / maltese mix,12,M,True,Queens,11364
24983,2022-12-31,yorkshire terrier crossbreed,1,M,False,Bronx,10452
24985,2022-12-31,maltese,6,F,True,Bronx,10456


In [541]:
# Quick function to cleanup the 'age' data.
def months_to_year(age):
  if age == " +" or age != None:
    if age[-1] == "MTHS":
      age = age.replace('MTHS', '')
      age = np.floor(int(age) / 12)
    elif age[-1] == "mths":
      age = age.replace('mths', '')
      age = np.floor(int(age) / 12)
    elif age[-1] == "MTH":
      age = age.replace("MTH", "")
      age = np.floor(int(age) / 12)
    elif age[-1] == "MOS":
      age = age.replace("MOS", "")
      age = np.floor(int(age) / 12)
    elif age[-1] == "M":
      age = age.replace("M", "")
      age = np.floor(int(age) / 12)
    elif age[-1] == "m":
      age = age.replace("m", "")
      age = np.floor(int(age) / 12)
    elif age[-1] == 'yrs':
      age = age.replace('yrs', '')
      age = np.floor(int(age))
    elif age[-1] == 'y':
      age = age.replace('y', '')
      age = np.floor(int(age))
    elif age[-1] == "R":
      age = age.replace('R', '')
      age = np.floor(int(age))
    elif age[-1] == "RS":
      age = age.replace('RS', '')
      age = np.floor(int(age))
    elif age[-1] == "WK":
      age = age.replace('WK', '0')
      age = np.floor(int(age))
    elif age[-1] == "W":
      age = age.replace('W', '0')
      age = np.floor(int(age))
    elif age[-1] == "wks":
      age = age.replace('wks', '0')
      age = np.floor(int(age))
  return np.floor(float(age))

dog_data_clean['age'] = dog_data_clean['age'].apply(lambda x: months_to_year(x))
dog_data_clean = dog_data_clean.reset_index(drop=True)
dog_data_clean

Unnamed: 0,incident_date,breed,age,gender,spay_neuter_status,borough,zip_code
0,2015-01-01,bull dog,3.0,F,False,Queens,
1,2015-01-01,american pit bull terrier/pit bull,6.0,M,False,Brooklyn,11221
2,2015-01-01,"dachshund, long haired",8.0,M,False,Manhattan,10026
3,2015-01-02,shih tzu,5.0,F,False,Queens,11434
4,2015-01-02,american pit bull terrier/pit bull,3.0,M,True,Manhattan,10128
...,...,...,...,...,...,...,...
11519,2022-12-31,german shepherd,2.0,F,False,Queens,11413
11520,2022-12-31,shih tzu / maltese mix,12.0,M,True,Queens,11364
11521,2022-12-31,yorkshire terrier crossbreed,1.0,M,False,Bronx,10452
11522,2022-12-31,maltese,6.0,F,True,Bronx,10456


In [542]:
# Exporting DataFrames to csv files.
dog_data_clean.to_csv("cleaned_data.csv")