In [201]:
# Importing dependencies that will be used for the code.
import json
import pandas as pd
from pprint import pprint
import datetime
import numpy as np
# Importing json data.
json_data = json.load(open("dogbites.json"))

In [202]:
# Creating a data specific DataFrame.
dog_data = pd.DataFrame(json_data["data"], columns=[
  "drop_code1",
  "drop_code2",
  "drop_code3",
  "drop_code4",
  "drop_code5",
  "drop_code6",
  "drop_code7",
  "drop_code8",
  "unique_id",
  "incident_date",
  "species",
  "breed",
  "age",
  "gender",
  "spay_neuter_status",
  "borough",
  "zip_code"
])

In [203]:
# Specifying columns to drop.
drop_columns = [
  "drop_code1",
  "drop_code2",
  "drop_code3",
  "drop_code4",
  "drop_code5",
  "drop_code6",
  "drop_code7",
  "drop_code8",
  "species",
  "unique_id"
]

In [204]:
# Cleaning column headers.
dog_data_clean = dog_data.drop(columns=drop_columns)
dog_data_clean.index.names = ["unique_id"]
dt_format = None
dog_data_clean['age'] = dog_data_clean["age"].replace("Y", "", regex=True)
dog_data_clean['incident_date'] = pd.to_datetime(dog_data_clean["incident_date"], format=dt_format)
dog_data_clean['breed'] = dog_data_clean["breed"].str.lower()
dog_data_clean = dog_data_clean[dog_data_clean["borough"] != 'Other']
dog_data_clean

Unnamed: 0_level_0,incident_date,breed,age,gender,spay_neuter_status,borough,zip_code
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2018-01-01,unknown,,U,False,Brooklyn,11220
1,2018-01-04,unknown,,U,False,Brooklyn,
2,2018-01-06,pit bull,,U,False,Brooklyn,11224
3,2018-01-08,mixed/other,4,M,False,Brooklyn,11231
4,2018-01-09,pit bull,,U,False,Brooklyn,11224
...,...,...,...,...,...,...,...
26122,2022-12-31,yorkshire terrier crossbreed,1,M,False,Bronx,10452
26123,2022-12-31,unknown,,U,False,Bronx,10469
26124,2022-12-31,maltese,6,F,True,Bronx,10456
26125,2022-12-31,beagle,1,M,False,Bronx,10467


In [205]:
# Sorting and printing data.
dog_data_clean = dog_data_clean.sort_values("incident_date")
dog_data_clean = dog_data_clean.reset_index(drop=True)
dog_data_clean

Unnamed: 0,incident_date,breed,age,gender,spay_neuter_status,borough,zip_code
0,2015-01-01,pit bull,,M,False,Staten Island,10301
1,2015-01-01,american pit bull terrier/pit bull,,U,False,Brooklyn,11220
2,2015-01-01,bull dog,3,F,False,Queens,
3,2015-01-01,american pit bull terrier/pit bull,6,M,False,Brooklyn,11221
4,2015-01-01,mixed/other,10,M,True,Manhattan,10065
...,...,...,...,...,...,...,...
24982,2022-12-31,unknown,,U,False,Bronx,10463
24983,2022-12-31,yorkshire terrier crossbreed,1,M,False,Bronx,10452
24984,2022-12-31,unknown,,U,False,Bronx,10469
24985,2022-12-31,maltese,6,F,True,Bronx,10456


In [None]:
# This was to drop the gross data that was uploaded to 'age'.
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("-") == False]
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("/") == False]
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("&") == False]
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("10+") == False]
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("Q") == False]
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("4 yrs 8 mo") == False]
dog_data_clean = dog_data_clean[dog_data_clean['age'].str.contains("WKS") == False]
dog_data_clean

In [207]:
dog_data_clean[dog_data_clean['age'].str.contains("MALE")]

Unnamed: 0,incident_date,breed,age,gender,spay_neuter_status,borough,zip_code
7155,2017-03-31,dachshund / schnauzer,2RS (MALE,U,True,Queens,11101


In [208]:
# Quick function to cleanup the 'age' data.
def months_to_year(age):
  if age == " +" or age != None:
    if age[-4:] == "MTHS":
      age = age.replace('MTHS', '')
      age = np.floor(float(str.strip(age)) / 12)
    elif age[-4:] == "mths":
      age = age.replace('mths', '')
      age = np.floor(float(str.strip(age)) / 12)
    elif age[-3:] == "MTH":
      age = age.replace("MTH", "")
      age = np.floor(float(str.strip(age)) / 12)
    elif age[-3:] == "MOS":
      age = age.replace("MOS", "")
      age = np.floor(float(str.strip(age)) / 12)
    elif age[-4:] == "MONS":
      age = age.replace("MONS", "")
      age = np.floor(float(str.strip(age)) / 12)
    elif age[-5:] == "MONS.":
      age = age.replace("MONS.", "")
      age = np.floor(float(str.strip(age)) / 12)
    elif age[-4:] == "mons":
      age = age.replace("mons", "")
      age = np.floor(float(str.strip(age)) / 12)
    elif age[-2:] == "MO":
      age = age.replace("MO", "")
      age = np.floor(float(str.strip(age)) / 12)
    elif age[-1] == "M":
      age = age.replace("M", "")
      age = np.floor(float(str.strip(age)) / 12)
    elif age[-1] == "m":
      age = age.replace("m", "")
      age = np.floor(float(str.strip(age)) / 12)
    elif age[-3:] == 'yrs':
      age = age.replace('yrs', '')
      age = np.floor(float(str.strip(age)))
    elif age[-1] == 'y':
      age = age.replace('y', '')
      age = np.floor(float(str.strip(age)))
    elif age[-1] == "R":
      age = age.replace('R', '')
      age = np.floor(float(str.strip(age)))
    elif age[-2:] == "RS":
      age = age.replace('RS', '')
      age = np.floor(float(str.strip(age)))
    elif age[-8:] == "RS (MALE":
      age = age.replace('RS (MALE', '')
      age = np.floor(float(str.strip(age)))
    elif age[-6:] == " (MALE":
      age = age.replace(' (MALE', '')
      age = np.floor(float(str.strip(age)))
    elif age[-2:] == "WK":
      age = age.replace('WK', '0')
      age = np.floor(float(str.strip(age)))
    elif age[-1] == "W":
      age = age.replace('W', '0')
      age = np.floor(float(str.strip(age)))
    elif age[-3:] == "wks":
      age = age.replace('wks', '0')
      age = np.floor(float(str.strip(age)))
  return np.floor(float(age))

dog_data_clean['age'] = dog_data_clean['age'].apply(lambda x: months_to_year(x))
dog_data_clean = dog_data_clean.reset_index(drop=True)
dog_data_clean

ValueError: could not convert string to float: ''

In [None]:
# Exporting DataFrames to csv files.
dog_data_clean.to_csv("cleaned_data.csv")