# Local data cleaning

In [1]:
# dependencies
import pandas as pd
import os
# '__file__' is a quoted string here (a notebook has no __file__ variable), so
# abspath resolves that name against the current working directory and the
# directory part of the result is the working directory itself.
my_dir = os.path.split(os.path.abspath('__file__'))[0]
os.chdir(my_dir)

In [2]:
# crime data
# One CSV per calendar year (2015 through 2022), all in the same folder and
# named CrimeData-<year>.csv. Each path keeps its own csv_<year> name.
(
    csv_2015, csv_2016, csv_2017, csv_2018,
    csv_2019, csv_2020, csv_2021, csv_2022,
) = (
    f'data/uncleaned data/local data/CrimeData-{crime_year}.csv'
    for crime_year in range(2015, 2023)
)

# Read every yearly file into its own df_<year> frame, in the same order.
(
    df_2015, df_2016, df_2017, df_2018,
    df_2019, df_2020, df_2021, df_2022,
) = map(
    pd.read_csv,
    (
        csv_2015, csv_2016, csv_2017, csv_2018,
        csv_2019, csv_2020, csv_2021, csv_2022,
    ),
)

In [3]:
# Row count of the 2015 data for every (CrimeAgainst, OffenseCategory) pair,
# returned as a flat frame with the tally in a 'count' column.
summary = (
    df_2015
    .groupby(['CrimeAgainst', 'OffenseCategory'])
    .size()
    .reset_index(name='count')
)

In [4]:
# Display the 2015 row counts per (CrimeAgainst, OffenseCategory) pair.
summary

Unnamed: 0,CrimeAgainst,OffenseCategory,count
0,Person,Assault Offenses,4584
1,Person,Homicide Offenses,19
2,Person,Human Trafficking Offenses,13
3,Person,Kidnapping/Abduction,47
4,Person,Sex Offenses,393
5,Property,Arson,131
6,Property,Bribery,4
7,Property,Burglary,2461
8,Property,Counterfeiting/Forgery,561
9,Property,Embezzlement,48


In [10]:
# Non-null OffenseCount rows in the first CrimeAgainst group of 2015.
# The grouped result is indexed by CrimeAgainst labels, so the first entry is
# taken by position with .iloc; a bare [0] on a label-indexed Series relies on
# a positional fallback that pandas has deprecated.
df_2015.groupby(['CrimeAgainst']).count()['OffenseCount'].iloc[0]

5056

In [5]:
# Row count of the 2015 data for every (Neighborhood, CrimeAgainst) pair,
# returned as a flat frame with the tally in a 'count' column.
summary2 = (
    df_2015
    .groupby(['Neighborhood', 'CrimeAgainst'])
    .size()
    .reset_index(name='count')
)

In [6]:
# Display the 2015 row counts per (Neighborhood, CrimeAgainst) pair.
summary2

Unnamed: 0,Neighborhood,CrimeAgainst,count
0,Alameda,Person,7
1,Alameda,Property,95
2,Arbor Lodge,Person,29
3,Arbor Lodge,Property,285
4,Arbor Lodge,Society,7
...,...,...,...
265,Woodlawn,Property,127
266,Woodlawn,Society,10
267,Woodstock,Person,35
268,Woodstock,Property,325


In [12]:
# Reshape the long 2015 summary to wide form: one row per Neighborhood and one
# column per CrimeAgainst value. Pairs that never occur are filled with 0.
pivot_summary2 = (
    summary2
    .pivot_table(
        values='count',
        index="Neighborhood",
        columns='CrimeAgainst',
        fill_value=0,
    )
    .reset_index()
)

In [13]:
# Display the wide 2015 table (one row per Neighborhood, one column per CrimeAgainst value).
pivot_summary2

CrimeAgainst,Neighborhood,Person,Property,Society
0,Alameda,7,95,0
1,Arbor Lodge,29,285,7
2,Ardenwald,1,19,0
3,Argay,55,319,8
4,Arlington Heights,1,63,2
...,...,...,...,...
91,West Portland Park,15,53,0
92,Wilkes,39,277,11
93,Woodland Park,4,15,0
94,Woodlawn,28,127,10


In [31]:
# Build a {year: {position: count}} mapping with the number of non-null
# OffenseCount rows in each of the first three CrimeAgainst groups per year.
crime_dict = {}
year_range = range(2015, 2023)
for year in year_range:
    # Fetch the yearly frame by its df_<year> name directly, instead of
    # assembling a code string and passing it to eval.
    yearly_df = globals()[f'df_{year}']
    # One groupby per year; its result is indexed by the CrimeAgainst labels.
    group_counts = yearly_df.groupby(['CrimeAgainst']).count()['OffenseCount']
    # Keys 0, 1, 2 are group positions, so read them positionally with .iloc
    # (a bare [i] on a label-indexed Series is a deprecated positional lookup).
    crime_dict[year] = {i: group_counts.iloc[i] for i in range(0, 3)}
# Rows are the years, columns are the group positions 0, 1, 2.
crime_df = pd.DataFrame.from_dict(crime_dict, orient='index')

In [32]:
# Rebuild the yearly table from crime_dict rather than editing crime_df in
# place, so running this cell more than once always yields the same columns
# (an in-place reset_index would add another 'index' column on every re-run).
crime_df = (
    pd.DataFrame.from_dict(crime_dict, orient='index')
    # Columns 0, 1, 2 are the positions of the CrimeAgainst groups.
    .rename(columns={0: 'Person_Crime', 1: 'Property_Crime', 2: 'Society_Crime'})
    # Name the index (the years) so reset_index emits it as the 'Year' column.
    .rename_axis('Year')
    .reset_index()
)

In [28]:
crime_dict2 = pd.DataFrame()
year_range = range(2015, 2023)

# Per-year summaries are collected here as they are produced, then stacked.
dfs_to_merge = []
for year in year_range:
    my_df_name = f'df_{year}'
    summary_df_name = f'summary_{year}_df'

    # Look up the yearly frame by name and tag every row with its year.
    # This writes the 'year' column onto the df_<year> frame itself.
    my_df = globals()[my_df_name]
    my_df['year'] = year

    # Row count per (Neighborhood, CrimeAgainst, year) combination.
    summary_df = (
        my_df
        .groupby(['Neighborhood', 'CrimeAgainst', 'year'])
        .size()
        .reset_index(name='count')
    )

    # Also publish the summary under its own summary_<year>_df name.
    globals()[summary_df_name] = summary_df
    dfs_to_merge.append(summary_df)

merged_df = pd.concat(dfs_to_merge, ignore_index=True)

In [29]:
# Display the stacked per-year counts per (Neighborhood, CrimeAgainst, year).
merged_df

Unnamed: 0,Neighborhood,CrimeAgainst,year,count
0,Alameda,Person,2015,7
1,Alameda,Property,2015,95
2,Arbor Lodge,Person,2015,29
3,Arbor Lodge,Property,2015,285
4,Arbor Lodge,Society,2015,7
...,...,...,...,...
2186,Woodlawn,Property,2022,335
2187,Woodlawn,Society,2022,3
2188,Woodstock,Person,2022,38
2189,Woodstock,Property,2022,532


In [37]:
# Reshape the stacked summaries to wide form: one row per (Neighborhood, year)
# and one column per CrimeAgainst value. Missing combinations are filled with 0.
merged_pivot_df = (
    merged_df
    .pivot_table(
        values='count',
        index=['Neighborhood', 'year'],
        columns='CrimeAgainst',
        fill_value=0,
    )
    .reset_index()
)


In [38]:
# Display the wide table (one row per Neighborhood and year, one column per CrimeAgainst value).
merged_pivot_df

CrimeAgainst,Neighborhood,year,Person,Property,Society
0,Alameda,2015,7,95,0
1,Alameda,2016,4,132,1
2,Alameda,2017,11,136,1
3,Alameda,2018,18,146,2
4,Alameda,2019,11,112,0
...,...,...,...,...,...
763,Woodstock,2018,48,456,7
764,Woodstock,2019,41,435,6
765,Woodstock,2020,55,436,10
766,Woodstock,2021,55,497,9


In [43]:
# Lookup table that is joined to the crime counts on 'Neighborhood' in the
# next step; the joined result gains a 'zip' column from this file.
neighborhood_zip_csv = 'data/uncleaned data/local data/PDX crime neighborhoods.csv'
neighborhood_zip_df = pd.read_csv(neighborhood_zip_csv)

In [44]:
# Attach the lookup columns to every (Neighborhood, year) row. A left join
# keeps all crime rows even when a neighborhood is absent from the lookup.
crime_by_year_df = pd.merge(
    merged_pivot_df,
    neighborhood_zip_df,
    how='left',
    on='Neighborhood',
)

In [45]:
# Display the per-neighborhood, per-year crime counts after the zip lookup was joined on.
crime_by_year_df

Unnamed: 0,Neighborhood,year,Person,Property,Society,zip
0,Alameda,2015,7,95,0,97212
1,Alameda,2016,4,132,1,97212
2,Alameda,2017,11,136,1,97212
3,Alameda,2018,18,146,2,97212
4,Alameda,2019,11,112,0,97212
...,...,...,...,...,...,...
763,Woodstock,2018,48,456,7,97206
764,Woodstock,2019,41,435,6,97206
765,Woodstock,2020,55,436,10,97206
766,Woodstock,2021,55,497,9,97206


In [None]:
# Write the neighborhood/year crime table (with the joined lookup columns) to
# the cleaned-data folder; index=False leaves the row index out of the file.
crime_zip_path = 'data/cleaned data/PDX_Crime_zip_year.csv'
crime_by_year_df.to_csv(crime_zip_path, index = False)

In [33]:
# Display the yearly totals: one row per Year with the Person/Property/Society crime counts.
crime_df

Unnamed: 0,Year,Person_Crime,Property_Crime,Society_Crime
0,2015,5056,30743,1876
1,2016,7600,46926,2559
2,2017,8195,49022,2456
3,2018,8796,48952,2654
4,2019,8967,47418,2557
5,2020,8592,49069,1934
6,2021,9210,53751,1439
7,2022,9032,59749,1616


In [47]:
# Write the yearly crime totals to the cleaned-data folder; index=False
# leaves the row index out of the file.
crime_path = 'data/cleaned data/PDX_Crime.csv'
crime_df.to_csv(crime_path, index = False)

In [51]:
# hpi data by zip by year
# Two inputs: the hpi table itself and a list of zip codes used to filter it.
hpi_csv = 'data/uncleaned data/local data/cleaned_hpi_price_data.csv'
pdx_zip_csv = 'data/uncleaned data/local data/pdx_zip_codes.csv'

# Load both files with pandas' default parsing.
hpi_df = pd.read_csv(hpi_csv)
pdx_zip_df = pd.read_csv(pdx_zip_csv)

In [53]:
# Keep only the hpi rows whose ZIP code appears in the pdx zip-code table.
# NOTE(review): isin matches values exactly; confirm both columns were parsed
# with the same dtype, otherwise nothing will match.
filtered_df = hpi_df.loc[
    lambda frame: frame['Five-Digit ZIP Code'].isin(pdx_zip_df['zip_code'])
]

# Save the filtered rows without the row index.
pdx_hpi_path = 'data/cleaned data/pdx_hpi_by_zip.csv'
filtered_df.to_csv(pdx_hpi_path, index=False)