In [36]:
from urllib.parse import quote

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, inspect

# NSW Crime data

### Store CSV into DataFrame

In [2]:
# Location of the NSW offence-by-month CSV, relative to the notebook
file = "Resources/rci-offencebymonth.csv"

# Read the CSV into a DataFrame and display it
nsw_crime = pd.read_csv(filepath_or_buffer=file)
nsw_crime

Unnamed: 0,Statistical Division or Subdivision,LGA,Offence category,Subcategory,Jan 1995,Feb 1995,Mar 1995,Apr 1995,May 1995,Jun 1995,...,Mar 2012,Apr 2012,May 2012,Jun 2012,Jul 2012,Aug 2012,Sep 2012,Oct 2012,Nov 2012,Dec 2012
0,Inner Sydney,Botany Bay,Homicide,Murder (a),0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Inner Sydney,Botany Bay,Homicide,Attempted murder,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Inner Sydney,Botany Bay,Homicide,"Murder accessory, conspiracy",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Inner Sydney,Botany Bay,Homicide,Manslaughter (a),0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Inner Sydney,Botany Bay,Assault,Domestic violence related assault,3.0,7.0,5.0,1.0,3.0,4.0,...,17.0,9.0,16.0,10.0,8.0,6.0,11.0,11.0,14.0,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9609,Prisons etc,Prisons,Other offences,,4.0,2.0,1.0,2.0,7.0,7.0,...,6.0,9.0,9.0,10.0,4.0,2.0,4.0,4.0,14.0,9.0
9610,,,,,,,,,,,...,,,,,,,,,,
9611,(a) Number of victims.,,,,,,,,,,...,,,,,,,,,,
9612,"During November and December 2011, NSW Police ...",,,,,,,,,,...,,,,,,,,,,


### Create new dataframe with selected columns

In [3]:
# Keep only the rows whose offence category is "Theft"
theft_nsw = nsw_crime.loc[nsw_crime['Offence category'] == 'Theft', :]

# Build the column list for the years of interest (2010-2012): the two
# identifier columns followed by one "<Mon> <year>" column per month, in
# chronological order, matching the CSV header format (e.g. "Jan 2010").
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
years_of_interest = range(2010, 2013)
new_col = ['LGA', 'Subcategory'] + [
    f'{month} {year}' for year in years_of_interest for month in months
]

# Select those columns and renumber the rows from 0; both steps return new
# frames, so nsw_crime and theft_nsw are left untouched.
theft_nsw_df = theft_nsw[new_col].reset_index(drop=True)
theft_nsw_df.head()

Unnamed: 0,LGA,Subcategory,Jan 2010,Feb 2010,Mar 2010,Apr 2010,May 2010,Jun 2010,Jul 2010,Aug 2010,...,Mar 2012,Apr 2012,May 2012,Jun 2012,Jul 2012,Aug 2012,Sep 2012,Oct 2012,Nov 2012,Dec 2012
0,Botany Bay,Break and enter dwelling,11.0,5.0,7.0,12.0,9.0,11.0,14.0,25.0,...,10.0,11.0,22.0,21.0,11.0,4.0,13.0,13.0,4.0,6.0
1,Botany Bay,Break and enter non-dwelling,3.0,10.0,7.0,2.0,6.0,5.0,6.0,7.0,...,3.0,5.0,3.0,2.0,1.0,4.0,2.0,4.0,1.0,5.0
2,Botany Bay,Receiving or handling stolen goods,4.0,7.0,3.0,8.0,5.0,2.0,2.0,5.0,...,4.0,7.0,1.0,7.0,3.0,4.0,4.0,3.0,6.0,4.0
3,Botany Bay,Motor vehicle theft,24.0,7.0,22.0,14.0,9.0,18.0,16.0,20.0,...,11.0,16.0,21.0,8.0,13.0,12.0,7.0,15.0,13.0,9.0
4,Botany Bay,Steal from motor vehicle,34.0,26.0,23.0,21.0,31.0,25.0,19.0,39.0,...,23.0,23.0,26.0,24.0,12.0,10.0,16.0,18.0,11.0,25.0


In [4]:
# Check for null values: number of non-null entries in each column
theft_nsw_df.notna().sum()

LGA            1705
Subcategory    1705
Jan 2010       1705
Feb 2010       1705
Mar 2010       1705
Apr 2010       1705
May 2010       1705
Jun 2010       1705
Jul 2010       1705
Aug 2010       1705
Sep 2010       1705
Oct 2010       1705
Nov 2010       1705
Dec 2010       1705
Jan 2011       1705
Feb 2011       1705
Mar 2011       1705
Apr 2011       1705
May 2011       1705
Jun 2011       1705
Jul 2011       1705
Aug 2011       1705
Sep 2011       1705
Oct 2011       1705
Nov 2011       1705
Dec 2011       1705
Jan 2012       1705
Feb 2012       1705
Mar 2012       1705
Apr 2012       1705
May 2012       1705
Jun 2012       1705
Jul 2012       1705
Aug 2012       1705
Sep 2012       1705
Oct 2012       1705
Nov 2012       1705
Dec 2012       1705
dtype: int64

# Clean DataFrame

In [5]:
# The monthly count columns are everything except the two identifier columns
id_cols = ['LGA', 'Subcategory']
month_cols = [col for col in theft_nsw_df.columns if col not in id_cols]

# Sum the count of offences over the years of interest.
# Only the monthly columns are summed: a bare sum(axis=1) over the whole frame
# would also sweep in the text columns, which pandas 2.0+ no longer skips
# silently (numeric_only now defaults to False).
sum_count = theft_nsw_df[month_cols].sum(axis=1)

# Append the totals to the dataframe as 'offence_count'
sum_count_df = sum_count.to_frame(name='offence_count')
nsw_theft = pd.concat([theft_nsw_df, sum_count_df], axis=1)

# Drop all monthly count columns, leaving LGA, Subcategory and offence_count
nsw_theft_df = nsw_theft.drop(columns=month_cols)
nsw_theft_df

Unnamed: 0,LGA,Subcategory,offence_count
0,Botany Bay,Break and enter dwelling,481.0
1,Botany Bay,Break and enter non-dwelling,151.0
2,Botany Bay,Receiving or handling stolen goods,137.0
3,Botany Bay,Motor vehicle theft,506.0
4,Botany Bay,Steal from motor vehicle,851.0
...,...,...,...
1700,Prisons,Steal from dwelling,0.0
1701,Prisons,Steal from person,3.0
1702,Prisons,Stock theft,0.0
1703,Prisons,Fraud,20.0


In [6]:
# Total theft offences per LGA, largest first.
# 'offence_count' is selected before summing so the text column 'Subcategory'
# is never aggregated: pandas 2.0+ no longer drops non-numeric columns from
# groupby().sum() and would concatenate the strings into an extra column.
grouped_df = (
    nsw_theft_df
    .groupby('LGA', as_index=False)['offence_count']
    .sum()
    .sort_values(by='offence_count', ascending=False)
)
grouped_df.head()

Unnamed: 0,LGA,offence_count
123,Sydney,67380.0
11,Blacktown,37054.0
100,Newcastle,25770.0
106,Parramatta,22439.0
77,Lake Macquarie,21120.0


In [16]:
# Renumber the sorted rows 0..n-1, discarding the old positional labels
grouped_df = grouped_df.reset_index(drop=True)
grouped_df.head()

LGA              155
offence_count    155
dtype: int64

In [18]:
# Rename 'LGA' to 'suburb' so the column names follow the schema table's names
df = grouped_df.rename({'LGA': 'suburb'}, axis='columns')
df

Unnamed: 0,suburb,offence_count
0,Sydney,67380.0
1,Blacktown,37054.0
2,Newcastle,25770.0
3,Parramatta,22439.0
4,Lake Macquarie,21120.0
...,...,...
150,Urana,79.0
151,Jerilderie,77.0
152,Conargo,42.0
153,Unincorporated Far West,20.0


# Connect to local Database


In [37]:
from config import password

In [38]:
# Percent-encode the password before embedding it in the connection URL so
# characters such as '@', ':', '/' or '%' cannot be misread as URL delimiters.
# NOTE(review): assumes config.password holds the raw (not already encoded)
# password -- confirm against config.py.
encoded_password = quote(str(password), safe='')
engine = create_engine(
    f'postgresql://postgres:{encoded_password}@localhost:5432/crime_db'
)

In [39]:
# Confirm tables: list the table names visible through this engine.
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API returns the same list of names.
inspect(engine).get_table_names()

['nsw_crime']

In [40]:
# Load the DataFrame into the existing 'nsw_crime' table.
# if_exists='append' inserts the rows into the table rather than replacing it,
# and index=True writes the DataFrame's row index as an additional column.
# NOTE(review): re-running this cell appends the same rows again, and the
# table needs a column matching the index -- confirm against the schema.
df.to_sql(
    name='nsw_crime',
    con=engine,
    if_exists='append',
    index=True,
)

# SA Crime data

In [None]:
# EXTRACT 2010 DATA
# Path of the 2010-11 SA crime CSV, relative to the notebook
sa_crime_2010 = "Resources/2010-11-data_sa_crime.csv"

# Load it and preview the first rows
sa_crime_2010_df = pd.read_csv(filepath_or_buffer=sa_crime_2010)
sa_crime_2010_df.head()

In [None]:
# EXTRACT 2011 DATA
# Path of the 2011-12 SA crime workbook (Excel rather than CSV)
sa_crime_2011 = "Resources/2011-12-data_sa_crime.xlsx"

# Load it and preview the first 50 rows
sa_crime_2011_df = pd.read_excel(io=sa_crime_2011)
sa_crime_2011_df.head(50)

In [None]:
# EXTRACT 2012 DATA
sa_crime_2012 = "Resources/2012-13-data_sa_crime.csv"
sa_crime_2012_df = pd.read_csv(sa_crime_2012)
# Preview the 2012-13 frame that was just loaded
sa_crime_2012_df.head()

In [None]:
# TRANSFORMING THE DATA
# JOIN DATA FOR 2010, 2011 AND 2012 TOGETHER
frames = [sa_crime_2010_df, sa_crime_2011_df, sa_crime_2012_df]
# ignore_index=True renumbers the combined rows 0..n-1; without it each source
# frame keeps its own row labels, so the same labels repeat across the years.
sa_crime_df = pd.concat(frames, ignore_index=True)
sa_crime_df.head()

In [None]:
# NARROW DOWN TO THE COLUMNS OF INTEREST, THEN DROP ROWS WITH NULL VALUES
columns_of_interest = ['Suburb - Incident', 'Offence Level 2 Description', 'Offence count']
sa_crime_df = sa_crime_df[columns_of_interest].dropna()

sa_crime_df

In [None]:
# NARROW THE DATA TO 'THEFT AND RELATED OFFENCES'
is_theft = sa_crime_df['Offence Level 2 Description'] == 'THEFT AND RELATED OFFENCES'

# Take an explicit copy: the next step assigns a column on this frame, and
# writing into a filtered slice of sa_crime_df raises SettingWithCopyWarning.
sa_theft_df = sa_crime_df.loc[is_theft, :].copy()

sa_theft_df

In [None]:
# CONVERTING THE 'Offence count' COLUMN TO NUMERIC
# assign() returns a new frame instead of writing into the filtered slice, so
# no SettingWithCopyWarning is raised and the cell can be re-run safely.
sa_theft_df = sa_theft_df.assign(
    **{'Offence count': sa_theft_df['Offence count'].astype(float)}
)

# GROUPING THEFT BY SUBURB
# Note: suburb_df is a GroupBy object, not a DataFrame; printing it only shows
# the object's repr, so the cell displays head() (first rows of each group).
suburb_df = sa_theft_df.groupby(['Suburb - Incident'])

suburb_df.head()

In [None]:
# CALCULATING TOTAL THEFT PER SUBURB
# Pick the count column out of each suburb group, then add the values up
offence_counts_by_suburb = suburb_df['Offence count']
offence_counts_by_suburb.sum()