# Chicago Public Official Overtime Analysis (Crime Dataset)

This notebook pulls data from the Chicago Data Portal and persists it in a SQLite Database

This work is split into an additional notebook.  This data was also analyzed with Airbnb's Superset

### Load dependencies

In [7]:
import pandas as pd
import sqlite3


---

## Data ingestion
---

### Download Links:

In [13]:
crime_download_links = {
    2016:"https://data.cityofchicago.org/api/views/kf95-mnd6/rows.csv?accessType=DOWNLOAD",
    2015:"https://data.cityofchicago.org/api/views/vwwp-7yr9/rows.csv?accessType=DOWNLOAD",
    2014:"https://data.cityofchicago.org/api/views/qnmj-8ku6/rows.csv?accessType=DOWNLOAD",
    2013:"https://data.cityofchicago.org/api/views/a95h-gwzm/rows.csv?accessType=DOWNLOAD",
    2012:"https://data.cityofchicago.org/api/views/hx8q-mf9v/rows.csv?accessType=DOWNLOAD"
}


### Download data from each url - store as Pandas DataFrames

Stored as dictionary:

`{df_name: DataFrame}`

This is one way to work with multiple dataframes

In [15]:
crime_data = {}
for year in crime_download_links.keys():
    crime_data['crime_'+ str(year)] = pd.read_csv(crime_download_links[year])
    print("Shape of crime data for "+ str(year)+ ": "+ str(crime_data['crime_'+ str(year)].shape))

Shape of crime data for 2016: (267453, 22)
Shape of crime data for 2012: (335738, 22)
Shape of crime data for 2013: (306761, 22)
Shape of crime data for 2014: (274666, 22)
Shape of crime data for 2015: (263231, 22)


---

# Crime Data - check schema
---

In [33]:
crime_data['crime_2012'].describe()



Unnamed: 0,ID,Beat,District,Ward,X Coordinate,Community Area,Y Coordinate,Year,Latitude,Longitude
count,335738.0,335738.0,335738.0,335731.0,334749.0,335712.0,334749.0,335738.0,334749.0,334749.0
mean,8684269.0,1164.185115,11.33413,22.834299,1164500.0,37.601974,1885368.0,2012.0,41.841044,-87.671861
std,378107.7,694.43644,6.918046,13.784707,18829.23,21.562858,35030.73,0.0,0.096473,0.067633
min,20224.0,111.0,1.0,1.0,0.0,0.0,0.0,2012.0,36.619446,-91.686566
25%,8560693.0,614.0,6.0,,,,,2012.0,,
50%,8692700.0,1024.0,10.0,,,,,2012.0,,
75%,8825174.0,1722.0,17.0,,,,,2012.0,,
max,10956990.0,2535.0,31.0,50.0,1205119.0,77.0,1951527.0,2012.0,42.022586,-87.524529


In [34]:
for file_name in crime_data.keys():
    print( crime_data[file_name].shape)   

(335738, 22)
(306761, 22)
(267453, 22)
(274666, 22)
(263231, 22)


In [99]:
crime_data['crime_2012'].columns

Index([u'ID', u'Case_Number', u'Date', u'Block', u'IUCR', u'Primary_Type',
       u'Description', u'Location_Description', u'Arrest', u'Domestic',
       u'Beat', u'District', u'Ward', u'Community_Area', u'FBI_Code',
       u'X_Coordinate', u'Y_Coordinate', u'Year', u'Updated_On', u'Latitude',
       u'Longitude', u'Location'],
      dtype='object')

In [101]:
crime_data['crime_2012']['Date'][:10]

0    05/01/2012 12:00:00 PM
1    06/01/2012 04:50:00 PM
2    11/01/2012 10:05:00 PM
3    01/01/2012 12:01:00 AM
4    11/01/2012 12:00:00 AM
5    06/16/2012 04:30:00 AM
6    11/21/2012 07:30:00 AM
7    05/18/2012 06:38:00 AM
8    03/26/2012 06:15:00 PM
9    08/07/2012 01:40:00 PM
Name: Date, dtype: object

In [38]:
crime_data.keys()

['crime_2012', 'crime_2013', 'crime_2016', 'crime_2014', 'crime_2015']

In [39]:
crime_data['crime_2013'].shape

(306761, 22)

In [40]:
def create_db(db_path):
    con = sqlite3.connect(db_path)#Create database to store movie and rating info for easy storage and querying
    cur = con.cursor()
    try:    
        cur.execute("DROP TABLE IF EXISTS crime")#drop if already exists tables in DB
        cur.execute("""CREATE TABLE crime
       
                   ( ID                        integer,
                    Case_Number              text,
                    Date                     DATETIME,
                    Block                    text,
                    IUCR                     text,
                    Primary_Type             text,
                    Description              text,
                    Location_Description     text,
                    Arrest                     bool,
                    Domestic                   bool,
                    Beat                      integer,
                    District                numeric,
                    Ward                    numeric,
                    Community_Area          numeric,
                    FBI_Code                 object,
                    X_Coordinate            numeric,
                    Y_Coordinate            numeric,
                    Year                      integer,
                    Updated_On               object,
                    Latitude                numeric,
                    Longitude               numeric,
                    Location                 text

                   )
       
        """)
    except Exception as e:
        print(e)
    con.commit()
    con.close()

In [41]:
table_cols = [  'ID',
                'Case_Number'              ,
                'Date'                     ,
                'Block'                    ,
                'IUCR'                     ,
                'Primary_Type'             ,
                'Description'              ,
                'Location_Description'     ,
                'Arrest'                     ,
                'Domestic'                   ,
                'Beat'                      ,
                'District'                ,
                'Ward'                    ,
                'Community_Area'          ,
                'FBI_Code'                 ,
                'X_Coordinate'            ,
                'Y_Coordinate'            ,
                'Year'                      ,
                'Updated_On'               ,
                'Latitude'                ,
                'Longitude'               ,
                'Location' ]

#table_cols = ['dept_name' , 'emp_name', 'title' , 'jan' , 'feb' ,'mar' , 'apr' , 'may' , 'jun' ,'jul' , 'aug' , 'sep' , 'oct' , 'nov' , 'dec' , 'total' , 'year']

In [42]:
len(table_cols)

22

In [44]:
db_path = 'chicago_data.db'
create_db(db_path)

In [46]:
con = sqlite3.connect(db_path)#Create database to store movie and rating info for easy storage and querying
cur = con.cursor()

for file_name in crime_data.keys():
    try: 
        crime_data[file_name].columns = table_cols
    except:
        print(file_name)
    crime_data[file_name].to_sql('crime', con, if_exists='append', index=False)

Testing

In [64]:
con = sqlite3.connect(db_path)#Create database to store movie and rating info for easy storage and querying
cur = con.cursor()

In [69]:
cur.execute("""SELECT COUNT(*) FROM ot""")

<sqlite3.Cursor at 0x116f7cab0>

In [70]:
cur.fetchall()

[(121097,)]

In [68]:
cur.execute("PRAGMA table_info(ot);")
print(cur.fetchall())


[(0, u'dept_name', u'text', 0, None, 0), (1, u'emp_name', u'text', 0, None, 0), (2, u'title', u'text', 0, None, 0), (3, u'jan', u'numeric', 0, None, 0), (4, u'feb', u'numeric', 0, None, 0), (5, u'mar', u'numeric', 0, None, 0), (6, u'apr', u'numeric', 0, None, 0), (7, u'may', u'numeric', 0, None, 0), (8, u'jun', u'numeric', 0, None, 0), (9, u'jul', u'numeric', 0, None, 0), (10, u'aug', u'numeric', 0, None, 0), (11, u'sep', u'numeric', 0, None, 0), (12, u'oct', u'numeric', 0, None, 0), (13, u'nov', u'numeric', 0, None, 0), (14, u'dec', u'numeric', 0, None, 0), (15, u'total', u'numeric', 0, None, 0), (16, u'year', u'integer', 0, None, 0)]


In [67]:
cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cur.fetchall())

[(u'ot',), (u'crime',)]


In [71]:
cur.close()
con.close()

- 'dept_name'
- 'emp_name'
- 'title'
- 'jan'
- 'feb'
- 'mar'
- 'apr'
- 'may'
- 'jun'
- 'jul'
- 'aug' 
- 'sep'
- 'oct'
- 'nov'
- 'dec'
- 'total' 
- 'year'

In [55]:
f = cur.execute("""SELECT total, year 
                FROM ot
                GROUP BY year""")

In [50]:
q = """SELECT total, year 
                FROM ot
                GROUP BY year"""

In [56]:
f.fetchall()

[(15554, 2012),
 (609.3, 2013),
 (50211.49, 2014),
 (22773.31, 2015),
 (25249.6, 2016)]

In [57]:
cur.close()

In [54]:
q

'SELECT total, year \n                FROM ot\n                GROUP BY year'

In [51]:
df = pd.read_sql(q,con)

In [52]:
df.head()

Unnamed: 0,total,year
0,15554.0,2012
1,609.3,2013
2,50211.49,2014
3,22773.31,2015
4,25249.6,2016


In [53]:
x= 0
for i in crime_data.keys():
    print(crime_data[i].shape[0])
    x += crime_data[i].shape[0]
x

335738
306761
267453
274666
263231


1447849

In [45]:
cur.close()