In [27]:
import csv
from urllib.parse import quote

import numpy as np
import pandas as pd
from sqlalchemy import create_engine

# import config
import config
# Module-level alias for the password defined in the local config module.
password = config.password


In [3]:
# Load the cleaned crime data; the 'Date' column is parsed into datetimes at read time.
crime_df = pd.read_csv(
    "assets/data/cdata_clean.csv",
    encoding='utf-8',
    parse_dates=['Date'],
)

In [5]:
# Preview the first five rows of the loaded frame.
crime_df.head(n=5)

Unnamed: 0,ID,Date,Primary_Type,Year,Latitude,Longitude
0,11818802,2019-09-05 23:51:00,CRIMINAL TRESPASS,2019,41.779618,-87.653768
1,11821042,2019-09-05 23:45:00,CRIM SEXUAL ASSAULT,2019,41.79453,-87.632179
2,11818805,2019-09-05 23:45:00,CRIMINAL DAMAGE,2019,41.815789,-87.623011
3,11819235,2019-09-05 23:41:00,CRIMINAL DAMAGE,2019,41.868518,-87.624113
4,11818848,2019-09-05 23:40:00,NARCOTICS,2019,41.750618,-87.597898


In [6]:
# Number of rows in the loaded frame.
crime_df.shape[0]

1035933

In [7]:
# Report the earliest and latest timestamps present in the 'Date' column.
least_recent_date, recent_date = crime_df['Date'].min(), crime_df['Date'].max()
print(f'Start date: {least_recent_date} and Recent date: {recent_date}')

Start date: 2015-09-27 02:00:00 and Recent date: 2019-09-05 23:51:00


In [10]:
# Collect the distinct values of 'Primary_Type' (in order of first appearance),
# then print them followed by how many there are.
types = pd.unique(crime_df['Primary_Type'])
print(types, len(types), sep='\n')

['CRIMINAL TRESPASS' 'CRIM SEXUAL ASSAULT' 'CRIMINAL DAMAGE' 'NARCOTICS'
 'BATTERY' 'BURGLARY' 'THEFT' 'MOTOR VEHICLE THEFT' 'WEAPONS VIOLATION'
 'ASSAULT' 'OTHER OFFENSE' 'ROBBERY' 'DECEPTIVE PRACTICE'
 'INTERFERENCE WITH PUBLIC OFFICER' 'SEX OFFENSE'
 'OFFENSE INVOLVING CHILDREN' 'PUBLIC PEACE VIOLATION'
 'CONCEALED CARRY LICENSE VIOLATION' 'GAMBLING' 'ARSON' 'STALKING'
 'KIDNAPPING' 'INTIMIDATION' 'HOMICIDE' 'LIQUOR LAW VIOLATION'
 'PROSTITUTION' 'OBSCENITY' 'HUMAN TRAFFICKING' 'PUBLIC INDECENCY'
 'OTHER NARCOTIC VIOLATION' 'NON-CRIMINAL'
 'NON-CRIMINAL (SUBJECT SPECIFIED)' 'NON - CRIMINAL']
33


In [11]:
# Total number of unique crime types.
# len() counts every distinct value. np.count_nonzero only counts truthy
# entries, so it would silently leave out an empty-string category.
len(types)

33

In [12]:
# Count missing values, restricted to the columns that contain at least one.
column_has_nulls = crime_df.isnull().any()
null_columns = crime_df.columns[column_has_nulls]
crime_df[null_columns].isnull().sum()

Series([], dtype: float64)

In [13]:
# Keep only the rows that have both coordinates; a row is dropped when either
# 'Latitude' or 'Longitude' is missing. Then show how many rows remain.
new_crime_df = crime_df.dropna(axis=0, how='any', subset=['Latitude', 'Longitude'])
new_crime_df.shape[0]

1035933

In [14]:
# Preview the first five rows after dropping rows without coordinates.
new_crime_df.head(n=5)

Unnamed: 0,ID,Date,Primary_Type,Year,Latitude,Longitude
0,11818802,2019-09-05 23:51:00,CRIMINAL TRESPASS,2019,41.779618,-87.653768
1,11821042,2019-09-05 23:45:00,CRIM SEXUAL ASSAULT,2019,41.79453,-87.632179
2,11818805,2019-09-05 23:45:00,CRIMINAL DAMAGE,2019,41.815789,-87.623011
3,11819235,2019-09-05 23:41:00,CRIMINAL DAMAGE,2019,41.868518,-87.624113
4,11818848,2019-09-05 23:40:00,NARCOTICS,2019,41.750618,-87.597898


In [25]:
# Connect to the Postgres database.
# The password is percent-encoded before it is placed in the URL: characters
# such as '@', ':', '/' or '%' in a raw password would otherwise be read as
# URL delimiters/escapes and corrupt the connection string.
encoded_password = quote(str(config.password), safe='')
engine = create_engine(
    f'postgresql://postgres:{encoded_password}@localhost:5432/crime_db'
)
# The Engine repr shows the password as '***', so printing it does not expose the secret.
print(engine)

Engine(postgresql://postgres:***@localhost:5432/crime_db)


In [26]:
# Save the dataframe into the 'chicago' table.
# if_exists='replace' keeps this cell re-runnable: with the default ('fail'),
# running it a second time raises because the table already exists.
# NOTE(review): 'replace' drops and recreates the table on every run; this
# assumes 'chicago' is owned by this notebook -- confirm before relying on it.
# chunksize writes the rows in batches instead of sending them all at once.
new_crime_df.to_sql('chicago', engine, if_exists='replace', chunksize=10_000)