In [None]:
import pandas as pd
from pandas import DataFrame
from datetime import date
import calendar
import numpy as np
import sys, os
import matplotlib.pyplot as plt
import seaborn as sns
import time
import math
from IPython.display import display, HTML
from bs4 import BeautifulSoup
import requests
from scipy import stats
%matplotlib inline

cd = os.path.split(os.getcwd())[0]
if cd not in sys.path:
    sys.path.append(cd)

from lib import noaa, bexarcrime


# Set this to True if you want to run everything from scratch,
# i.e. pull all the data from source and run all the slow
# functions
PROCESS_FULLY = False

# Table of Contents <a class="anchor" id="toc"></a>
* [City Selection](#selection)
  * [Loading](#loading)
  * [Merging](#merging)
  * [Visualisations](#visualisations)
* [Data Acquisition](#acquisition)
* [Data Analysis](#analysis)
  * [Hypotheses](#hypothesis)
  * [Exploration](#exploration)
* [Results and Conclusions](#results)

## City Selection <a class="anchor" id="selection"></a>

[Back to Table of Contents](#toc)

First we need to find which cities to examine. We loaded datasets full of factors that we thought influenced violent crime at the county level, and then chose the county seats of those counties to examine in detail

### Loading the Datasets <a class="anchor" id="loading"></a>

[Back to Table of Contents](#toc)

First we load the datasets into dataframes; later on we'll merge them into a single one to examine.

County-level crime dataset

In [None]:
# County-level crime *reports* (not arrests).
crime = (
    pd.read_csv('../data/CountyCrimeReports.tsv', sep='\t')
    .assign(
        # county FIPS code = state code * 1000 + county code
        FIPS=lambda t: t['FIPS_ST'] * 1000 + t['FIPS_CTY'],
        # violent crime = sum of the MURDER, RAPE, ROBBERY and AGASSLT columns
        vcrime=lambda t: t['MURDER'] + t['RAPE'] + t['ROBBERY'] + t['AGASSLT'],
    )
    .set_index('FIPS')
    [['COVIND', 'vcrime']]
)

Education dataset

In [None]:
edu = pd.read_excel('../data/Education.xls', skiprows=4)

# Keep the identifying columns (state and area names are clean in this dataset
# and are reused after the join) plus the last four columns, which hold the
# most recent (2011-2015) adult education percentages.
edu_columns = ['FIPS Code', 'State', 'Area name'] + edu.columns[-4:].tolist()
short_names = {
    'FIPS Code': 'FIPS',
    'Area name': 'County',
    'Percent of adults with less than a high school diploma, 2011-2015': 'p_no_HS_dip',
    'Percent of adults with a high school diploma only, 2011-2015': 'p_HS_dip',
    "Percent of adults completing some college or associate's degree, 2011-2015": 'p_some_college',
    "Percent of adults with a bachelor's degree or higher, 2011-2015": 'p_college_dip',
}
edu = edu[edu_columns].rename(columns=short_names).set_index('FIPS')

Population dataset

In [None]:
pop = pd.read_excel('../data/PopulationEstimates.xls', skiprows=2)

years = range(2010, 2017)  # 2010 through 2016 inclusive
estimate_cols = ['POP_ESTIMATE_{}'.format(year) for year in years]
change_cols = ['N_POP_CHG_{}'.format(year) for year in years]

# Row-wise sum divided by the number of years; a missing year is skipped by
# sum() and therefore contributes 0 to the numerator.
pop['avgpop'] = pop[estimate_cols].sum(axis=1) / len(estimate_cols)
pop['dpop/dt'] = pop[change_cols].sum(axis=1) / len(change_cols)

# Only the FIPS code, the average population and the average change are kept.
pop = pop.set_index('FIPS')[['avgpop', 'dpop/dt']]

Poverty estimate dataset

In [None]:
# Only the all-ages poverty percentage for 2015 is used.
pov = (
    pd.read_excel('../data/PovertyEstimates.xls', skiprows=3)
    [['FIPStxt', 'PCTPOVALL_2015']]
    .rename(columns={'FIPStxt': 'FIPS', 'PCTPOVALL_2015': 'p_impoverished'})
    .set_index('FIPS')
)
# Anything that cannot be parsed as a number becomes NaN.
pov['p_impoverished'] = pd.to_numeric(pov['p_impoverished'], errors='coerce')

Employment estimates dataset

In [None]:
emp = pd.read_excel('../data/Unemployment.xls', skiprows=9)

# Average unemployment rate over 2007-2016 (row sum divided by the number of years).
rate_cols = ['Unemployment_rate_{}'.format(year) for year in range(2007, 2017)]
emp['p_unempl'] = emp[rate_cols].sum(axis=1) / len(rate_cols)

# Keep only the averaged rate and the 2015 median household income.
emp = (
    emp[['FIPStxt', 'p_unempl', 'Median_Household_Income_2015']]
    .rename(columns={'FIPStxt': 'FIPS', 'Median_Household_Income_2015': 'med_income'})
    .set_index('FIPS')
)

### Merging the Datasets<a class="anchor" id="merging"></a>

[Back to Table of Contents](#toc)

We merge the datasets into a single one, indexed on the FIPS code. We remove the country- and state-level information

In [None]:
# Outer-join every dataset on the FIPS index so no county is lost.
df = edu.join([pop, pov, emp, crime], how='outer')
df = df.where(df.State != 'PR').dropna(how='all')  # Puerto Rico has unreliable data

# Pull out the nationwide row (FIPS code 0, first row of the joined table).
us = df.iloc[0]
df = df.drop(0)

# Pull out the statewide rows: their FIPS codes are exact multiples of 1000.
# Not every multiple is an actual state, so use reindex(), which yields
# all-NaN rows for absent codes (dropped right after). Indexing with
# df.loc[list] raises KeyError on missing labels in pandas >= 1.0.
state_codes = list(range(1000, 75000, 1000))
states = df.reindex(state_codes).dropna(how='all')

# All that's left is county-level data.
df = df.drop(states.index)

We need to normalize the change in population and violent crime rate. In this analysis, violent crime is expressed in crimes per 100,000 residents

In [None]:
# Normalise by population: population change becomes a fraction of the average
# population and violent crime becomes incidents per 100,000 residents. The raw
# columns are dropped afterwards.
df = df.assign(
    p_dpop=df['dpop/dt'] / df['avgpop'],
    vcrime_rate=100000 * df['vcrime'] / df['avgpop'],
).drop(['dpop/dt', 'vcrime'], axis=1)

### Visualising the Data <a class="anchor" id="visualisations"></a>

[Back to Table of Contents](#toc)

We need to visualise the county-level data set to see if there are any interesting patterns. First we'll look at the basic statistics.

In [None]:
# Summary statistics of the county-level table.
county_summary = df.describe()
display(county_summary)

In [None]:
# Pairwise correlations between the numeric factors. The text columns (State,
# County) are excluded explicitly: older pandas dropped them silently, while
# recent versions raise when asked to correlate non-numeric data.
display(df.select_dtypes(include=[np.number]).corr())

Next we'll compare some factors of different states, with bonus political standpoints according to the 2008 presidential election between Barack Obama and John McCain.

Box plot of violent crime rate per state

In [None]:
# According to the 2008 presidential election
blue_states =['WA', 'OR', 'CA', 'NV', 'NM', 'CO', 'MN', 'IA', 'WI', 'IL', 'IN', 'MI', 'OH', 'PA', 'NY', 'VT', 'NH', 'ME', 'MA', 'CT', 'RI', 'NJ', 'DE', 'MD', 'VA', 'NC', 'FL', 'HI']
red_states = ['ID', 'MT', 'WY', 'UT', 'AZ', 'ND', 'SD', 'NE', 'KS', 'OK', 'TX', 'MO', 'AR', 'LA', 'WV', 'KY', 'TN', 'MS', 'AL', 'GA', 'SC', 'AK']
fig, ax = plt.subplots(figsize=(20,10))
# Only red_states is consulted: every state that is not listed there is drawn
# blue. blue_states is kept for reference.
pal = {state: 'r' if state in red_states else "b" for state in df.State}
sns.boxplot(ax=ax, x='State', y='vcrime_rate', data=df, palette=pal)
ax.set_title('County violent crime rate by state')
ax.set_ylabel('Violent crime per 100,000 residents')

Box plot of the percentage of adults without a high school diploma

In [None]:
# Same red/blue palette as the violent crime box plot (`pal` from the cell above).
fig, ax = plt.subplots(figsize=(20, 10))
sns.boxplot(data=df, x='State', y='p_no_HS_dip', palette=pal, ax=ax)

Graphs of factors to violent crime

In [None]:
# One scatter panel per candidate factor, each against the violent crime rate.
factor_cols = [
    'p_no_HS_dip', 'p_HS_dip', 'p_some_college', 'p_college_dip', 'avgpop',
    'p_impoverished', 'p_unempl', 'med_income', 'p_dpop', 'vcrime_rate',
]
sns.pairplot(df, x_vars=factor_cols, y_vars=['vcrime_rate'], dropna=True, size=10)

In [None]:
# Distribution of county violent crime rates (counties without a rate are left out).
vcrime_rates = df['vcrime_rate'].dropna()
sns.distplot(vcrime_rates, axlabel="Violent crime per 100,000")

In [None]:
# County populations span several orders of magnitude, so plot log10(population).
log_population = np.log10(df['avgpop'].dropna())
sns.distplot(log_population, axlabel="Population (log10)")

In [None]:
# Distribution of the averaged county unemployment rate.
unemployment = df['p_unempl'].dropna()
sns.distplot(unemployment, axlabel='Unemployment Rate')

In [None]:
# Distribution of the county poverty rate.
poverty = df['p_impoverished'].dropna()
sns.distplot(poverty, axlabel="Poverty Rate")

We want to select counties that have similar violent crime factors. For example, we don't want to end up comparing a well-to-do city in the suburbs of Maine against skid-row L.A.

First we bin the data into high, medium and low (based on national quantiles)

In [None]:
# Columns that identify a county rather than measure it; these are not binned.
label_cols = ['State', 'County', 'COVIND']

# Cut every measured column into national terciles (Low / Medium / High) and
# put the identifying columns back alongside.
tercile_bins = {
    col: pd.qcut(df[col], 3, labels=['L', 'M', 'H'])
    for col in df.drop(label_cols, axis=1).columns
}
binned = pd.DataFrame(tercile_bins).join(df[label_cols])

Here is a table of the five worst counties in Texas (based on high rates of unemployment, crime, and population). Half of them are border towns with immigration problems.

In [None]:
# Counties grouped by their (violent crime, unemployment, population) tercile.
TX = binned.dropna(how='all').groupby(['vcrime_rate', 'p_unempl', 'avgpop'])

# Take the counties that are High on all three, then keep only the Texan ones.
high_on_all = TX.get_group(('H', 'H', 'H')).index
worst_counties = df.loc[high_on_all]
display(worst_counties.where(df.State == 'TX').dropna())

Graph of how Texas matches up nationwide to crime

In [None]:
# Overlay the Texas distribution on the nationwide one.
texas_rates = df.where(df.State == 'TX').vcrime_rate.dropna()
national_rates = df.vcrime_rate.dropna()
sns.distplot(texas_rates, label="Violent Crime Rates in Texas")
sns.distplot(national_rates, axlabel="Violent Crime Rates in US and Texas")

Table of the highest crime rates in the US with at least a population of 10000 to cull outliers. 

Note that high city crime does not necessarily match high county crime. For example, Chicago is a high crime city, but because it's split between two counties it's ranked lower on this list. St. Louis, however, is both a city and its own county, so its data is more precise.

In [None]:
# Mask out counties with 10,000 residents or fewer, then list the 20 highest rates.
populous = df.where(df.avgpop > 10000)
display(populous.sort_values('vcrime_rate', ascending=False).head(20))

We grouped the data by violent crime rate, poverty rate, unemployment rate, and population.

We select counties with high rates of unemployment, violent crime, and poverty, and with large populations, sampled using a fixed random seed for consistency between runs.

In [None]:
# All counties grouped by the H/M/L level of each factor.
groups = ['vcrime_rate', 'p_impoverished', 'p_unempl', 'avgpop']

# Note the grouping uses the reversed factor order (population first).
c = binned.dropna(how='all').groupby(groups[::-1])

# Show only the combinations that contain more than 10 counties.
group_sizes = c.count()
display(
    group_sizes.where(group_sizes.State > 10)
    .dropna()
    .sort_values('State', ascending=False)['State']
    .unstack()
)

In [None]:

selection = ('H', 'H', 'H', 'H')

# Header row of factor names (truncated to 10 characters), then the chosen levels.
print(''.join("%10s " % name[:10] for name in groups))
print(''.join("%10s " % level[:10] for level in selection), end='')

# Counties High on every factor; sample ten of those above 800 crimes per 100,000.
HHHstates = df.loc[c.get_group(selection).index]
high_crime = HHHstates.where(HHHstates.vcrime_rate > 800).dropna()
display(high_crime.sample(10, random_state=15))

From those counties, we selected the county seats as the cities.
We looked up their latitude and longitude to match them to NOAA's list of weather stations. We used Pythagoras' theorem to find the closest station to each city, as some cities may not have one within city limits.

In [None]:
# (latitude, longitude) of each selected county seat, keyed by 'City, ST'.
cities = {
    'Philadelphia, PA' : (39.9526, -75.1652), # Philadelphia County
    'Albany, GA' : (31.5785, -84.1557), # Dougherty County
    'Memphis, TN' : (35.1495, -90.0490), # Shelby County and Crittenden County
    'Toledo, OH' : (41.6639, -83.5552), # Lucas County
    'Pine Bluff, AR' : (34.2284, -92.0032), # Jefferson County
    'Detroit, MI' : (42.3314, -83.0458), # Wayne County
    'Baltimore, MD' : (39.2904, -76.6122), # Baltimore City
    'Flint, MI' : (43.0125, -83.6875), # Genesee County
    'St. Louis, MO' : (38.6270, -90.1994) # St. Louis City
}

Then we cross-referenced those locations with the NOAA ISD dataset to find the nearest stations.

The stations were filtered such that we only selected stations that had recent (more recent than 2012) data.

In [None]:
# NOAA's station list: location, name, and the first and last recording dates.
hist = pd.read_csv('ftp://ftp.ncdc.noaa.gov/pub/data/noaa/isd-history.csv')

# Keep only stations whose last record (END, as a YYYYMMDD number) is after 1 Jan 2012.
is_recent = hist.END > 20120101
hist = hist.where(is_recent).dropna(how='all')

We defined some helper functions to process the station codes

In [None]:
def dist(a, b):
    """Return the straight-line (Euclidean) distance between two 2-D points.

    Args:
        a: Indexable pair of numbers, e.g. (latitude, longitude).
        b: Indexable pair of numbers in the same units as `a`.

    Returns:
        float: Distance in the same units as the inputs.
    """
    delta_first = a[0] - b[0]
    delta_second = a[1] - b[1]
    return math.sqrt(delta_first ** 2 + delta_second ** 2)

In [None]:
def format_station_code(usaf, wban):
    """Build the 'USAF-WBAN' station code from its two numeric identifiers.

    Both parts are converted with int() first (so float values such as
    720275.0 are accepted) and then left-padded with zeros: USAF to six
    characters, WBAN to five. Longer values are left untruncated.

    Args:
        usaf: USAF identifier, anything int() accepts.
        wban: WBAN identifier, anything int() accepts.

    Returns:
        str: The padded identifiers joined by a hyphen.
    """
    usaf_digits = str(int(usaf))
    wban_digits = str(int(wban))
    return "{:0>6}-{:0>5}".format(usaf_digits, wban_digits)

In [None]:
# Nearest ISD station code for every city, keyed by city name.
stations = dict()
for city, (lat, lon) in cities.items():
    # Distance in degrees from this city to every station at once. This is the
    # same Pythagorean formula as dist(), applied column-wise instead of row by
    # row with iterrows().
    distances = np.sqrt((hist['LAT'] - lat) ** 2 + (hist['LON'] - lon) ** 2)
    # idxmin() ignores stations with missing coordinates and keeps the first
    # station on ties.
    minindex = distances.idxmin()
    mindist = distances.loc[minindex]
    nearest = hist.loc[minindex]
    print('Nearest ({:^6.2f}) ISD to {:20} is {:40} at loc {}'.format(mindist, city, nearest['STATION NAME'], minindex))
    stations[city] = format_station_code(nearest['USAF'], nearest['WBAN'])
    print('\tStation code is {}'.format(stations[city]))

## Data Acquisition <a class="anchor" id="acquisition"></a>

[Back to Table of Contents](#toc)

Next we need to download the crime data for each city. We used SpotCrime to get the information, but we had to scrape their website.

First lets define some helper functions to download crime data

In [None]:
def get_soup(city):
    """Download and parse the two SpotCrime daily-listing pages for a city.

    Args:
        city (str): URL path fragment that is placed between
            'https://spotcrime.com/' and 'daily', e.g. 'mi/detroit/'.

    Returns:
        tuple: (BeautifulSoup of the '...daily' page,
                BeautifulSoup of the '...daily/more' page).
    """
    listing_url = 'https://spotcrime.com/' + city + 'daily'
    # Fetch both pages first, then parse them.
    responses = [requests.get(url) for url in (listing_url, listing_url + '/more')]
    first_page, more_page = [BeautifulSoup(resp.text, 'html.parser') for resp in responses]
    return first_page, more_page

In [None]:
def get_links(soups):
    """Collect the href of every link inside the 'list-unstyled' ordered lists.

    Args:
        soups: Sequence whose first two items are parsed pages (objects with a
            BeautifulSoup-style find_all method). Only those two are read.

    Returns:
        list: hrefs in document order, first page before second page.
    """
    return [
        anchor['href']
        for page in (soups[0], soups[1])
        for day_list in page.find_all('ol', class_='list-unstyled')
        for anchor in day_list.find_all('a')
    ]

In [None]:
def crime_df(links, base_url):
    """Download the crime table behind every link and return it as one DataFrame.

    Args:
        links (iterable of str): Paths appended to `base_url`, one page each.
        base_url (str): Site root, e.g. 'https://spotcrime.com'.

    Returns:
        DataFrame: One row per crime with columns
            ['A', 'Crime', 'Time', 'Address', 'Details']. Pages without a
            table are skipped. Four-cell rows get the placeholder 'A' in the
            first column.

    Raises:
        requests.exceptions.RequestException: If a page fails twice in a row.
    """
    data = []
    for i, link in enumerate(links):
        print(i, link)
        url = base_url + link
        try:
            # A timeout keeps one stalled connection from hanging the whole scrape.
            r = requests.get(url, timeout=30)
        except requests.exceptions.RequestException:
            # Only network-level failures are retried (once, after a pause);
            # KeyboardInterrupt and programming errors propagate immediately.
            print('uh oh, timeout')
            time.sleep(10)
            r = requests.get(url, timeout=30)

        soup = BeautifulSoup(r.text, 'html.parser')
        table = soup.find('table')
        if table is None:
            print('no table, skipping')
            continue
        for row in table.find_all('tr'):
            cells = [cell.text.strip() for cell in row.find_all('td')]
            crime = [text for text in cells if text]  # drop empty cells
            if len(crime) == 0:
                continue  # header / spacer row
            if len(crime) == 4:
                # Pad the missing leading cell so the row lines up with the five columns.
                crime = ['A'] + crime
            data.append(crime)
    return pd.DataFrame(data, columns=['A', 'Crime', 'Time', 'Address', 'Details'])

In [None]:
# Site root; the per-day links scraped from the listing pages are appended to it.
base_url = 'https://spotcrime.com'
# 'state/city/' path fragments of the SpotCrime listing pages to scrape.
# NOTE: this rebinds `cities`, which until here held the coordinate dict.
cities = [
    'mi/detroit/',
    'mo/st.+louis/',
    'md/baltimore/',
    'oh/toledo/',
    'ga/albany/',
    'mi/flint/',
    'tn/memphis/',
    'pa/philadelphia/',
]

Next we'll download all the crime data for each city and save it to a gzipped csv file in the data folder

In [None]:
if PROCESS_FULLY:
    for city in cities:
        # `city` is a 'st/name/' slug: characters 0-1 are the state and the
        # part between the two slashes is the city name.
        state, name = city[:2], city[3:-1]

        links = get_links(get_soup(city))
        # Use a dedicated name instead of `df`, so the scrape does not
        # overwrite the county-level table built earlier in the notebook.
        city_crime = crime_df(links, base_url).drop(['A', 'Address', 'Details'], axis=1)
        city_crime.to_csv('../data/crime_{}_{}.csv.gz'.format(name, state), compression='gzip', index=False)

For ease of use, we defined a class to hold the city datasets

In [None]:
class City:
    """Crime and weather data for one city.

    Attributes:
        name (str): Name of the city as 'City, ST' (2-letter state)
        filepath (str): Path of the gzipped crime CSV
        isd_code (str): Code for NOAA's ISD, composed of 'USAF-WBAN' ids
        dfc (DataFrame): All crime for the city
        dfv (DataFrame): Only the crimes listed in VIOLENT_CRIMES
        dfw (DataFrame): Weather data for the city
        df (DataFrame): Hourly merge of weather and violent crime counts
        all_crime (DataFrame): Alias for dfc
        violent_crime (DataFrame): Alias for dfv
        weather (DataFrame): Alias for dfw

    """

    # Categories kept in dfv. load_crime and process_crime share this list so
    # the lazily loaded frame and the processed frame use the same filter
    # (previously only process_crime included 'Arson').
    VIOLENT_CRIMES = ['Assault', 'Robbery', 'Shooting', 'Arson']

    def __init__(self, name, filepath, isd_code):
        self.name = name
        self.filepath = filepath
        self.isd_code = isd_code

    def _violent_only(self):
        """Return the rows of dfc whose Crime is in VIOLENT_CRIMES (rows with any NaN are dropped)."""
        return self.dfc.where(self.dfc.Crime.isin(self.VIOLENT_CRIMES)).dropna()

    def load_crime(self, process=True):
        """Reads the crime dataset from filepath and stores it in dfc and dfv

        Args:
            process (bool): Whether to process the data immediately (True,
                the default) or leave that to a later process_crime() call

        Returns:
            self
        """
        self.dfc = pd.read_csv(self.filepath, compression='gzip')
        self.dfv = self._violent_only()
        self.all_crime = self.dfc
        self.violent_crime = self.dfv
        if process:
            return self.process_crime()

        return self

    def process_crime(self, how='Fast'):
        """Converts the Time column to datetimes and makes it the index

        Args:
            how (str): 'Fast' parses with a fixed format and drops every row
                that does not match it (e.g. rows without a time of day).
                Any other value lets pandas infer the format, which is
                slower and keeps unparseable rows with a NaT timestamp.

        Returns:
            self

        """
        if how == 'Fast':
            self.dfc['Time'] = pd.to_datetime(self.dfc['Time'],
                                              format='%m/%d/%y. %I:%M %p.',
                                              errors='coerce')
            # Rows that did not match the format were coerced to NaT above
            self.dfc = self.dfc[self.dfc.Time.notnull()]
        else:
            self.dfc['Time'] = pd.to_datetime(self.dfc['Time'], errors='coerce')

        self.dfc = self.dfc.set_index('Time')
        self.dfv = self._violent_only()
        self.all_crime = self.dfc
        self.violent_crime = self.dfv

        return self

    def load_weather(self, start=2016, end=2018):
        """Loads weather from NOAA (via noaa.noaa_from_web) into dfw

        Args:
            start (int): Start year
            end (int): End year

        Returns:
            self

        """
        # .bfill() is the non-deprecated spelling of fillna(method='backfill').
        # NOTE(review): the backfill runs before the sentinel values below are
        # turned into NaN, so those NaNs are left unfilled -- confirm intended.
        self.dfw = noaa.noaa_from_web(self.isd_code, start, end).bfill()

        # drop relative humidity period
        self.dfw = self.dfw.drop('RHPeriod', axis = 1)

        # replace the "missing" sentinel values with NaN
        self.dfw['Temperature'] = self.dfw['Temperature'].replace(9999,np.nan)
        self.dfw['Pressure'] = self.dfw['Pressure'].replace(99999,np.nan)
        self.dfw['Humidity'] = self.dfw['Humidity'].replace(999, np.nan)
        self.dfw['Sky'] = self.dfw['Sky'].replace([9,99], np.nan)

        # scale values back (stored as tenths)
        self.dfw['Temperature'] = self.dfw['Temperature'] / 10
        self.dfw['Pressure'] = self.dfw['Pressure'] / 10

        # map sky oktas to coverage fractions, roughly
        self.dfw['Sky'] = self.dfw['Sky'] / 8

        # convert C to F
        self.dfw['Temperature'] = self.dfw['Temperature'] * 9/5 + 32
        self.weather = self.dfw

        return self

    def merge_dfs(self, start='2016-01-01', end='2017-01-01'):
        """Merges violent crime and weather into one hourly dataset, df

        Weather readings are averaged per hour and crimes are counted per
        hour. Only rows strictly after `start` and strictly before `end`
        are kept.

        Args:
            start (date string): Exclusive start of the range to keep
            end (date string): Exclusive end of the range to keep

        Returns:
            self

        """
        self.df = self.dfw.join(self.dfv, how='outer')
        # The rename below relies on the four group keys being unnamed, so
        # reset_index() labels them level_0 .. level_3.
        self.df = self.df.groupby(
                        [self.df.index.year,
                         self.df.index.month,
                         self.df.index.day,
                         self.df.index.hour]
                    ).agg ({
                         'Temperature' : 'mean',
                         'Pressure' : 'mean',
                         'Humidity' : 'mean',
                         'Sky' : 'mean',
                         'Crime' : 'count'}
                    ).reset_index().rename(columns={
                        'level_0':'year',
                        'level_1':'month',
                        'level_2':'day',
                        'level_3':'hour',
                    })
        # Rebuild a single hourly timestamp index from the four parts
        s = pd.to_datetime(self.df[['year', 'month', 'day', 'hour']])
        self.df = self.df.set_index(s).drop(['year', 'month', 'day', 'hour'],
                                    axis=1)

        self.df = self.df.loc[self.df.index > start]
        self.df = self.df.loc[self.df.index < end]

        return self

A dictionary to hold all the city classes, initialized but not loaded

In [None]:
# One City per analysed city, keyed by display name. Each is built from
# (name, path of the scraped crime CSV, 'USAF-WBAN' ISD station code).
# NOTE: this rebinds `cities` once more; from here on it maps names to City objects.
cities = {
    # Albany is left out: not enough data.
    # 'Albany, GA': City('Albany, GA', '../data/crime_albany_ga.csv.gz', '722160-13869'),
    'Baltimore, MD': City('Baltimore, MD', '../data/crime_baltimore_md.csv.gz', '745944-93784'),
    'Detroit, MI': City('Detroit, MI', '../data/crime_detroit_mi.csv.gz', '725375-14822'),
    'Flint, MI': City('Flint, MI', '../data/crime_flint_mi.csv.gz', '726370-14826'),
    'Memphis, TN': City('Memphis, TN', '../data/crime_memphis_tn.csv.gz', '723340-13893'),
    'Philadelphia, PA': City('Philadelphia, PA', '../data/crime_philadelphia_pa.csv.gz', '724080-13739'),
    # Pine Bluff is left out: no data.
    # 'Pine Bluff AR':
    'St. Louis, MO': City('St. Louis, MO', '../data/crime_st.+louis_mo.csv.gz', '725314-03960'),
    'Toledo, OH': City('Toledo, OH', '../data/crime_toledo_oh.csv.gz', '720275-04872')
}

Load the crime data into the classes

In [None]:
# Each step stores its result on the City object, so the calls can be issued
# one after another instead of chained.
for city in cities.values():
    city.load_crime()
    city.load_weather()
    city.merge_dfs()

## Data Analysis <a class="anchor" id="analysis"></a>

[Back to Table of Contents](#toc)

Now that we've loaded all our data, we can begin to analyse it. First let's define some hypotheses.

### Hypotheses <a class="anchor" id="hypothesis"></a>
* Temperature is positively correlated with the violent crime rates (Richard)
* Humidity and pressure have no impact on violent crime
* Violent crime is higher in summer vs winter (Lalo)
* Violent crime is higher during midnight hours (11:00pm - 2:00am) (Lexi)
* Is there a spike in total crime at 2:00AM because bars close? (Laxo)

[Back to Table of Contents](#toc)

### Exploration <a class="anchor" id="exploration"></a>

[Back to Table of Contents](#toc)

We need to explore the data first.

There are two ways to parse the time, by dropping non-listed times or converting them to default to 00:00. We chose to use by default dropping incomplete data, but we wanted to have a metric of how much data we lost.

Below is a comparison of the amount of entries lost as a percentage of the whole for each city, along with the times required to parse it. Dropping dates is considerably faster.

Because of the amount of lost data, we chose to particularly examine Philadelphia and Memphis.

In [None]:
if PROCESS_FULLY:
    datalosses = pd.DataFrame()

    for city in cities.values():
        print(city.name)

        timings = {}
        counts = {}
        # 'Slow' first, 'Fast' last, so every city ends up in the default
        # ('Fast') processed state, as the rest of the notebook expects.
        for how in ('Slow', 'Fast'):
            city.load_crime(False)  # fresh, unprocessed copy for each run
            t1 = time.time()
            city.process_crime(how=how)
            timings[how] = time.time() - t1
            counts[how] = city.dfc.Crime.value_counts()

        print('Slow: {} Fast: {}'.format(timings['Slow'], timings['Fast']))
        # Fraction of entries per crime type that survive the fast parse.
        datalosses[city.name] = counts['Fast'] / counts['Slow']

    # A bare expression inside an `if` block is not rendered by Jupyter, so
    # the table has to be displayed explicitly.
    display(datalosses)

Below is a quick correlation of temperature on the X axis and violent crime incidences in the hour measured at that temperature on the Y axis. There is somewhat of a positive correlation between the largest values in some cities, but it's completely swamped on the lower end by low values.

In [None]:
for city in cities.values():
    # Dedicated name, so the notebook-wide `df` is not overwritten.
    temp_vs_crime = city.df[['Temperature', 'Crime']].dropna()
    grid = sns.jointplot(x='Temperature', y='Crime', data=temp_vs_crime)
    # jointplot draws a figure with several axes; plt.title() would label only
    # whichever axis was created last. Title the whole figure instead.
    grid.ax_joint.figure.suptitle(city.name, y=1.02)

A distribution of the number of crimes of each type in each city

In [None]:
# One bar chart per city: number of recorded crimes of each type.
for city in cities.values():
    print(city.name)
    type_counts = city.dfc.Crime.value_counts()
    type_counts.plot(kind='bar')
    plt.title(city.name)
    plt.show()

Perhaps there is a correlation between the day of the week and criminal activity. Perhaps on the weekend, with more free time, criminals may become violent instead of working at their jobs.

There does not appear to be a correlation

In [None]:
# Shorthand for the Philadelphia City object, used by the cells that follow.
phili = cities['Philadelphia, PA']

In [None]:
# Philadelphia, PA -- violent crime only.
# Count incidents per weekday straight from the timestamp index
# (0 = Monday ... 6 = Sunday). reindex() keeps all seven days in calendar
# order, showing a zero bar for a weekday with no incidents.
weekday_counts = (
    pd.Series(phili.dfv.index.dayofweek)
    .value_counts()
    .reindex(range(7), fill_value=0)
)

fig, ax = plt.subplots()
ax.bar(range(7), weekday_counts.values, align='center', color='b')
ax.set_xticks(range(7))
ax.set_xticklabels(list(calendar.day_name))
ax.set_title('Philadelphia, PA Violent Crimes per Day of the Week')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Total Violent Crimes')
plt.show()

It may be worthwhile to examine the total crime, and not violent crime for weekend activity.

In [None]:
# Philadelphia, PA -- all recorded crime, not just violent crime.
# Count incidents per weekday straight from the timestamp index
# (0 = Monday ... 6 = Sunday). reindex() keeps all seven days in calendar
# order, showing a zero bar for a weekday with no incidents.
weekday_counts = (
    pd.Series(phili.dfc.index.dayofweek)
    .value_counts()
    .reindex(range(7), fill_value=0)
)

fig, ax = plt.subplots()
ax.bar(range(7), weekday_counts.values, align='center', color='b')
ax.set_xticks(range(7))
ax.set_xticklabels(list(calendar.day_name))
# The title names the data set so this chart is not confused with the violent-crime one.
ax.set_title('Philadelphia, PA All Crimes per Day of the Week')
ax.set_xlabel('Day of the Week')
ax.set_ylabel('Total Crimes')
plt.show()

There appears to be a tendency to commit crimes during the week. Since this includes burglary and theft, this may be because criminals are more likely to commit crimes when they know their victims are away at work or school.

### Hypothesis 1: Violent crime is correlated with higher temperatures 

We want to see how violent crime is correlated with temperature. We can find the mean temperature of a city, and posit that when the temperature is unusually high (more than 1 standard deviation above the mean), the number of crimes is also higher.

* let p = the average crimes per hour in a city in a sample with higher temperature than average
* let mu be the average crimes per hour over a year
* let h_0 : p = mu and h_a : p > mu

In [None]:
alpha = 0.01
n = 100

# Null value: the overall mean number of crimes per hour.
mu = phili.df.Crime.mean()
p0 = mu

tempmu = phili.df.Temperature.mean()
stdtemp = phili.df.Temperature.std()

# Sample n hours that were more than 1 std hotter than the mean temperature.
is_hot = phili.df.Temperature > tempmu + 1 * stdtemp
sample = phili.df.Crime[is_hot].sample(n, random_state=57)
p = sample.mean()

# One-sided z test of the sample mean against p0.
S = stats.tstd(sample)
z = (p - p0) / (S / np.sqrt(n))
z_alpha = stats.norm().ppf(1 - alpha)

if z > z_alpha:
    verdict = "Since Z={0:4.2f} > Z_alpha={1:4.2f}, in Philadelpha can we reject H_0"
else:
    verdict = "Since Z={0:4.2f} < Z_alpha={1:4.2f}, in Philadelpha we cannot reject H_0"
print(verdict.format(z, z_alpha))

What about for abnormally low temperatures?

In [None]:
mu = phili.df.Crime.mean()
n = 100
p0 = mu  # null value: overall mean crimes per hour
stdtemp = phili.df.Temperature.std()
tempmu = phili.df.Temperature.mean()
alpha = 0.01

# Select only hours that are more than 1 std BELOW the mean temperature.
# (The comparison used to be `>`, which selected every hour that was not
# abnormally cold -- the opposite of what this cell is meant to test.)
is_cold = phili.df.Temperature < tempmu - 1 * stdtemp
sample = phili.df.Crime[is_cold].sample(n, random_state=57)
p = sample.mean()

# One-sided z test of the sample mean against p0.
S = stats.tstd(sample)
z = (p-p0)/(S/np.sqrt(n))
z_alpha = stats.norm().ppf(1-alpha)


if z > z_alpha:
    print("Since Z={0:4.2f} > Z_alpha={1:4.2f}, in Philadelpha can we reject H_0".format(z, z_alpha))
else :
    print("Since Z={0:4.2f} < Z_alpha={1:4.2f}, in Philadelpha we cannot reject H_0".format(z, z_alpha))

### Hypothesis 2: Violent crime is not correlated with humidity or pressure

Let's try to compare violent crime with abnormal pressure

* let p = the average crimes per hour in a city in a sample with higher or lower pressure than average
* let mu be the average crimes per hour over a year
* let h_0 : p = mu and h_a : p ≠ mu (two-sided, since the sample contains both abnormally high and abnormally low pressure)

In [None]:
mu = phili.df.Crime.mean()
n = 100
p0 = mu  # null value: overall mean crimes per hour
stdpressure = phili.df.Pressure.std()
pressuremu = phili.df.Pressure.mean()
alpha = 0.01
half_alpha = alpha/2  # two-sided test

# Select only hours whose pressure is more than 1 std away from the mean
# pressure, in either direction. The lower bound used to be computed from `mu`
# (the mean crime count), so no low-pressure hour was ever selected.
is_abnormal = (
    (phili.df.Pressure > pressuremu + 1 * stdpressure)
    | (phili.df.Pressure < pressuremu - 1 * stdpressure)
)
sample = phili.df.Crime[is_abnormal].sample(n, random_state=57)
p = sample.mean()

S = stats.tstd(sample)
z = (p-p0)/(S/np.sqrt(n))
z_half_alpha = stats.norm().ppf(1-half_alpha)


# The messages are labelled |Z|, so print the absolute value.
if np.abs(z) > z_half_alpha:
    print("Since |Z|={0:4.2f} > Z_half_alpha={1:4.2f}, in Philadelpha can we reject H_0".format(np.abs(z), z_half_alpha))
else :
    print("Since |Z|={0:4.2f} < Z_half_alpha={1:4.2f}, in Philadelpha we cannot reject H_0".format(np.abs(z), z_half_alpha))

What about humidity?

* let p = the average crimes per hour in a city in a sample with higher humidity than average
* let mu be the average crimes per hour over a year
* let h_0 : p = mu and h_a : p > mu

In [None]:
mu = phili.df.Crime.mean()
n = 100
p0 = mu  # null value: overall mean crimes per hour
stdhu = phili.df.Humidity.std()
humu = phili.df.Humidity.mean()
alpha = 0.01

# Select only hours that are more than 1 std above the mean humidity.
# (The filter used to compare the Temperature column against the humidity
# threshold, so the sample had nothing to do with humidity.)
is_humid = phili.df.Humidity > humu + 1 * stdhu
sample = phili.df.Crime[is_humid].sample(n, random_state=57)
p = sample.mean()

# One-sided z test of the sample mean against p0.
S = stats.tstd(sample)
z = (p-p0)/(S/np.sqrt(n))
z_alpha = stats.norm().ppf(1-alpha)


if z > z_alpha:
    print("Since Z={0:4.2f} > Z_alpha={1:4.2f}, in Philadelpha can we reject H_0".format(z, z_alpha))
else :
    print("Since Z={0:4.2f} < Z_alpha={1:4.2f}, in Philadelpha we cannot reject H_0".format(z, z_alpha))

One of the references suggested that drier days correlate with more crime

In [None]:
mu = phili.df.Crime.mean()
n = 100
p0 = mu  # null value: overall mean crimes per hour
stdhu = phili.df.Humidity.std()
humu = phili.df.Humidity.mean()
alpha = 0.01

# Select only hours that are more than 1 std below the mean humidity (dry hours).
# (The filter used to compare the Temperature column against the humidity
# threshold, so the sample had nothing to do with humidity.)
is_dry = phili.df.Humidity < humu - 1 * stdhu
sample = phili.df.Crime[is_dry].sample(n, random_state=57)
p = sample.mean()

# One-sided z test of the sample mean against p0.
S = stats.tstd(sample)
z = (p-p0)/(S/np.sqrt(n))
z_alpha = stats.norm().ppf(1-alpha)


if z > z_alpha:
    print("Since Z={0:4.2f} > Z_alpha={1:4.2f}, in Philadelpha can we reject H_0".format(z, z_alpha))
else :
    print("Since Z={0:4.2f} < Z_alpha={1:4.2f}, in Philadelpha we cannot reject H_0".format(z, z_alpha))

### Hypothesis 3: Violent crime is higher in summer than in winter

[Back to Table of Contents](#toc)

In [None]:
# Look the two cities up here, so the cell runs on a fresh kernel; it used to
# rely on `blt` and `stl` already existing from an out-of-order execution.
blt = cities['Baltimore, MD']
stl = cities['St. Louis, MO']

for label, city in (('Baltimore, MD', blt), ('St. Louis, MO', stl)):
    # Correlation matrix between temperature, pressure and hourly crime count.
    corr = city.df[['Temperature', 'Pressure', 'Crime']].corr()
    for kind in ('box', 'bar', 'line'):
        corr.plot(kind=kind, title=label)  # create graph of corr
    print(label, corr)  # print chart of corr

In [None]:
blt = cities['Baltimore, MD']
# Reload so the cell also works when run on its own; only blt.dfv is used below.
blt.load_crime().load_weather().merge_dfs()
seasonalDFV = blt.dfv

# Calendar months belonging to each season.
# NOTE(review): the seasons differ in length (summer has 4 months, fall 2), so
# raw counts are not directly comparable between seasons -- confirm intended.
season_months = {
    'Spring': [3, 4, 5],
    'Summer': [6, 7, 8, 9],
    'Fall': [10, 11],
    'Winter': [12, 1, 2],
}
season_order = ['Spring', 'Summer', 'Fall', 'Winter']

season_counts = []
for season in season_order:
    in_season = seasonalDFV.index.month.isin(season_months[season])
    counts = seasonalDFV.loc[in_season, 'Crime'].value_counts()
    # errors='ignore': a season without any arson must not raise KeyError
    # (the winter drop used to be commented out for exactly that reason).
    # Naming the series here avoids depending on what value_counts() calls
    # its result, which differs between pandas versions.
    season_counts.append(counts.drop('Arson', errors='ignore').rename(season))

# One column per season, with the rows being the crime types seen in spring.
crimesCountBySeason = (
    pd.concat(season_counts, axis=1, keys=season_order)
    .reindex(season_counts[0].index)
)
crimesCountBySeason.index.name = 'Crime'

crimesCountBySeason.plot.bar()

In [None]:
phili = cities['Philadelphia, PA']
phili.load_crime().load_weather().merge_dfs()
seasonalDFV = phili.dfv

springStartMonth = 3
springEndMonth = 5

summerStart = 6
summerEnd = 9
fallStart = 10
fallEnd = 11
winterStart =12
winterEnd = 3


SpringMask = (seasonalDFV.index.month >= springStartMonth) & (seasonalDFV.index.month <= springEndMonth) 
summerMask =(seasonalDFV.index.month >= summerStart ) & (seasonalDFV.index.month <= summerEnd)
fallMask= (seasonalDFV.index.month >= fallStart) & (seasonalDFV.index.month <= fallEnd)
wintMask = (seasonalDFV.index.month < winterEnd) | (seasonalDFV.index.month >= winterStart) 

winterDF = pd.DataFrame(seasonalDFV.loc[wintMask].Crime.value_counts())
springDF = pd.DataFrame(seasonalDFV.loc[SpringMask].Crime.value_counts())
summerDF = pd.DataFrame(seasonalDFV.loc[summerMask].Crime.value_counts())
fallDF = pd.DataFrame(seasonalDFV.loc[fallMask].Crime.value_counts())

#winterDF = winterDF.drop('Arson')
springDF = springDF.drop('Arson')
summerDF = summerDF.drop('Arson')
fallDF = fallDF.drop('Arson')

winterDF = winterDF.rename(columns={'Crime':'Winter'})
summerDF = summerDF.rename(columns={'Crime':'Summer'})
springDF = springDF.rename(columns={'Crime':'Spring'})
fallDF = fallDF.rename(columns={'Crime':'Fall'})

fallDF = fallDF.merge(winterDF, left_index=True, right_index=True,how='left')
springDF = springDF.merge(summerDF,left_index=True,right_index=True,how='left')
springDF = springDF.merge(fallDF,left_index=True,right_index=True,how='left')
crimesCountBySeason = springDF
crimesCountBySeason.index.name = 'Crime'

crimesCountBySeason.plot.bar()

### Hypothesis 4: violent crime is higher during the midnight hours of 11:00PM to 2:00AM

[Back to Table of Contents](#toc)

Here is the distribution of violent crime in Philadelphia by hour of the day. Is crime higher during the midnight hours?

In [None]:
# Number of Philadelphia crime records in each hour of the day (0-23).
hourly_counts = phili.dfv.Crime.groupby(phili.dfv.index.hour).count()
hourly_counts.plot(kind='bar')
plt.title("Philadelphia hourly violent crime")

In [None]:
# Full Philadelphia frame, then three time-of-day slices of it.
df4 = phili.dfv
df = df4.between_time('23:00', '23:59')    # last hour before midnight
dff = df4.between_time('00:00', '02:00')   # midnight up to 2:00AM
df3 = df4.between_time('02:01', '22:59')   # remaining hours of the day

In [None]:
# Per-column record totals for the two halves of the midnight window.
before_midnight_total = df.groupby(df.index.hour).count().sum()
after_midnight_total = dff.groupby(dff.index.hour).count().sum()

# Midnight-window total versus the total for the remaining hours.
phMidHo = before_midnight_total + after_midnight_total
phOtherHo = df3.groupby(df3.index.hour).count().sum()

In [None]:
# Per-column totals labelled "Memphis", printed next to the Philadelphia ones.
# NOTE(review): no Memphis data is loaded in the visible cells. When the
# notebook runs top to bottom, `df` and `df3` are slices of phili.dfv
# (Philadelphia) and `df2` was last assigned from stl.df (St. Louis), so the
# "Memphis" figures below do not appear to describe Memphis. Confirm, and build
# these totals from Memphis frames before relying on the printed numbers.
mMidHo = df.groupby(df.index.hour).count().sum() + df2.groupby(df2.index.hour).count().sum()
mOtherHo = df3.groupby(df3.index.hour).count().sum()
# [0] picks the first entry of each per-column total.
# NOTE(review): presumably a positional lookup; verify on the pandas version in use.
print("Philadelphia Midnight Hours : {}, Memphis Midnight Hours : {}".format(phMidHo[0], mMidHo[0]))
print("Philadelphia All Other Hours : {}, Memphis All Other Hours : {}".format(phOtherHo[0], mOtherHo[0]))

If there is no correlation between the time of day and crime, crimes should be spread evenly over all hours. Because the midnight window (11:00PM to 2:00AM) covers 3 of the 24 hours, it should account for $$\frac{3}{24} = \frac{1}{8}$$ of all crimes.

Let $p$ = proportion of crimes committed during the midnight hours

Let $H_0 : p = \frac{1}{8}$, and $H_a : p > \frac{1}{8}$

In [None]:
# One-sided, one-proportion z-test on a random sample of Philadelphia crimes:
#   H_0: p = p0   versus   H_a: p > p0
# where p is the share of crimes committed between 11:00PM and 2:00AM.
n = 1000
# The 23:00-02:00 window covers 3 of the 24 hours of the day, so crimes spread
# evenly over the day would put 3/24 of them inside the window.
p0 = 3/24

# Fixed random_state keeps the sample (and the test result) reproducible.
sample = phili.dfv.sample(n, random_state=31)
# A start_time later than the end_time makes between_time wrap around
# midnight, so one call covers the whole window without a gap before 00:00.
midnight = sample.between_time(start_time='23:00', end_time='2:00')

# len() counts sampled rows inside the window. Summing DataFrame.count() would
# add the non-null cells of every column and inflate the proportion.
# Assumes each row of phili.dfv is one reported crime -- TODO confirm.
pHat = len(midnight)/n
# Standard error and mean of pHat under H_0.
sigma = np.sqrt((p0 * (1-p0)/n))
mu = p0

z = (pHat-p0)/sigma
alpha = 0.01
# Critical value of the standard normal for a right-tailed test at level alpha.
z_alpha = stats.norm().ppf(1-alpha)

# Reject H_0 only when the statistic lies strictly beyond the critical value.
if z > z_alpha:
    print("Since Z={0:4.2f} > Z_alpha={1:4.2f}, in Philadelphia we can reject H_0".format(z, z_alpha))
else:
    print("Since Z={0:4.2f} <= Z_alpha={1:4.2f}, in Philadelphia we cannot reject H_0".format(z, z_alpha))

### Hypothesis 5: There is an increase in crimes committed during the time that bars close

[Back to Table of Contents](#toc)

There doesn't appear to be any correlation between the bar closing time and incidence of crime in Philadelphia

In [None]:
# Philadelphia records between 1:00AM and 3:00AM, around bar closing time.
df = phili.dfc.between_time('01:00', '03:00')
# Per-hour record counts inside that window, drawn as green bars.
closing_time_counts = df.groupby(df.index.hour).count()
closing_time_counts.plot(kind='bar', title='Philadelphia, PA', color='g')

### Results and Conclusions <a class="anchor" id="results"></a>

[Back to Table of Contents](#toc)