## Portland Crime Data - In-class Exercise
### Instructions
+ ~~Import Dependencies~~
+ ~~Reference the file where the CSV is located~~
+ ~~Import the data into a Pandas DataFrame~~
+ ~~Check to see if there are any values with mispelled or similar values in "Offense Type"~~
+ ~~Combining similar offenses together~~
+ ~~Create a new DataFrame that looks into a specific neighborhood~~

In [1]:
#Standard Imports

import pandas as pd

In [2]:
#Read in file and create dataframe

file = '/Users/gta/dev/crime_incident_data2017.csv'
df = pd.read_csv(file)

#Find a distrubution of all offenses.

df['Offense Type'].value_counts()

Theft From Motor Vehicle                       7256
All Other Larceny                              4900
Motor Vehicle Theft                            4799
Vandalism                                      4189
Burglary                                       2901
Simple Assault                                 2453
Shoplifting                                    2332
Identity Theft                                 2040
Intimidation                                   1678
Aggravated Assault                             1217
Drug/Narcotic Violations                       1142
Theft of Motor Vehicle Parts or Accessories    1107
False Pretenses/Swindle/Confidence Game         972
Theft From Building                             961
Robbery                                         680
Counterfeiting/Forgery                          470
Weapons Law Violations                          298
Credit Card/ATM Fraud                           264
Rape                                            228
Arson       

In [3]:
#Determine head of dataframe (raw data)

df.head()

Unnamed: 0,Address,Case Number,Crime Against,Neighborhood,Number of Records,Occur Date,Occur Month Year,Occur Time,Offense Category,Offense Count,Offense Type,Open Data Lat,Open Data Lon,Open Data X,Open Data Y,Report Date,Report Month Year
0,,17-X4762181,Person,,1,1/1/96,1/1/96,800,Sex Offenses,1,Rape,,,,,1/26/17,1/1/17
1,,17-X4757824,Property,Centennial,1,1/20/00,1/1/00,1615,Fraud Offenses,1,Identity Theft,,,,,1/20/17,1/1/17
2,200 BLOCK OF SE 78TH AVE,17-900367,Property,Montavilla,1,12/1/03,12/1/03,800,Fraud Offenses,1,False Pretenses/Swindle/Confidence Game,45.5207,-122.583,7668150.0,682825.0,1/9/17,1/1/17
3,,17-X4748982,Property,Southwest Hills,1,1/1/10,1/1/10,0,Fraud Offenses,1,Identity Theft,,,,,1/5/17,1/1/17
4,,17-X4748982,Property,Southwest Hills,1,1/1/10,1/1/10,0,Larceny Offenses,1,All Other Larceny,,,,,1/5/17,1/1/17


In [42]:
#Display all columns in dataframe.

df.columns

Index(['Address', 'Case Number', 'Crime Against', 'Neighborhood',
       'Number of Records', 'Occur Date', 'Occur Month Year', 'Occur Time',
       'Offense Category', 'Offense Count', 'Offense Type', 'Open Data Lat',
       'Open Data Lon', 'Open Data X', 'Open Data Y', 'Report Date',
       'Report Month Year', 'Clean_offense_type'],
      dtype='object')

In [4]:
#Find distrubution of Sex Offenses

new_df = df[df['Offense Category'] == "Sex Offenses"]
new_df['Offense Type'].value_counts()

Rape                             228
Fondling                         147
Sodomy                            51
Sexual Assault With An Object     10
Name: Offense Type, dtype: int64

In [5]:
#Consolidate the sex-realted offenses into one category.

df['Offense Type'] = df['Offense Type'].replace({   'Rape': 'Sex-Related', 
                                                    'Prostitution': 'Sex-Related', 
                                                    'Fondling': 'Sex-Related', 
                                                    'Sodomy': 'Sex-Related',
                                                    'Statutory Rape': 'Sex-Related',
                                                    'Pornography/Obscene Material': 'Sex-Related',
                                                    'Sexual Assault With An Object':'Sex-Related',
                                                    'Commercial Sex Acts': 'Sex-Related'
                                                })
#Determine head of resulting dataframe.

df.head()

Unnamed: 0,Address,Case Number,Crime Against,Neighborhood,Number of Records,Occur Date,Occur Month Year,Occur Time,Offense Category,Offense Count,Offense Type,Open Data Lat,Open Data Lon,Open Data X,Open Data Y,Report Date,Report Month Year
0,,17-X4762181,Person,,1,1/1/96,1/1/96,800,Sex Offenses,1,Sex-Related,,,,,1/26/17,1/1/17
1,,17-X4757824,Property,Centennial,1,1/20/00,1/1/00,1615,Fraud Offenses,1,Identity Theft,,,,,1/20/17,1/1/17
2,200 BLOCK OF SE 78TH AVE,17-900367,Property,Montavilla,1,12/1/03,12/1/03,800,Fraud Offenses,1,False Pretenses/Swindle/Confidence Game,45.5207,-122.583,7668150.0,682825.0,1/9/17,1/1/17
3,,17-X4748982,Property,Southwest Hills,1,1/1/10,1/1/10,0,Fraud Offenses,1,Identity Theft,,,,,1/5/17,1/1/17
4,,17-X4748982,Property,Southwest Hills,1,1/1/10,1/1/10,0,Larceny Offenses,1,All Other Larceny,,,,,1/5/17,1/1/17


In [6]:
#Filer to find all offenses in one neighborhood -- Downtown and return distrubution.

df_neighborhood = df[df['Neighborhood'] == 'Downtown']
df_neighborhood['Offense Type'].value_counts()

Theft From Motor Vehicle                       640
All Other Larceny                              339
Vandalism                                      285
Shoplifting                                    255
Simple Assault                                 203
Intimidation                                   152
Identity Theft                                 135
Aggravated Assault                             103
Drug/Narcotic Violations                        92
Theft From Building                             89
Motor Vehicle Theft                             85
Robbery                                         75
Burglary                                        72
False Pretenses/Swindle/Confidence Game         63
Sex-Related                                     53
Counterfeiting/Forgery                          35
Theft of Motor Vehicle Parts or Accessories     29
Credit Card/ATM Fraud                           24
Weapons Law Violations                          17
Purse-Snatching                

In [7]:
#Group the following into one category within "Offense Type"
#Write function to make the substitution. 
#Problem is that if the value is not found in offense type, the function returns "None."

def category_combined(c):
    if c['Offense Type'] == 'Rape':
        return 'Sex-Related'
    elif c['Offense Type'] == 'Prostitution':
        return 'Sex-Related'
    elif c['Offense Type'] == 'Fondling':
        return 'Sex-Related'
    elif c['Offense Type'] == 'Sodomy':
        return 'Sex-Related'
    elif c['Offense Type'] == 'Statutory Rape':
        return 'Sex-Related'
    elif c['Offense Type'] == 'Pornography/Obscene Material ':
        return 'Sex-Related'
    elif c['Offense Type'] == 'Sexual Assault With An Object':
        return 'Sex-Related'
    elif c['Offense Type'] == 'Commercial Sex Acts':
        return 'Sex-Related'

#This line will apply the function to the Pandas series and return the results in a new column, thereby preserving the data
#in the original series. 

df['Clean_offense_type'] = df.apply(category_combined, axis = 1)
df.head()

Unnamed: 0,Address,Case Number,Crime Against,Neighborhood,Number of Records,Occur Date,Occur Month Year,Occur Time,Offense Category,Offense Count,Offense Type,Open Data Lat,Open Data Lon,Open Data X,Open Data Y,Report Date,Report Month Year,Clean_offense_type
0,,17-X4762181,Person,,1,1/1/96,1/1/96,800,Sex Offenses,1,Sex-Related,,,,,1/26/17,1/1/17,
1,,17-X4757824,Property,Centennial,1,1/20/00,1/1/00,1615,Fraud Offenses,1,Identity Theft,,,,,1/20/17,1/1/17,
2,200 BLOCK OF SE 78TH AVE,17-900367,Property,Montavilla,1,12/1/03,12/1/03,800,Fraud Offenses,1,False Pretenses/Swindle/Confidence Game,45.5207,-122.583,7668150.0,682825.0,1/9/17,1/1/17,
3,,17-X4748982,Property,Southwest Hills,1,1/1/10,1/1/10,0,Fraud Offenses,1,Identity Theft,,,,,1/5/17,1/1/17,
4,,17-X4748982,Property,Southwest Hills,1,1/1/10,1/1/10,0,Larceny Offenses,1,All Other Larceny,,,,,1/5/17,1/1/17,


In [8]:
#This should return an empty series since all of the values in Offense Type was replaced with using the 
#Pandas replace method and the dictionary that was developed. 

df['Clean_offense_type'].value_counts()

Series([], Name: Clean_offense_type, dtype: int64)

In [17]:
#Determine distrubution of unique neighborhoods in dataset.

df['Neighborhood'].value_counts()

Downtown                2802
Hazelwood               2616
Lents                   1502
Powellhurst-Gilbert     1467
Old Town/Chinatown      1311
Centennial              1273
Northwest               1167
Montavilla              1081
Pearl                   1021
Lloyd                    953
Buckman West             911
Parkrose                 894
Eliot                    735
Richmond                 685
St Johns                 666
Cully                    656
Kerns                    652
Sunnyside                646
Sellwood-Moreland        635
Hosford-Abernethy        627
Mill Park                599
Goose Hollow             590
Brentwood-Darlington     560
Argay                    516
Foster-Powell            510
Madison South            504
Creston-Kenilworth       489
Buckman East             488
Kenton                   436
Wilkes                   433
                        ... 
Grant Park               148
Cathedral Park           139
Sabin                    135
Vernon        

In [21]:
#Create new dataframe with three neighborhoods with the "loc" method.

df_select = df.loc[(df['Neighborhood'] == 'Downtown') | (df['Neighborhood'] == 'Montavilla') | (df['Neighborhood'] == 'Grant Park')]

In [20]:
#Determine distrubution of new data frame with three neighborhoods selected. 

df_select['Offense Type'].value_counts()

Theft From Motor Vehicle                       819
All Other Larceny                              508
Vandalism                                      411
Motor Vehicle Theft                            289
Shoplifting                                    271
Simple Assault                                 261
Burglary                                       219
Intimidation                                   197
Identity Theft                                 191
Aggravated Assault                             134
Drug/Narcotic Violations                       122
Theft From Building                            108
Robbery                                         92
False Pretenses/Swindle/Confidence Game         86
Theft of Motor Vehicle Parts or Accessories     80
Sex-Related                                     66
Counterfeiting/Forgery                          48
Credit Card/ATM Fraud                           30
Weapons Law Violations                          26
Purse-Snatching                

In [29]:
#Filter dataframe using the iloc method to pull out columns 101 to 120 and return columns 10 to 20. 

df.iloc[101:120, 10:20]

Unnamed: 0,Offense Type,Open Data Lat,Open Data Lon,Open Data X,Open Data Y,Report Date,Report Month Year,Clean_offense_type
101,Sex-Related,,,,,1/29/17,1/1/17,
102,All Other Larceny,45.4865,-122.589,7666362.0,670391.0,1/1/17,1/1/17,
103,All Other Larceny,45.5048,-122.638,7654119.0,677388.0,1/1/17,1/1/17,
104,All Other Larceny,45.5839,-122.686,7642377.0,706543.0,1/1/17,1/1/17,
105,All Other Larceny,45.5215,-122.579,7669271.0,683090.0,1/1/17,1/1/17,
106,All Other Larceny,45.5576,-122.541,7679274.0,696003.0,1/1/17,1/1/17,
107,All Other Larceny,45.4643,-122.654,7649403.0,662733.0,1/1/17,1/1/17,
108,Vandalism,45.5847,-122.711,7636044.0,707015.0,1/1/17,1/1/17,
109,Theft From Motor Vehicle,45.5355,-122.422,7709609.0,687189.0,1/2/17,1/1/17,
110,Motor Vehicle Theft,45.5572,-122.613,7660947.0,696334.0,1/2/17,1/1/17,


In [34]:
#Find all entries where the offense type was "Extortion/Blackmail and send output to a new dataframe.

df_blackmail = df.loc[df['Offense Type'] == 'Extortion/Blackmail']
df_blackmail.head()

Unnamed: 0,Address,Case Number,Crime Against,Neighborhood,Number of Records,Occur Date,Occur Month Year,Occur Time,Offense Category,Offense Count,Offense Type,Open Data Lat,Open Data Lon,Open Data X,Open Data Y,Report Date,Report Month Year,Clean_offense_type
896,4500 BLOCK OF NE MARTIN LUTHER KING JR BLVD,17-3877,Property,King,1,1/4/17,1/1/17,2044,Extortion/Blackmail,1,Extortion/Blackmail,45.556,-122.661,7648504.0,696194.0,1/4/17,1/1/17,
2597,UNKNOWN ADDRESS,17-18145,Property,,1,1/19/17,1/1/17,1403,Extortion/Blackmail,1,Extortion/Blackmail,,,,,1/19/17,1/1/17,
7031,,17-X4777270,Property,St Johns,1,2/15/17,2/1/17,1255,Extortion/Blackmail,1,Extortion/Blackmail,,,,,2/15/17,2/1/17,
8375,2800 BLOCK OF SE 48TH AVE,17-54898,Property,Richmond,1,2/23/17,2/1/17,1312,Extortion/Blackmail,1,Extortion/Blackmail,45.502,-122.613,7660381.0,676215.0,2/23/17,2/1/17,
11856,,17-X4796200,Property,Glenfair,1,3/15/17,3/1/17,846,Extortion/Blackmail,1,Extortion/Blackmail,,,,,3/15/17,3/1/17,


In [38]:
#Determine distrubtion of neighborhoods of new dataframe.
df_blackmail['Neighborhood'].value_counts()

Cully               1
Centennial          1
Collins View        1
Wilkes              1
Sullivan's Gulch    1
Montavilla          1
Richmond            1
St Johns            1
Mt Scott-Arleta     1
Pleasant Valley     1
Glenfair            1
King                1
Name: Neighborhood, dtype: int64

In [39]:
#Determine shape of new dataframe (i.e., how many different neighborhoods are included in the new df)
df_blackmail.shape

(13, 18)

In [77]:
#Determine distrubtion of times that offenses occur

df['Occur Time'].value_counts()

0       1258
1200    1205
2200    1185
2100     874
2000     855
2300     839
1700     803
1800     794
800      748
1900     733
1500     549
100      545
900      540
1600     532
1400     492
1000     471
1300     439
1        375
1100     367
700      366
200      365
1830     334
300      331
2230     300
1730     297
2130     284
1630     282
1930     263
2030     262
2330     258
        ... 
822        2
347        2
606        2
646        2
552        2
424        2
643        2
546        2
449        2
1051       2
553        2
719        2
703        2
709        2
642        2
706        2
536        2
319        2
551        1
543        1
936        1
534        1
533        1
628        1
929        1
621        1
609        1
102        1
523        1
658        1
Name: Occur Time, Length: 1439, dtype: int64

In [75]:
#Determine the neighborhoods with the most offenses and sort in decending order.

count_df = df.groupby(df['Neighborhood'])['Offense Count'].count()
count_df.sort_values(ascending=False).head(10)

Neighborhood
Downtown               2802
Hazelwood              2616
Lents                  1502
Powellhurst-Gilbert    1467
Old Town/Chinatown     1311
Centennial             1273
Northwest              1167
Montavilla             1081
Pearl                  1021
Lloyd                   953
Name: Offense Count, dtype: int64

In [90]:
#Determine the top 10 offenses in the Hazelwood neighborhood.

hazelwood_df = df.loc[df['Neighborhood'] == 'Hazelwood']
hazelwood_df['Offense Type'].value_counts().sort_values(ascending = False).head(10)

Shoplifting                                456
Motor Vehicle Theft                        343
Theft From Motor Vehicle                   253
All Other Larceny                          221
Vandalism                                  209
Simple Assault                             140
Burglary                                   130
Identity Theft                             115
Intimidation                               103
False Pretenses/Swindle/Confidence Game     91
Name: Offense Type, dtype: int64

In [94]:
#Determine the top 10 offenses in the Downtown neighborhood.
downtown_df = df.loc[df['Neighborhood'] == 'Downtown']
downtown_df['Offense Type'].value_counts().sort_values(ascending = False).head(10)

Theft From Motor Vehicle    640
All Other Larceny           339
Vandalism                   285
Shoplifting                 255
Simple Assault              203
Intimidation                152
Identity Theft              135
Aggravated Assault          103
Drug/Narcotic Violations     92
Theft From Building          89
Name: Offense Type, dtype: int64

In [97]:
#Determine the top 10 offenses in the Lents neighborhood.

lents_df = df.loc[df['Neighborhood'] == 'Lents']
lents_df['Offense Type'].value_counts().sort_values(ascending = False).head(10)

Motor Vehicle Theft         254
Theft From Motor Vehicle    179
All Other Larceny           166
Vandalism                   154
Simple Assault              114
Burglary                     86
Aggravated Assault           66
Identity Theft               64
Shoplifting                  62
Intimidation                 60
Name: Offense Type, dtype: int64

In [101]:
#Determine the top 10 offenses in the Old Town/Chinatown neighborhood.

chinatown_df = df.loc[df['Neighborhood'] == 'Old Town/Chinatown']
chinatown_df['Offense Type'].value_counts().sort_values(ascending=False).head(10)

Drug/Narcotic Violations    265
All Other Larceny           232
Simple Assault              133
Theft From Motor Vehicle    118
Vandalism                    96
Intimidation                 90
Aggravated Assault           80
Theft From Building          53
Identity Theft               35
Motor Vehicle Theft          33
Name: Offense Type, dtype: int64

In [109]:
#Find all neighborhoods that have the "Theft of Motor Vehicle Parts or Accessories" offense.

parts_df = df[df['Offense Type'] == 'Theft of Motor Vehicle Parts or Accessories']
parts_df['Neighborhood'].value_counts().head()

Hazelwood              82
Lents                  56
Montavilla             47
Powellhurst-Gilbert    37
Centennial             35
Name: Neighborhood, dtype: int64

In [162]:
#How many different neighborhoods are present in the data set?
df['Neighborhood'].value_counts().shape

(96,)

In [156]:
#What is the most frequent offense in each neighborhood?

s = df.groupby(['Neighborhood', 'Offense Type'])['Neighborhood'].count()

In [169]:
s.groupby(level=0).nlargest(1).sort_values(ascending = False)

Neighborhood          Neighborhood          Offense Type            
Downtown              Downtown              Theft From Motor Vehicle    640
Hazelwood             Hazelwood             Shoplifting                 456
Pearl                 Pearl                 Theft From Motor Vehicle    356
Northwest             Northwest             Theft From Motor Vehicle    345
Buckman West          Buckman West          Theft From Motor Vehicle    301
Old Town/Chinatown    Old Town/Chinatown    Drug/Narcotic Violations    265
Lents                 Lents                 Motor Vehicle Theft         254
Goose Hollow          Goose Hollow          Theft From Motor Vehicle    226
Powellhurst-Gilbert   Powellhurst-Gilbert   Motor Vehicle Theft         210
Centennial            Centennial            Motor Vehicle Theft         201
Eliot                 Eliot                 Theft From Motor Vehicle    197
Lloyd                 Lloyd                 Shoplifting                 188
Parkrose           