# UCPD Data Scrape

## Import Things

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

## Get Field Interview Data

In [2]:
# Get headers
#
# The column names are read once from a single archive page (offset=5); the
# loop further down downloads every offset again to collect the data rows.

url0 = ('https://incidentreports.uchicago.edu/fieldInterviewsArchive.php?' +
       'startDate=1433134800&endDate=1597208400&offset=5')

# Get url contents
# requests applies no timeout by default, so one is passed explicitly to keep
# a stalled connection from hanging the notebook; raise_for_status() makes an
# HTTP error fail here instead of being parsed as if it were data.
page0 = requests.get(url0, timeout=30)
page0.raise_for_status()

# Store contents
soup0 = bs(page0.text, 'lxml')

# Get table data
table_rows0 = soup0.find('table').find_all('tr')

# Header names are the text of the <th> cells in the first table row
headers0 = [i.text for i in table_rows0[0].find_all(['th'])]

# Use Loop to get data
data0 = []

# NOTE(review): 363 is a hard-coded number of offsets (0, 5, ..., 1810) for
# this date range -- confirm it still covers the whole archive before re-running.
for n in range(363):
    offset = n*5

    # Get url
    url = ('https://incidentreports.uchicago.edu/fieldInterviewsArchive.php?' +
           'startDate=1433134800&endDate=1597208400&offset=' + str(offset))

    # Get url contents
    page = requests.get(url, timeout=30)
    page.raise_for_status()

    # Store contents
    soup = bs(page.text, 'lxml')

    # Get table data
    table_rows = soup.find('table').find_all('tr')

    # Get data
    # Keep one list of cell texts per row. Rows without any cell are skipped
    # (indexing their first cell would raise IndexError), as are rows whose
    # first cell contains 'Date' (the header row) or 'field inter'.
    data_n = []
    for tr in table_rows:
        cells = tr.find_all(['td','th'])
        if not cells:
            continue
        if any(x in cells[0].text for x in ['Date','field inter']):
            continue
        data_n.append([i.text for i in cells])

    # extend() appends in place instead of rebuilding the whole list each page
    data0.extend(data_n)

### Create Data Frame

In [3]:
# Build the field-interview table: one record per scraped row, with the
# scraped header cells as column names.
df = pd.DataFrame(data=data0, columns=headers0)

# Export Raw Data (written without the index column)
df.to_csv('data_field_raw.csv', index=False)

# Last expression of the cell: display the frame
df

Unnamed: 0,Date/Time,Location,Initiated By,Race,Gender,Reason for Stop,Disposition,Search
0,6/3/2015 1:40 PM,1601 E 53rd,Citizen request for UCPD Response,African American,Female,Citizen observed subject having a verbal argum...,Name checked; no further action,No
1,6/3/2015 1:40 PM,1601 E 53rd,Citizen request for UCPD Response,African American,Male,Citizen observed subject having a verbal argum...,Name checked; no further action,No
2,6/4/2015 8:21 PM,5245 S Cottage Grove,Citizen request for UCPD Response,African American,Male,Complainant advised subject acted suspicious (...,Name checked; no further action,No
3,6/4/2015 10:01 PM,901 E 58th Street,UCPD,African American,Male,"Disturbance - Domestic Incident, prior inciden...",Name checked; no further action,No
4,6/4/2015 10:01 PM,901 E 58th Street,UCPD,African American,Female,"Disturbance - Domestic Incident, prior inciden...",Name checked; arrested by CPD,No
...,...,...,...,...,...,...,...,...
750,7/6/2020 1:24 AM,5472 S. Kenwood,UCPD,African American,Male,Matched description of armed robbery offenders,Released to parents,Yes
751,7/6/2020 1:24 AM,5472 S. Kenwood,UCPD,African American,Male,Matched description of armed robbery offenders,Released to parents,Yes
752,7/6/2020 1:24 AM,5472 S. Kenwood,UCPD,African American,Male,Matched description of armed robbery offenders,Released to parents,Yes
753,7/6/2020 1:24 AM,5472 S. Kenwood,UCPD,African American,Male,Matched description of armed robbery offenders,Released to parents,Yes


### Clean Data

In [4]:
# One Row has location (Ellis) in Date/Time, seems to have been on 8/6/2018
# na=False keeps the mask strictly boolean even if a Date/Time value is missing.

badRow = df['Date/Time'].str.contains('Ellis', na=False)

# Repair only when such a row is present. After this cell has run once, no
# Date/Time contains 'Ellis' any more, and taking .values[0] of the empty
# selection would raise IndexError -- the guard makes the cell safe to re-run.
if badRow.any():

    badData = df.loc[badRow,'Date/Time'].values[0]

    ## shift data over by 1 (every value moves one column to the right)

    df[badRow] = df[badRow].shift(1,axis=1)

    ## Fill in corrected data
    ## The time is everything up to and including "PM"; the location is the
    ## text after the single character that follows "PM".
    ## NOTE(review): assumes the fused value contains "PM" -- an "AM" time
    ## would not be split correctly.

    timeEnd = badData.find("PM") + 2

    df.loc[badRow,['Date/Time','Location']] = ['8/6/2018 '
                                               + badData[:timeEnd],
                                               badData[timeEnd + 1:]]

# Two more rows have data shifted to the left, shift to the right
# They are recognised by a gender value sitting in the Race column. Only the
# columns from the third through the third-from-last are shifted; the first
# of them becomes missing and the value in the last of them is pushed out.

badRow2 = df['Race'].isin(['Male','Female'])

shiftCols = df.columns[2:-2]

df.loc[badRow2,shiftCols] = df.loc[badRow2,shiftCols].shift(1,axis=1)

# Clean value of African American (a line break inside the scraped text)
df = df.replace(['African\nAmerican'],'African American')

# Export Clean Data
df.to_csv(r'data_field.csv',index = False)

### Calculate Statistics

In [5]:
# Tally field interviews by Race, as absolute counts and as shares.
# value_counts() leaves out missing values by default, so the shares are
# relative to the rows that have a Race value.
race_values = df['Race']
counts = race_values.value_counts()
percent = race_values.value_counts(normalize=True)

# Side-by-side summary table, displayed as the cell's output
pd.DataFrame(dict(counts=counts, percent=percent))

Unnamed: 0,counts,percent
African American,697,0.943166
Caucasian,30,0.040595
Hispanic,12,0.016238


## Get Traffic Stop Data

In [6]:
# Get headers
#
# The column names are read once from a single archive page (offset=5); the
# loop further down downloads every offset again to collect the data rows.

url1 = ('https://incidentreports.uchicago.edu/trafficStopsArchive.php?' +
       'startDate=1433134800&endDate=1597208400&offset=5')

# Get url contents
# requests applies no timeout by default, so one is passed explicitly to keep
# a stalled connection from hanging the notebook; raise_for_status() makes an
# HTTP error fail here instead of being parsed as if it were data.
page1 = requests.get(url1, timeout=30)
page1.raise_for_status()

# Store contents
soup1 = bs(page1.text, 'lxml')

# Get table data
table_rows1 = soup1.find('table').find_all('tr')

# Header names are the text of the <th> cells in the first table row
headers1 = [i.text for i in table_rows1[0].find_all(['th'])]

# Use Loop to get data
data1 = []

# NOTE(review): 987 is a hard-coded number of offsets (0, 5, ..., 4930) for
# this date range -- confirm it still covers the whole archive before re-running.
for n in range(987):
    offset = n*5

    # Get url
    url = ('https://incidentreports.uchicago.edu/trafficStopsArchive.php?' +
           'startDate=1433134800&endDate=1597208400&offset=' + str(offset))

    # Get url contents
    page = requests.get(url, timeout=30)
    page.raise_for_status()

    # Store contents
    soup = bs(page.text, 'lxml')

    # Get table data
    table_rows = soup.find('table').find_all('tr')

    # Get data
    # Keep one list of cell texts per row. Rows without any cell are skipped
    # (indexing their first cell would raise IndexError), as are rows whose
    # first cell contains 'Date' (the header row), 'traffic' or 'stops'.
    data_n = []
    for tr in table_rows:
        cells = tr.find_all(['td','th'])
        if not cells:
            continue
        if any(x in cells[0].text for x in ['Date','traffic','stops']):
            continue
        data_n.append([i.text for i in cells])

    # extend() appends in place instead of rebuilding the whole list each page
    data1.extend(data_n)

### Create Data Frame

In [7]:
# Build the traffic-stop table: one record per scraped row, with the scraped
# header cells as column names.
df1 = pd.DataFrame(data=data1, columns=headers1)

# Export Raw Data (written without the index column)
df1.to_csv('data_traffic_raw.csv', index=False)

# Last expression of the cell: display the frame
df1

Unnamed: 0,Date/Time,Location,Race,Gender,IDOT Classification,Reason for Stop,Citations/Violations,Disposition,Search
0,6/3/2015 9:26 AM,5600 S Stony Island,African American,Female,Traffic Sign/Signal,"Stop Sign Violation, Failed to Yield to Pedest...",,Verbal Warning,No
1,6/4/2015 5:44 PM,5900 S Ellis,African American,Female,Follow Too Close,Following too closely to vehicle stopped for p...,,Verbal Warning,No
2,6/27/2015 11:36 AM,6121 S Cottage Grove,African American,Male,Lane Violation,Subject was driving erratically and cut off an...,,Verbal Warning,No
3,7/11/2015 10:04 PM,5500 S Cottage Grove,African American,Male,Moving Violation - Other,Failed to yield to CFD firetruck,,Verbal Warning,No
4,7/11/2015 10:47 PM,5600 S Drexel,African American,Male,Lane Violation,Wrong way on one way road,,Verbal Warning,No
...,...,...,...,...,...,...,...,...,...
4473,8/10/2020 12:44 PM,6200 S. Greenwood,African American,Male,Traffic sign/signal,Disobey stop sign and failure to give right of...,,Verbal Warning,NO
4474,8/11/2020 9:06 PM,5300 S. University,African American,Male,Traffic sign/signal,Disobeyed Stop Sign,,Verbal Warning,NO
4475,8/11/2020 11:05 PM,1412 E. 53rd Street,African American,Male,Lane Violation,Wrong way on one way street,One Citation Issued,Citation Issued,NO
4476,8/12/2020 4:56 PM,1455 E. 53rd Street,African American,Male,Lane Violation,Wrong way on one way street,,Verbal Warning,NO


### Clean Data

In [8]:
# Two rows have location in Date/Time, happening between 1/24/17-1/31/17.
# They are recognised by a gender value sitting in the Race column.
badRow3 = df1['Race'].isin(['Male', 'Female'])

## Reorganize columns
## The columns from 'Race' to the last one are refilled, in order, from the
## columns listed here (first target takes the first listed source, etc.).
sourceCols = ['Location', 'Race',
              'Reason for Stop', 'Gender',
              'Search', 'IDOT Classification',
              'Citations/Violations']
df1.loc[badRow3, 'Race':] = df1.loc[badRow3, sourceCols].to_numpy()

## Set Date to arbitrary date between 1/24/17 and 1/31/17
## The fused value is split by position: characters 0-6 are kept as the time
## and everything from character 8 on becomes the location.
## NOTE(review): assumes a 7-character time followed by one separator.
fusedValues = df1.loc[badRow3, 'Date/Time']
df1.loc[badRow3, 'Location'] = fusedValues.str[8:]
df1.loc[badRow3, 'Date/Time'] = '1/27/2017 ' + fusedValues.str[:7]

# Harmonize Race variable
## Values without any space (single words) get a capital first letter and
## lower-case remainder.
singleWord = df1['Race'].str.count(' ') == 0
df1.loc[singleWord, 'Race'] = df1.loc[singleWord, 'Race'].str.capitalize()

## Any value containing the keyword is replaced by one canonical label; the
## masks are evaluated one after the other, in this order.
for keyword, label in (('African', 'African American'),
                       ('Indian', 'American Indian/Alaskan Native')):
    df1.loc[df1['Race'].str.contains(keyword, na=False), 'Race'] = label

# Clean Gender
df1['Gender'] = df1['Gender'].str.capitalize()

# Export Clean Data
df1.to_csv('data_traffic.csv', index=False)

### Calculate Statistics

In [9]:
# Tally traffic stops by Race, as absolute counts and as shares.
# value_counts() leaves out missing values by default, so the shares are
# relative to the rows that have a Race value.
race_values1 = df1['Race']
counts1 = race_values1.value_counts()
percent1 = race_values1.value_counts(normalize=True)

# Side-by-side summary table, displayed as the cell's output
pd.DataFrame(dict(counts=counts1, percent=percent1))

Unnamed: 0,counts,percent
African American,3278,0.732023
Caucasian,741,0.165476
Asian,226,0.050469
Hispanic,217,0.048459
American Indian/Alaskan Native,12,0.00268
Native Hawaiian/Other Pacific Islander,4,0.000893
