# In [None]:
import ast
import json
import math
import os
import random
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

#We divided the data into separate files according to the year
#data_2016.csv refers to the data file for year 2016
#You can run it on the entire data as well but it takes a long time.

# Open the yearly crime CSV as a chunked reader so it is processed
# process_chunksize rows at a time instead of being loaded all at once.
process_chunksize = 1000
mainDF = pd.read_csv(
    'data_2010.csv',
    chunksize=process_chunksize,
)
# Because chunksize is set, this prints the reader object, not any rows.
print(mainDF)

# Download tract-level figures from the Census ACS 5-year (2017) API for
# state 06 / county 037 and shape them into tractDF: one row per tract,
# indexed by 'Tract', with numeric 'Pop', 'Income' and 'Gini' columns.
#
# NOTE(review): an API key is committed in source. The CENSUS_API_KEY
# environment variable overrides it when set; consider rotating the key and
# removing the fallback.
census_api_key = os.environ.get('CENSUS_API_KEY', 'e335bd7f54d58725613d2bc9b7992f7d66313f86')
census_api_url = ('https://api.census.gov/data/2017/acs/acs5?key=' + census_api_key
                  + '&get=NAME,B00001_001E,B19019_001E,B19083_001E&for=TRACT:*&in=state:06+COUNTY:037')
# The timeout keeps a stalled connection from hanging the script forever.
responseCensus = requests.get(census_api_url, headers={'Content-Type': 'application/json'}, timeout=60)
# Fail loudly on an HTTP error instead of trying to parse an error page.
responseCensus.raise_for_status()
textCensus = json.loads(responseCensus.content)
tractDF = pd.DataFrame(textCensus)
# The shape reported here still includes the header row of the payload.
print('Shape of census tract file: ' + str(tractDF.shape))

# The first row of the payload carries the column names: promote it to the
# header, then drop it from the data. The chain is non-in-place so it never
# mutates a slice of the original frame.
tractDF.columns = list(tractDF.iloc[0])
tractDF = (
    tractDF.iloc[1:]
    .rename(columns={'B00001_001E': 'Pop', 'B19019_001E': 'Income', 'B19083_001E': 'Gini', 'tract': 'Tract'})
    .set_index('Tract')
    .drop(['NAME', 'state', 'county'], axis=1)
)
# Values arrive as strings; anything that is not numeric becomes NaN.
tractDF = tractDF[['Pop', 'Income', 'Gini']].apply(pd.to_numeric, errors='coerce')
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would emit a stray "None" line).
tractDF.info()
print(tractDF.head(8))

def getTract(lonlat):
    """Look up the census tract for a coordinate pair via the FCC Area API.

    Args:
        lonlat: Indexable pair of coordinates. Despite the parameter name,
            element 0 is sent as the ``lat`` query parameter and element 1
            as ``lon``.

    Returns:
        The slice ``[5:11]`` of the first result's ``block_fips`` string
        (presumably the tract portion of the block FIPS code -- confirm
        against the FCC API docs), or the integer sentinel ``-999`` when the
        request fails or the response contains no usable result.
    """
    tract = -999
    api_url = ('https://geo.fcc.gov/api/census/area?lat=' + str(lonlat[0])
               + '&lon=' + str(lonlat[1]) + '&format=json')
    headers = {'Content-Type': 'application/json'}
    try:
        # The timeout keeps one stalled lookup from hanging the whole run.
        response = requests.get(api_url, headers=headers, timeout=30)
        tract = json.loads(response.content.decode('utf-8'))['results'][0]['block_fips'][5:11]
    except requests.RequestException as err:
        # Network-level failure: keep the sentinel so the caller can go on.
        print('FCC geo API request failed: ' + str(err))
    except (ValueError, KeyError, IndexError, TypeError):
        # Malformed JSON, missing keys or an empty result list. Only these
        # are caught (not a bare except) so that Ctrl-C still interrupts
        # the long-running per-row lookup loop.
        print('No Tract found from FCC geo API')
    return tract


# Enrich the crime records chunk by chunk: clean each chunk, attach a census
# tract to every location, join the tract-level census figures, and finally
# write the combined table to CrimeR_Tract.csv.
df_groups = []
for chunk_number, mainDF_chunk in enumerate(mainDF):
    nLinesMainCSVd = chunk_number * process_chunksize
    nLinesMainCSVu = nLinesMainCSVd + process_chunksize
    print('New group from main DF, lines ' + str(nLinesMainCSVd) + ' to ' + str(nLinesMainCSVu))

    # Cleaning main
    mainDF_chunk.rename(columns=lambda x: x.replace(' ', ''), inplace=True)
    mainDF_chunk.dropna(subset=['Location'], inplace=True)
    # Parse the textual location into a Python value. ast.literal_eval only
    # accepts literals, so unlike eval() it cannot execute code that happens
    # to be embedded in a CSV cell.
    mainDF_chunk['Location'] = mainDF_chunk['Location'].map(lambda x: ast.literal_eval(str(x)))
    # NOTE(review): the results of the next two conversions are discarded, so
    # the columns keep the dtypes inferred by read_csv. As written they only
    # act as checks (KeyError on a missing column, ValueError on a
    # non-numeric value). They are deliberately left unassigned here because
    # assigning back would change the exported data -- confirm the intended
    # dtypes before doing so.
    mainDF_chunk[['DateReported', 'DateOccurred', 'TimeOccurred']].apply(pd.to_datetime, errors='coerce')
    mainDF_chunk[['DRNumber', 'AreaID', 'ReportingDistrict', 'CrimeCode', 'VictimAge', 'PremiseCode', 'WeaponUsedCode',
                  'CrimeCode1']].apply(pd.to_numeric)
    mainDF_chunk.drop(['CrimeCode2', 'CrimeCode3', 'CrimeCode4'], inplace=True, axis=1)
    print('Dimension of main DF group:' + str(mainDF_chunk.shape))

    # Add Tract from FCC geo (one API lookup per row)
    mainDF_chunk['Tract'] = mainDF_chunk.Location.map(getTract)

    # Left join keeps every crime record even when its tract has no census row.
    df_group = pd.merge(mainDF_chunk, tractDF, how='left', on='Tract', sort=True)
    df_groups.append(df_group)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the whole accumulated frame on every iteration. Columns are
# sorted explicitly to keep the alphabetical column order in the export.
if df_groups:
    df = pd.concat(df_groups, sort=True).sort_index(axis=1)
else:
    df = pd.DataFrame()

df.to_csv("CrimeR_Tract.csv", sep=',', encoding='utf-8', header = True)