In [16]:
import os
import json
import math
import time
import cPickle as pickle
import multiprocessing as mp

import reverse_geocoder as rg
import pandas as pd
import numpy as np
import fiona
import shapely.geometry

## Read in data

In [2]:
def read_data(directory, file_name, header_names, columns_types):
    """
    Load one dataset as a pandas DataFrame, preferring a pickled copy if present.

    If ``<file_name>.pickle`` exists in ``directory`` it is unpickled and returned
    unchanged. Otherwise ``<file_name>.txt`` is parsed with ``pd.read_table`` using
    ``header_names`` as the column names, and every column listed in
    ``columns_types`` is cast with ``astype``. This function only reads; it never
    writes the pickle itself.

    Inputs:
        directory (string): directory where data is stored (trailing separator optional)
        file_name (string): file name WITHOUT extension
        header_names (list of strings): list of strings representing names for each column
        columns_types (dict): maps column name -> type passed to ``astype``. keys must be in header_names

    Output:
        (pandas dataframe): dataframe with formatted columns
    """
    # os.path.join tolerates a directory given with or without a trailing separator
    base_path = os.path.join(directory, file_name)
    pickle_path = base_path + '.pickle'
    txt_path = base_path + '.txt'

    if os.path.isfile(pickle_path):
        print('Reading pickled pandas DF: ' + pickle_path)

        # NOTE: unpickling can execute arbitrary code; only load pickles you created yourself
        with open(pickle_path, 'rb') as input_file:
            raw_data_df = pickle.load(input_file)
    else:
        print('Reading raw txt file: ' + txt_path)

        # read in file and format columns
        raw_data_df = pd.read_table(txt_path, names=header_names, engine='c')

        for column_name, column_type in columns_types.items():
            raw_data_df[column_name] = raw_data_df[column_name].astype(column_type)

    return raw_data_df

In [3]:
# directory holding the raw TIST2015 text files
RAW_DATA_DIR = './raw_data/'

# read in each data frame
# (builtin str/float are used instead of np.str/np.float: those were plain aliases
#  of the builtins and have been removed from recent NumPy releases)
checkin_df = read_data(RAW_DATA_DIR,
                       'dataset_TIST2015_Checkins',
                       ['user_id', 'venue_id', 'utc_time', 'timezone_offset'],
                       {'user_id': str, 'venue_id': str, 'utc_time': str, 'timezone_offset': np.int64})

cities_df = read_data(RAW_DATA_DIR,
                      'dataset_TIST2015_Cities',
                      ['city_name', 'latitude', 'longitude', 'country_code', 'country_name', 'city_type'],
                      {'city_name': str, 'latitude': float, 'longitude': float, 'country_code': str, 'country_name': str, 'city_type': str})

venues_df = read_data(RAW_DATA_DIR,
                      'dataset_TIST2015_POIs',
                      ['venue_id', 'latitude', 'longitude', 'venue_category', 'country_code'],
                      {'venue_id': str, 'latitude': float, 'longitude': float, 'venue_category': str, 'country_code': str})

Reading raw txt file: ./raw_data/dataset_TIST2015_Checkins.txt
Reading raw txt file: ./raw_data/dataset_TIST2015_Cities.txt
Reading raw txt file: ./raw_data/dataset_TIST2015_POIs.txt


## Data cleaning

### Select venues in US only

In [4]:
# reverse-geocode every venue coordinate in one batch call.
# zip() is materialized into a list because it is a one-shot iterator on Python 3,
# while rg.search expects a tuple or a list of (lat, long) tuples.
venue_coordinates = list(zip(venues_df['latitude'], venues_df['longitude']))
current_reverse_geocode = rg.search(venue_coordinates)

# pull the city name, state (admin1) and country code out of each match
cities = [match['name'] for match in current_reverse_geocode]
states = [match['admin1'] for match in current_reverse_geocode]
countries = [match['cc'] for match in current_reverse_geocode]

# assign cities, states and countries to the venues
venues_df['city'] = cities
venues_df['state'] = states
venues_df['country'] = countries

Loading formatted geocoded file...


In [5]:
# keep only the venues whose reverse-geocoded country is the US
is_us_venue = venues_df['country'] == 'US'
venues_df = venues_df[is_us_venue]

# keep only the cities whose country code is US
is_us_city = cities_df['country_code'] == 'US'
cities_df = cities_df[is_us_city]

# keep only the checkins made at one of the remaining (US) venues
us_venue_ids = venues_df['venue_id']
made_at_us_venue = checkin_df['venue_id'].isin(us_venue_ids)
checkin_df = checkin_df[made_at_us_venue]

In [6]:
# attach venue details to every checkin (left join keeps all checkins);
# reset_index() also stores the merged row number in an extra 'index' column
venue_detail_columns = ['venue_id', 'venue_category', 'latitude', 'longitude', 'city', 'state', 'country']
checkin_by_user = pd.merge(
    checkin_df[['user_id', 'venue_id']],
    venues_df[venue_detail_columns],
    on='venue_id',
    how='left'
).reset_index()

### Keep only check-ins in Chicago, San Francisco, and Manhattan, since those are the cities we will focus on for neighborhoods

In [7]:
# restrict the checkins to the three cities used for the neighborhood lookup
focus_cities = ['Chicago', 'San Francisco', 'Manhattan']
in_focus_city = checkin_by_user['city'].isin(focus_cities)
checkin_by_user = checkin_by_user[in_focus_city]

In [8]:
# preview the first rows only instead of rendering the whole frame
checkin_by_user.head(10)

Unnamed: 0,index,user_id,venue_id,venue_category,latitude,longitude,city,state,country
4,4,163570,4b2277b1f964a5203f4724e3,Conference Room,41.886485,-87.623692,Chicago,Illinois,US
8,8,44228,4a95f126f964a520952520e3,Train Station,41.875497,-87.649484,Chicago,Illinois,US
25,25,144940,4ec6e5e55c5ce271bce067fc,City Hall,37.779663,-122.417070,San Francisco,California,US
40,40,198359,49edce76f964a520f9671fe3,Office,40.755881,-73.985778,Manhattan,New York,US
47,47,139980,45840a3af964a520903f1fe3,Zoo,41.921740,-87.633758,Chicago,Illinois,US
68,68,201081,4de0117c45dd3eae8764d6ac,Tech Startup,37.781213,-122.402973,San Francisco,California,US
69,69,71374,49c1ee05f964a520e0551fe3,Office,37.787635,-122.399866,San Francisco,California,US
84,84,132121,4d0cc47f903d37041864bf55,Bus Line,40.779422,-73.955341,Manhattan,New York,US
85,85,139985,4297b480f964a52062241fe3,Science Museum,40.781184,-73.973203,Manhattan,New York,US
88,88,116934,4f7b2078e4b0d859b691c07b,Meeting Room,37.774383,-122.399203,San Francisco,California,US


## Get neighborhoods for each checkin

### Create shape arrays from neighborhood shape data

In [9]:
# open the three Zillow neighborhood shapefiles (CA, il, ny), in that order
shapefile_paths = (
    './shape_files/ZillowNeighborhoods-CA.shx',
    './shape_files/ZillowNeighborhoods-il.shx',
    './shape_files/ZillowNeighborhoods-ny.shx',
)
ca_collection, il_collection, ny_collection = map(fiona.open, shapefile_paths)

In [10]:
def _load_neighborhood_shapes(collection):
    """
    Convert every record of an opened shapefile collection into a (name, geometry) pair.

    Inputs:
        collection: iterable of shapefile records, each exposing
            record['properties']['Name'] and record['geometry']

    Output:
        (list of tuples): (neighborhood name, shapely geometry), in record order
    """
    # shapely.geometry.shape builds a concrete geometry object; it replaces the
    # deprecated asShape adapter, which is no longer available in Shapely 2.0
    return [(record['properties']['Name'], shapely.geometry.shape(record['geometry']))
            for record in collection]


ca_shapes = _load_neighborhood_shapes(ca_collection)
il_shapes = _load_neighborhood_shapes(il_collection)
ny_shapes = _load_neighborhood_shapes(ny_collection)

### Find neighborhood in parallel

In [11]:
def get_neighborhood(row):
    """
    Look up the neighborhood whose shape contains the location of one checkin.

    Inputs:
        row (tuple of 3 elements): (lat, long, city) where lat, long are float and city is string

    Output:
        (string): name of the first neighborhood shape containing the point, or None when
                  the city is not Chicago / San Francisco / Manhattan or no shape matches
    """
    latitude, longitude, city = row[0], row[1], row[2]

    # shapely points are built as (x, y), i.e. (longitude, latitude)
    location = shapely.geometry.Point(longitude, latitude)

    # choose the list of (name, geometry) pairs that belongs to this city
    if city == 'Chicago':
        candidates = il_shapes
    elif city == 'San Francisco':
        candidates = ca_shapes
    elif city == 'Manhattan':
        candidates = ny_shapes
    else:
        return None

    # first shape that contains the point wins; None if nothing matches
    return next((entry[0] for entry in candidates if location.within(entry[1])), None)

In [26]:
# format data for input: one (lat, long, city) tuple per checkin.
# list() keeps len(data) working where zip returns an iterator (Python 3)
data = list(zip(checkin_by_user['latitude'], checkin_by_user['longitude'], checkin_by_user['city']))

# run code in parallel, one worker per core
t0 = time.time()
N_CORES = mp.cpu_count()
pool = mp.Pool(N_CORES)
try:
    r = pool.map(get_neighborhood, data)
    t1 = time.time()
finally:
    # always shut the workers down so no processes are left behind,
    # even when a lookup raises
    pool.close()
    pool.join()

# time code
print(str(t1 - t0) + ' seconds for ' + str(len(data)) + ' rows')

26020.8534 seconds for 305524 rows


In [30]:
# r holds one neighborhood (or None) per checkin, in the same order as the rows
assert len(r) == len(checkin_by_user), 'expected one neighborhood result per checkin row'
checkin_by_user['neighborhood'] = r

# preview the first rows only instead of rendering the whole frame
checkin_by_user.head(10)

Unnamed: 0,index,user_id,venue_id,venue_category,latitude,longitude,city,state,country,neighborhood
4,4,163570,4b2277b1f964a5203f4724e3,Conference Room,41.886485,-87.623692,Chicago,Illinois,US,The Loop
8,8,44228,4a95f126f964a520952520e3,Train Station,41.875497,-87.649484,Chicago,Illinois,US,University Village - Little Italy
25,25,144940,4ec6e5e55c5ce271bce067fc,City Hall,37.779663,-122.417070,San Francisco,California,US,Van Ness - Civic Center
40,40,198359,49edce76f964a520f9671fe3,Office,40.755881,-73.985778,Manhattan,New York,US,Midtown
47,47,139980,45840a3af964a520903f1fe3,Zoo,41.921740,-87.633758,Chicago,Illinois,US,Lincoln Park
68,68,201081,4de0117c45dd3eae8764d6ac,Tech Startup,37.781213,-122.402973,San Francisco,California,US,Yerba Buena
69,69,71374,49c1ee05f964a520e0551fe3,Office,37.787635,-122.399866,San Francisco,California,US,Yerba Buena
84,84,132121,4d0cc47f903d37041864bf55,Bus Line,40.779422,-73.955341,Manhattan,New York,US,Upper East Side
85,85,139985,4297b480f964a52062241fe3,Science Museum,40.781184,-73.973203,Manhattan,New York,US,Upper West Side
88,88,116934,4f7b2078e4b0d859b691c07b,Meeting Room,37.774383,-122.399203,San Francisco,California,US,South of Market


In [32]:
output_path = './processed_data/checking-with-neighborhoods.csv'

# to_csv does not create missing folders, so make sure the target directory exists
output_dir = os.path.dirname(output_path)
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

# write the checkins (with neighborhoods) without the DataFrame index
checkin_by_user.to_csv(output_path, index=False)