## Imports and common functions

In [7]:
import os
import json
import cPickle as pickle

# analysis
import pandas as pd
import numpy as np
import reverse_geocoder as rg

# plotting
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Load in data
Using a mini dataset because my computer cannot handle the full one.

In [8]:
def read_data(directory, file_name, header_names, columns_types):
    """
    Load a tab-delimited text file as a DataFrame, caching it as a pickle.

    If `<directory>/<file_name>.pickle` exists, it is unpickled and returned
    as-is. In that case `header_names` and `columns_types` are NOT applied,
    so a stale cache (or one produced by a different pipeline) comes back
    with whatever columns it was saved with. Otherwise
    `<directory>/<file_name>.txt` is parsed, typecast, pickled next to the
    text file, and returned.

    Inputs:
        directory (string): directory where data is stored, with or without
            a trailing path separator
        file_name (string): file name WITHOUT extension
        header_names (list of strings): names for each column of the txt file
        columns_types (dict of numpy dtype objects): column name -> dtype, used
            to typecast columns. keys must be in header_names

    Output:
        (pandas dataframe): the cached frame, or the freshly parsed and
            typecast frame
    """
    # os.path.join tolerates a missing trailing separator on `directory`
    base_path = os.path.join(directory, file_name)
    pickle_path = base_path + '.pickle'
    txt_path = base_path + '.txt'

    if os.path.isfile(pickle_path):
        print('Reading pickled pandas DF: ' + pickle_path)

        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that this notebook (or another trusted source) produced.
        with open(pickle_path, 'rb') as input_file:
            raw_data_df = pickle.load(input_file)
    else:
        print('Reading raw txt file: ' + txt_path)

        # read in file and format columns
        raw_data_df = pd.read_table(txt_path, names=header_names, engine='c')
        raw_data_df = raw_data_df.astype(columns_types)

        # export file as pickled object so the next call can skip parsing
        with open(pickle_path, 'wb') as output_file:
            pickle.dump(raw_data_df, output_file, pickle.HIGHEST_PROTOCOL)

    return raw_data_df

In [9]:
# read in each data frame
# Builtin `str` is used for the string columns: `np.str` was only an alias for
# it and is no longer available in current NumPy releases.
# NOTE(review): the cached pickle that actually gets loaded has columns such as
# venue_category / latitude / longitude / city, not the four names listed
# here -- read_data returns a cached frame untouched. Confirm which schema
# this notebook is meant to work with.
checkin_df = read_data('./raw_data/',
                       'jennie_full',
                       ['user_id', 'venue_id', 'utc_time', 'timezone_offset'],
                       {'user_id': str, 'venue_id': str, 'utc_time': str, 'timezone_offset': np.int64})


Reading pickled pandas DF: ./raw_data/jennie_full.pickle


## Data Cleaning

In [10]:
# Preview the first rows only instead of rendering the whole frame.
checkin_df.head(10)

Unnamed: 0,index,user_id,venue_id,venue_category,latitude,longitude,city,state,country
0,0,221021,4a85b1b3f964a520eefe1fe3,Coffee Shop,40.748939,-73.992280,Weehawken,New Jersey,US
1,1,66981,4b4606f2f964a520751426e3,Office,30.270753,-97.752936,Austin,Texas,US
2,2,28761,4b4bade2f964a520cfa326e3,College Arts Building,40.436712,-79.990132,Pittsburgh,Pennsylvania,US
3,3,39350,49bbd6c0f964a520f4531fe3,Arts & Crafts Store,40.719726,-74.002472,New York City,New York,US
4,4,163570,4b2277b1f964a5203f4724e3,Conference Room,41.886485,-87.623692,Chicago,Illinois,US
5,5,1702,4bbf31c9f353d13a942b7e10,Hospital,33.752959,-84.382211,Atlanta,Georgia,US
6,6,84752,4b749db3f964a520f9e72de3,Coffee Shop,47.614292,-122.341916,Seattle,Washington,US
7,7,91828,4c029902310fc9b6e2a7c461,Miscellaneous Shop,43.032290,-78.698502,Clarence Center,New York,US
8,8,44228,4a95f126f964a520952520e3,Train Station,41.875497,-87.649484,Chicago,Illinois,US
9,9,32655,4a43c0aef964a520c6a61fe3,Bridge,40.606800,-74.044170,Bensonhurst,New York,US


In [31]:
# Distinct venue categories that appear in the check-in data.
venue_categories = checkin_df['venue_category'].unique()


In [33]:
# Preview the category column rather than printing every row.
checkin_df['venue_category'].head(20)

0                                   Coffee Shop
1                                        Office
2                         College Arts Building
3                           Arts & Crafts Store
4                               Conference Room
5                                      Hospital
6                                   Coffee Shop
7                            Miscellaneous Shop
8                                 Train Station
9                                        Bridge
10                                   Food Truck
11                                Spa / Massage
12                               Student Center
13                                Design Studio
14                              Automotive Shop
15                             Department Store
16                     College Science Building
17                             Asian Restaurant
18                            Convenience Store
19                                 Burger Joint
20                                      

In [28]:
# Number of distinct check-in rows (by the 'index' column) per venue category.
# The column is selected with brackets because attribute access (`.index`)
# clashes with the pandas `index` attribute and is easy to misread.
categories_df = (
    checkin_df.groupby('venue_category')['index']
    .nunique()
    .reset_index(name='num_checkins')
)

In [35]:
# Most frequently checked-in categories first.
categories_df.sort_values(by='num_checkins', ascending=False)

Unnamed: 0,venue_category,num_checkins
36,Coffee Shop,48
107,Home (private),42
140,Office,41
100,Grocery Store,27
164,Sandwich Place,25
130,Mexican Restaurant,24
24,Burger Joint,20
0,Airport,19
23,Building,19
125,Mall,18


In [28]:
# One row per user, holding the array of distinct cities that user checked in to.
checkin_byuser_city = (
    checkin_df.groupby('user_id')['city']
    .unique()
    .rename('unique_location')
    .reset_index()
)

In [29]:
# Preview the per-user city lists instead of rendering every user.
checkin_byuser_city.head(10)

Unnamed: 0,user_id,unique_location
0,1,"[Franklin Square, Malverne, North New Hyde Par..."
1,10,"[New York City, Long Island City, Brooklyn, Ma..."
2,100,"[SeaTac, College Park, El Segundo, Miami Sprin..."
3,100001,"[Austin, Wells Branch, Onion Creek, Jollyville..."
4,100002,"[Washington, D.C., Overlea, Manhattan, Baltimo..."
5,100010,"[Safety Harbor, Saint George, Millbrae, San Fr..."
6,100011,"[El Segundo, Pasadena, San Diego, La Jolla, Lo..."
7,10002,"[Washington, D.C., Woodmore, Long Island City,..."
8,100022,"[Miami, Tampa, Cary, Dumbarton, Alexandria, Wa..."
9,100030,"[Kenner, Metairie, Elmwood, Westwego, Metairie..."


In [40]:
# Keep only users who checked in to all three of these cities.
# Exact membership is used instead of substring matching on the stringified
# array: substring matching also accepts names such as "Manhattan Beach" or
# "West Chicago", and numpy abbreviates long arrays with "..." when they are
# converted to text, which can hide cities from the match.
# NOTE(review): New York is matched via the city value 'Manhattan' here, while
# the later NYC filter uses 'New York City' -- confirm which value is intended.
required_cities = {'San Francisco', 'Chicago', 'Manhattan'}

visited_all_required = checkin_byuser_city['unique_location'].apply(
    lambda cities: required_cities.issubset(cities)
)
three_cities_df = checkin_byuser_city[visited_all_required]


In [55]:
# Distinct ids of the users retained in three_cities_df.
three_cities_users = pd.unique(three_cities_df['user_id'])
three_cities_users

array(['100994', '10134', '101736', '10280', '103140', '103937', '104219',
       '104688', '105285', '105395', '10592', '106773', '106898', '107444',
       '1083', '108580', '108961', '110170', '11134', '112400', '113450',
       '113726', '113894', '115787', '115944', '116183', '116332',
       '117013', '117259', '117261', '117475', '11797', '11823', '118624',
       '120055', '120106', '120302', '120413', '120421', '121443', '12238',
       '1228', '12295', '12436', '12442', '1246', '125122', '125155',
       '126839', '128442', '1295', '130526', '131095', '132355', '132602',
       '133717', '13516', '135299', '135780', '136011', '136993', '137044',
       '137232', '13807', '13818', '138815', '139089', '141277', '14256',
       '142602', '146736', '146795', '14803', '14829', '151', '151221',
       '151904', '152055', '152761', '15293', '155256', '155280', '15606',
       '156307', '15642', '156530', '15731', '158367', '158463', '15945',
       '159884', '160140', '160945', '161

In [57]:
# All check-ins (in any city) made by the users selected above.
is_three_city_user = checkin_df['user_id'].isin(three_cities_users)
three_cities_all_df = checkin_df.loc[is_three_city_user]

In [58]:
# Preview the selected users' check-ins instead of rendering the whole frame.
three_cities_all_df.head(10)

Unnamed: 0,index,user_id,venue_id,venue_category,latitude,longitude,city,state,country
4,4,163570,4b2277b1f964a5203f4724e3,Conference Room,41.886485,-87.623692,Chicago,Illinois,US
208,208,1295,4ad78c9df964a520130c21e3,Student Center,43.075456,-87.881371,Shorewood,Wisconsin,US
282,282,2197,4f7b3e17e4b0d7b576d65f80,Clothing Store,34.065683,-118.411738,Century City,California,US
634,634,24083,4b0837a7f964a520810623e3,BBQ Joint,35.302073,-80.747796,Harrisburg,North Carolina,US
822,822,12436,4b61a594f964a520871b2ae3,College Academic Building,35.786784,-78.665687,West Raleigh,North Carolina,US
861,861,57307,482768e5f964a520a94f1fe3,Burger Joint,37.639124,-122.419356,San Bruno,California,US
924,924,4586,4c250727c11dc9b6857b2624,Pizza Place,47.658378,-122.312272,Seattle,Washington,US
949,949,65245,4a55c1f7f964a52046b41fe3,Airport,35.219724,-80.943936,Belmont,North Carolina,US
1490,1490,24083,4f71f780c2ee6565c2da35eb,Tech Startup,35.196221,-80.760949,Matthews,North Carolina,US
1512,1512,10592,4c81086adc018cfaa883c16c,Office,40.702537,-73.990581,New York City,New York,US


In [66]:
# Check-ins by the selected users whose city is exactly 'Chicago'.
chi_df = three_cities_all_df.loc[three_cities_all_df['city'].eq('Chicago')]

In [67]:
# Check-ins by the selected users whose city is exactly 'San Francisco'.
sf_df = three_cities_all_df.loc[three_cities_all_df['city'].eq('San Francisco')]

In [68]:
# Check-ins by the selected users whose city is exactly 'New York City'.
# NOTE(review): the users were selected by a 'Manhattan' check-in, not
# 'New York City' -- confirm that filtering on a different city value here
# is intentional.
nyc_df = three_cities_all_df.loc[three_cities_all_df['city'].eq('New York City')]

In [70]:
# Preview the San Francisco check-ins instead of rendering the whole frame.
sf_df.head(10)

Unnamed: 0,index,user_id,venue_id,venue_category,latitude,longitude,city,state,country
2760,2760,93708,4f79aedce4b03c36a2cac393,Convention Center,37.783943,-122.402816,San Francisco,California,US
5340,5340,63531,44f9c96ef964a5206e381fe3,Subway,37.762571,-122.435203,San Francisco,California,US
5644,5644,63531,4b7d7193f964a520b1be2fe3,Spa / Massage,37.773674,-122.449430,San Francisco,California,US
6802,6802,63531,4a135140f964a520d3771fe3,Coffee Shop,37.797692,-122.430514,San Francisco,California,US
7263,7263,221150,4a00edd0f964a520c5701fe3,Hotel,37.785326,-122.405527,San Francisco,California,US
8019,8019,146795,49cfa966f964a520b45a1fe3,American Restaurant,37.786138,-122.410471,San Francisco,California,US
8783,8783,57307,4cdb5d134006a143a23cd8b2,Music Venue,37.755711,-122.428623,San Francisco,California,US
10682,10682,93708,43a9dcaef964a5207f2c1fe3,Bar,37.786948,-122.400200,San Francisco,California,US
11307,11307,221150,4a00edd0f964a520c5701fe3,Hotel,37.785326,-122.405527,San Francisco,California,US
11830,11830,13807,4b4d2622f964a520b0cc26e3,Hotel,37.805542,-122.416765,San Francisco,California,US


In [64]:
import fiona
import shapely.geometry

In [71]:
# Open the California neighborhood shapefile. The collection is bound to a
# module-level name because getNeighborhood reads `fiona_collection` directly.
ca_neighborhoods_path = './raw_data/ZillowNeighborhoods-CA.shx'
fiona_collection = fiona.open(ca_neighborhoods_path)
# Single-entry cache of the parsed polygons for the most recently used
# collection. A reference to the collection itself is stored so the identity
# check in _neighborhood_shapes cannot be fooled by a recycled object id.
_neighborhood_cache = {'collection': None, 'shapes': []}


def _neighborhood_shapes(collection):
    """Return [(polygon, name), ...] for `collection`, parsing it at most once."""
    if _neighborhood_cache['collection'] is not collection:
        shapes = []
        for shapefile_record in collection:
            geometry = shapefile_record['geometry']
            if geometry is None:
                # a record without geometry cannot contain any point
                continue
            try:
                name = shapefile_record['properties']['Name']
            except (KeyError, TypeError):
                # NOTE(review): if the attribute is spelled differently in the
                # shapefile (e.g. 'NAME'), every lookup yields None -- confirm
                # against the file's schema.
                name = None
            # shapely.geometry.shape builds the geometry from the GeoJSON-like
            # mapping; it replaces asShape, which newer Shapely no longer ships
            shapes.append((shapely.geometry.shape(geometry), name))
        _neighborhood_cache['shapes'] = shapes
        _neighborhood_cache['collection'] = collection
    return _neighborhood_cache['shapes']


def getNeighborhood(x):
    """
    Look up the neighborhood polygon that contains a check-in's coordinates.

    Reads the module-level `fiona_collection`, so the collection for the
    relevant state must be assigned before this is applied. The polygons are
    parsed once per collection and reused for every row, instead of
    re-reading the shapefile for each row.

    Inputs:
        x (pandas Series): one check-in row with 'latitude' and 'longitude'
            entries (looked up by label, so column order does not matter)

    Output:
        (string or None): 'Name' property of the first polygon containing the
            point; None if no polygon contains it or the record has no 'Name'
    """
    point = shapely.geometry.Point(x['longitude'], x['latitude'])  # longitude, latitude

    for polygon, name in _neighborhood_shapes(fiona_collection):
        if point.within(polygon):
            return name
    return None
        
# Tag every San Francisco check-in with its neighborhood. `assign` builds a new
# frame rather than writing a column into a filtered slice, which avoids
# pandas' SettingWithCopyWarning.
sf_df = sf_df.assign(neighborhood=sf_df.apply(getNeighborhood, axis=1))

KeyboardInterrupt: 

In [None]:
# Tag every New York City check-in with its neighborhood.
# The `with` block still binds the module-level `fiona_collection` that
# getNeighborhood reads, and closes the shapefile once the lookup is done.
# `assign` builds a new frame rather than writing into a filtered slice.
with fiona.open('./raw_data/ZillowNeighborhoods-NY.shx') as fiona_collection:
    nyc_df = nyc_df.assign(neighborhood=nyc_df.apply(getNeighborhood, axis=1))

In [None]:
# Tag every Chicago check-in with its neighborhood.
# The `with` block still binds the module-level `fiona_collection` that
# getNeighborhood reads, and closes the shapefile once the lookup is done.
# `assign` builds a new frame rather than writing into a filtered slice.
with fiona.open('./raw_data/ZillowNeighborhoods-IL.shx') as fiona_collection:
    chi_df = chi_df.assign(neighborhood=chi_df.apply(getNeighborhood, axis=1))

In [72]:
# Preview the San Francisco check-ins instead of rendering the whole frame.
sf_df.head(10)

Unnamed: 0,index,user_id,venue_id,venue_category,latitude,longitude,city,state,country
2760,2760,93708,4f79aedce4b03c36a2cac393,Convention Center,37.783943,-122.402816,San Francisco,California,US
5340,5340,63531,44f9c96ef964a5206e381fe3,Subway,37.762571,-122.435203,San Francisco,California,US
5644,5644,63531,4b7d7193f964a520b1be2fe3,Spa / Massage,37.773674,-122.449430,San Francisco,California,US
6802,6802,63531,4a135140f964a520d3771fe3,Coffee Shop,37.797692,-122.430514,San Francisco,California,US
7263,7263,221150,4a00edd0f964a520c5701fe3,Hotel,37.785326,-122.405527,San Francisco,California,US
8019,8019,146795,49cfa966f964a520b45a1fe3,American Restaurant,37.786138,-122.410471,San Francisco,California,US
8783,8783,57307,4cdb5d134006a143a23cd8b2,Music Venue,37.755711,-122.428623,San Francisco,California,US
10682,10682,93708,43a9dcaef964a5207f2c1fe3,Bar,37.786948,-122.400200,San Francisco,California,US
11307,11307,221150,4a00edd0f964a520c5701fe3,Hotel,37.785326,-122.405527,San Francisco,California,US
11830,11830,13807,4b4d2622f964a520b0cc26e3,Hotel,37.805542,-122.416765,San Francisco,California,US
