## Imports and common functions

In [3]:
import os
import json
import cPickle as pickle

# analysis
import pandas as pd
import numpy as np
import reverse_geocoder as rg

# plotting
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Load in data
Using a mini dataset because my computer cannot handle the full one.

In [1]:
def read_data(directory, file_name, header_names, columns_types):
    """
    Load a data table, preferring a cached pickle over the raw text file.

    If <directory>/<file_name>.pickle exists, it is unpickled and returned as-is
    (header_names and columns_types are not used in that case). Otherwise
    <directory>/<file_name>.txt is parsed with pandas.read_table, its columns are
    cast with columns_types, and the resulting dataframe is pickled next to the
    text file so the next call can take the fast path.

    Inputs:
        directory (string): directory where data is stored (trailing separator optional)
        file_name (string): file name WITHOUT extension
        header_names (list of strings): list of strings representing names for each column
        columns_types (dict of numpy dtype objects): dict of column types. used to typecast columns. keys must be in header_names

    Output:
        (pandas dataframe): dataframe with formatted columns
    """
    # os.path.join copes with directories given with or without a trailing separator
    pickle_path = os.path.join(directory, file_name + '.pickle')
    txt_path = os.path.join(directory, file_name + '.txt')

    # A parenthesised single-argument print is valid as a Python 2 statement
    # and as a Python 3 function call, and prints the same text in both.
    if os.path.isfile(pickle_path):
        print('Reading pickled pandas DF: ' + pickle_path)

        # SECURITY: unpickling can execute arbitrary code; only load pickle
        # files that this notebook wrote itself.
        with open(pickle_path, 'rb') as input_file:
            raw_data_df = pickle.load(input_file)
    else:
        print('Reading raw txt file: ' + txt_path)

        # read in file and format columns
        raw_data_df = pd.read_table(txt_path, names=header_names, engine='c')
        raw_data_df = raw_data_df.astype(columns_types)

        # cache the parsed frame so later calls skip the parsing step
        with open(pickle_path, 'wb') as output_file:
            pickle.dump(raw_data_df, output_file, pickle.HIGHEST_PROTOCOL)

    return raw_data_df

In [88]:
# read in each data frame
# Load the check-in table (from the cached pickle when one exists).
# The builtin `str` is used for the text columns: `np.str` was only an alias
# for it and is no longer available in current NumPy releases.
# NOTE(review): the frame displayed in the next cell has columns such as
# venue_category / latitude / city that do not match these header names, so the
# cached pickle was presumably produced by a different step -- confirm.
checkin_df = read_data('./raw_data/',
                       'jennie_full',
                       ['user_id', 'venue_id', 'utc_time', 'timezone_offset'],
                       {'user_id': str, 'venue_id': str, 'utc_time': str, 'timezone_offset': np.int64})


Reading pickled pandas DF: ./raw_data/jennie_full.pickle


## Data Cleaning

In [89]:
# Preview the first rows only instead of rendering the whole frame
checkin_df.head(10)

Unnamed: 0,index,user_id,venue_id,venue_category,latitude,longitude,city,state,country
0,0,221021,4a85b1b3f964a520eefe1fe3,Coffee Shop,40.748939,-73.992280,Weehawken,New Jersey,US
1,1,66981,4b4606f2f964a520751426e3,Office,30.270753,-97.752936,Austin,Texas,US
2,2,28761,4b4bade2f964a520cfa326e3,College Arts Building,40.436712,-79.990132,Pittsburgh,Pennsylvania,US
3,3,39350,49bbd6c0f964a520f4531fe3,Arts & Crafts Store,40.719726,-74.002472,New York City,New York,US
4,4,163570,4b2277b1f964a5203f4724e3,Conference Room,41.886485,-87.623692,Chicago,Illinois,US
5,5,1702,4bbf31c9f353d13a942b7e10,Hospital,33.752959,-84.382211,Atlanta,Georgia,US
6,6,84752,4b749db3f964a520f9e72de3,Coffee Shop,47.614292,-122.341916,Seattle,Washington,US
7,7,91828,4c029902310fc9b6e2a7c461,Miscellaneous Shop,43.032290,-78.698502,Clarence Center,New York,US
8,8,44228,4a95f126f964a520952520e3,Train Station,41.875497,-87.649484,Chicago,Illinois,US
9,9,32655,4a43c0aef964a520c6a61fe3,Bridge,40.606800,-74.044170,Bensonhurst,New York,US


In [31]:
# Distinct venue categories that appear in the check-in data
venue_categories = checkin_df['venue_category'].unique() 


In [33]:
# Preview the category column; head() keeps the rendered output short
checkin_df['venue_category'].head(20)

0                                   Coffee Shop
1                                        Office
2                         College Arts Building
3                           Arts & Crafts Store
4                               Conference Room
5                                      Hospital
6                                   Coffee Shop
7                            Miscellaneous Shop
8                                 Train Station
9                                        Bridge
10                                   Food Truck
11                                Spa / Massage
12                               Student Center
13                                Design Studio
14                              Automotive Shop
15                             Department Store
16                     College Science Building
17                             Asian Restaurant
18                            Convenience Store
19                                 Burger Joint
20                                      

In [28]:
# Number of distinct 'index' values per venue category, stored as num_checkins.
# The column is selected with ['index'] rather than attribute access, because
# `.index` is easily mistaken for (and can collide with) the row-index attribute.
categories_df = (
    checkin_df
    .groupby('venue_category')['index']
    .nunique()
    .reset_index(name='num_checkins')
)

In [35]:
# Ten most frequent venue categories, largest count first
categories_df.sort_values('num_checkins', ascending=False).head(10)

Unnamed: 0,venue_category,num_checkins
36,Coffee Shop,48
107,Home (private),42
140,Office,41
100,Grocery Store,27
164,Sandwich Place,25
130,Mexican Restaurant,24
24,Burger Joint,20
0,Airport,19
23,Building,19
125,Mall,18


In [115]:
import fiona
import shapely.geometry

In [157]:
# Per-city subsets of the check-ins (the NYC subset is keyed on city == 'Manhattan').
# .copy() makes each subset an independent frame, so adding columns to it later
# neither touches checkin_df nor triggers pandas' SettingWithCopyWarning.
tiny_checkin_sf_df = checkin_df[checkin_df['city'] == 'San Francisco'].copy()
tiny_checkin_chi_df = checkin_df[checkin_df['city'] == 'Chicago'].copy()
tiny_checkin_nyc_df = checkin_df[checkin_df['city'] == 'Manhattan'].copy()

In [None]:
def getNeighborhood(x):
    """
    Return the name of the neighborhood polygon that contains a check-in.

    Scans the records of the module-level `fiona_collection` (whichever
    shapefile is currently bound to that name) and returns the 'Name' property
    of the first record whose geometry contains the check-in's point.

    Inputs:
        x (pandas Series): one row of a check-in dataframe; must provide
            'latitude' and 'longitude' entries

    Output:
        (string or None): neighborhood name, or None when no polygon contains
            the point or the matching record has no usable 'Name' property
    """
    # Columns are read by label: positional integer lookups on a label-indexed
    # row are fragile and break as soon as the column order changes.
    # shapely points take (x, y), i.e. (longitude, latitude).
    point = shapely.geometry.Point(x['longitude'], x['latitude'])

    for shapefile_record in fiona_collection:
        # shape() builds the geometry; it replaces asShape(), which newer
        # Shapely releases no longer provide
        polygon = shapely.geometry.shape(shapefile_record['geometry'])

        if point.within(polygon):
            try:
                return shapefile_record['properties']['Name']
            except (KeyError, TypeError):
                # matching record without a usable 'Name' property
                return None

    # no polygon in the collection contains the point
    return None


# Open the California shapefile only for the duration of the lookup so the
# file handle is closed afterwards; getNeighborhood reads it through the
# module-level name `fiona_collection`.
with fiona.open('./raw_data/ZillowNeighborhoods-CA.shx') as fiona_collection:
    # assign() builds a new frame rather than writing a column into what may
    # be a filtered slice of checkin_df
    tiny_checkin_sf_df = tiny_checkin_sf_df.assign(
        neighborhood=tiny_checkin_sf_df.apply(getNeighborhood, axis=1)
    )

In [156]:
# Preview the San Francisco check-ins with their neighborhood labels
tiny_checkin_sf_df.head(10)

Unnamed: 0,index,user_id,venue_id,venue_category,latitude,longitude,city,state,country,neighborhood
25,25,144940,4ec6e5e55c5ce271bce067fc,City Hall,37.779663,-122.417070,San Francisco,California,US,Van Ness - Civic Center
68,68,201081,4de0117c45dd3eae8764d6ac,Tech Startup,37.781213,-122.402973,San Francisco,California,US,Yerba Buena
69,69,71374,49c1ee05f964a520e0551fe3,Office,37.787635,-122.399866,San Francisco,California,US,Yerba Buena
88,88,116934,4f7b2078e4b0d859b691c07b,Meeting Room,37.774383,-122.399203,San Francisco,California,US,South of Market
136,136,122232,461937cff964a5205c451fe3,Government Building,37.773227,-122.440456,San Francisco,California,US,North Panhandle
156,156,31494,4b7f3359f964a520a01e30e3,Shoe Store,37.789381,-122.407501,San Francisco,California,US,Downtown
197,197,144942,42bf4180f964a520b1251fe3,Hotel,37.787581,-122.408799,San Francisco,California,US,Downtown
261,261,195770,4f7b115ee4b02f1b2cffd486,Event Space,37.794255,-122.400432,San Francisco,California,US,Financial District
325,325,215037,437e6b00f964a520b72a1fe3,Ice Cream Shop,37.805645,-122.422375,San Francisco,California,US,Russian Hill
380,380,93669,4db4b3894b226b343d92deb4,Park,37.767881,-122.441488,San Francisco,California,US,Buena Vista Park


In [147]:
# NOTE(review): the saved output already contains a `neighborhood` column, but
# that column is only assigned in the next cell -- this cell was presumably
# executed out of order; confirm and move it below the assignment.
tiny_checkin_nyc_df.head(10)

Unnamed: 0,index,user_id,venue_id,venue_category,latitude,longitude,city,state,country,neighborhood
40,40,198359,49edce76f964a520f9671fe3,Office,40.755881,-73.985778,Manhattan,New York,US,Midtown
84,84,132121,4d0cc47f903d37041864bf55,Bus Line,40.779422,-73.955341,Manhattan,New York,US,Upper East Side
85,85,139985,4297b480f964a52062241fe3,Science Museum,40.781184,-73.973203,Manhattan,New York,US,Upper West Side
213,213,47124,4a689777f964a520a0ca1fe3,Restaurant,40.755275,-73.978806,Manhattan,New York,US,Midtown
268,268,21011,4bc308db4cdfc9b639bf9621,Dog Run,40.790606,-73.980521,Manhattan,New York,US,Upper West Side
343,343,217977,4b560838f964a520d5fc27e3,Sandwich Place,40.759849,-73.972124,Manhattan,New York,US,Midtown
351,351,7381,4ad607d3f964a520a10421e3,Bank,40.778252,-73.954629,Manhattan,New York,US,Upper East Side
392,392,54351,4b7718d4f964a520077d2ee3,Sporting Goods Shop,40.779014,-73.954039,Manhattan,New York,US,Upper East Side
456,456,7381,4ca66eec5a1e952129d98ace,Clothing Store,40.778439,-73.954515,Manhattan,New York,US,Upper East Side
487,487,37893,447bf8f1f964a520ec331fe3,Electronics Store,40.763823,-73.97301,Manhattan,New York,US,Midtown


In [None]:
# Label the Manhattan check-ins using the New York shapefile.
# getNeighborhood reads the module-level `fiona_collection`, so that name is
# rebound here; the with-block closes the file handle once the lookup is done.
with fiona.open('./raw_data/ZillowNeighborhoods-NY.shx') as fiona_collection:
    # assign() builds a new frame rather than writing a column into what may
    # be a filtered slice of checkin_df
    tiny_checkin_nyc_df = tiny_checkin_nyc_df.assign(
        neighborhood=tiny_checkin_nyc_df.apply(getNeighborhood, axis=1)
    )

In [150]:
# NOTE(review): the saved output already contains a `neighborhood` column, but
# that column is only assigned in the next cell -- this cell was presumably
# executed out of order; confirm and move it below the assignment.
tiny_checkin_chi_df.head(10)

Unnamed: 0,index,user_id,venue_id,venue_category,latitude,longitude,city,state,country,neighborhood
4,4,163570,4b2277b1f964a5203f4724e3,Conference Room,41.886485,-87.623692,Chicago,Illinois,US,The Loop
8,8,44228,4a95f126f964a520952520e3,Train Station,41.875497,-87.649484,Chicago,Illinois,US,University Village - Little Italy
47,47,139980,45840a3af964a520903f1fe3,Zoo,41.92174,-87.633758,Chicago,Illinois,US,Lincoln Park
130,130,106158,4efd482c9adffb90ca930cb1,Mexican Restaurant,41.896599,-87.632175,Chicago,Illinois,US,River North
139,139,251458,49bac1c8f964a520a0531fe3,Sandwich Place,41.867852,-87.642037,Chicago,Illinois,US,South Loop
141,141,94652,4b8043c3f964a5204f6230e3,College Residence Hall,41.874858,-87.627312,Chicago,Illinois,US,South Loop
146,146,91831,4b7087c7f964a52020202de3,College Library,41.871955,-87.650524,Chicago,Illinois,US,University Village - Little Italy
168,168,41570,4b32ad2cf964a520a11125e3,Sushi Restaurant,41.890809,-87.631717,Chicago,Illinois,US,River North
220,220,2680,4a823e8af964a520fcf81fe3,Train Station,41.925171,-87.65289,Chicago,Illinois,US,Sheffield Neighbors
394,394,44228,4a881bc2f964a5202d0520e3,Subway,41.896651,-87.62816,Chicago,Illinois,US,Near North


In [None]:
# Label the Chicago check-ins using the Illinois shapefile.
# getNeighborhood reads the module-level `fiona_collection`, so that name is
# rebound here; the with-block closes the file handle once the lookup is done.
with fiona.open('./raw_data/ZillowNeighborhoods-IL.shx') as fiona_collection:
    # assign() builds a new frame rather than writing a column into what may
    # be a filtered slice of checkin_df
    tiny_checkin_chi_df = tiny_checkin_chi_df.assign(
        neighborhood=tiny_checkin_chi_df.apply(getNeighborhood, axis=1)
    )