## Imports and common functions

In [118]:
import seaborn as sb
import pandas as pd
import numpy as np
import reverse_geocoder as rg

In [25]:
def load_dat_file(file_name, n_components):  
    """
    Loads in .dat files as lists. 
    
    Inputs: 
        file_name (string): name of file to load in
        n_components (int): number of componenets in row
        
    Outputs:
        (list): list of rows in data file
    """
    new_lines = []
    with open(file_name) as input_file:
        for line in input_file:
            new_line = [x.strip() for x in line.split('|')]
            
            if len(new_line) == n_components:
                new_lines.append(new_line)
    return new_lines

## Load in data

In [16]:
# load in data files
raw_checkin_data = load_dat_file('./raw_data/checkins.dat', 6)
raw_ratings_data = load_dat_file('./raw_data/ratings.dat', 3)
raw_socialgraph_data = load_dat_file('./raw_data/socialgraph.dat', 2)
raw_user_data = load_dat_file('./raw_data/users.dat', 3)
raw_venues_data = load_dat_file('./raw_data/venues.dat', 3)

In [23]:
# convert above data to pandas for easy analysis 
checkin_df = pd.DataFrame(raw_checkin_data[1:], columns=raw_checkin_data[0])
ratings_df = pd.DataFrame(raw_ratings_data[1:], columns=raw_ratings_data[0])
socialgraph_df = pd.DataFrame(raw_socialgraph_data[1:], columns=raw_socialgraph_data[0])
user_df = pd.DataFrame(raw_user_data[1:], columns=raw_user_data[0])
venues_df = pd.DataFrame(raw_venues_data[1:], columns=raw_venues_data[0])

In [24]:
# export data as csv 
checkin_df.to_csv('./raw_data/checkin.csv', index=False)
ratings_df.to_csv('./raw_data/ratings.csv', index=False)
socialgraph_df.to_csv('./raw_data/socialgraph.csv', index=False)
user_df.to_csv('./raw_data/user.csv', index=False)
venues_df.to_csv('./raw_data/venues.csv', index=False)

## Data Cleaning

In [114]:
venues_df = venues_df[(venues_df['latitude'] != '') & (venues_df['longitude'] != '')]
venues_df['latitude'] = pd.to_numeric(venues_df['latitude'])
venues_df['longitude'] = pd.to_numeric(venues_df['longitude'])

In [115]:
# bound box to (roughly) united states only
top_left = (48.313833, -126.350714)
bottom_left = (24.489072, -126.350714)
top_right = (48.313833, -67.288219)
bottom_right = (24.489072, -67.288219)

subset_venues_df = venues_df[(venues_df['latitude'] < top_left[0]) & (venues_df['latitude'] > bottom_right[0]) &
                             (venues_df['longitude'] > top_left[1]) & (venues_df['longitude'] < bottom_right[1])]

In [None]:
subset_venues_df['city'] = ''
subset_venues_df['country'] = ''

for index, row in subset_venues_df.iterrows():
    current_reverse_geocode = rg.search((row['latitude'], row['longitude']))[0]
    subset_venues_df.set_value(index, 'city', current_reverse_geocode['name'])
    subset_venues_df.set_value(index, 'country', current_reverse_geocode['cc'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
