## Trending Places in SF based on time
<b>Objective:</b> Predict which areas of SF will be trending on particular days and times, based on Foursquare data.

In [133]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn import tree
import seaborn as sns
import statsmodels.formula.api as smf
import boto
from boto.s3.connection import S3Connection
import json
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

%matplotlib inline  

In [2]:
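# Get the S3 connection ready (connection code is not included here).
# Later cells expect this cell to leave a boto bucket object named `bucket` in scope.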

In [3]:
#creating the dataframe for data
def create_dataframe(text_file):
    """Flatten a parsed venue payload into a DataFrame with one row per venue.

    text_file is the decoded JSON mapping; its 'venues' entry holds the venue
    records. Each row carries the venue name, the current 'hereNow' count,
    the location fields, the lifetime stats and the shortName of the venue's
    first category. A venue whose location has no 'postalCode' gets the
    placeholder '00000'.
    """
    def _venue_row(venue):
        # Field order mirrors the lookup order of the raw record.
        return {
            'venue_name': venue['name'],
            'checkin_count': venue['hereNow']['count'],
            'postal_code': venue['location'].get('postalCode', '00000'),
            'lat': venue['location']['lat'],
            'lng': venue['location']['lng'],
            'tips_count': venue['stats']['tipCount'],
            'lifetime_checkins': venue['stats']['checkinsCount'],
            'lifetime_unique_users': venue['stats']['usersCount'],
            'category': venue['categories'][0]['shortName'],
        }

    return pd.DataFrame([_venue_row(venue) for venue in text_file['venues']])

In [4]:
#Read Files into dataframes
# NOTE(review): `bucket` is not defined in any visible cell; presumably it is the boto S3
# bucket prepared in the connection cell - confirm before running on a fresh kernel.
key_names = [key.name.encode('utf-8') for key in bucket.list()]

# Every S3 object is downloaded to this one scratch path, overwriting the previous download.
file_name = '/foursquare_files/foursquarefile.txt'

dataframes = []
for key in key_names:
    a_file = bucket.get_key(key)
    a_file.get_contents_to_filename(file_name)
    # Parse inside a context manager so the handle is closed before the next
    # iteration overwrites the same path.
    with open(file_name) as downloaded:
        text = json.load(downloaded)
    temp_df = create_dataframe(text)
    # Tag every row with its source key name; the cleaning cell later parses it into a timestamp.
    temp_df['datetime'] = key
    dataframes.append(temp_df)

In [5]:
#cleaning up data
# Stack the per-file frames; at this point 'datetime' still holds the S3 key name of each file.
final_Df = pd.concat(dataframes)
# Remove '.txt' and turn the ':T:' separator into a space so the key name can be parsed as a timestamp.
stamp_text = final_Df['datetime'].str.replace('.txt','').str.replace(':T:',' ')
final_Df['category'] = final_Df['category'].map(lambda name: name.encode('utf-8'))
final_Df['datetime'] = pd.to_datetime(stamp_text)
# Keep only the characters of the ISO timestamp between position 11 and the last six.
final_Df['time_bucket'] = final_Df['datetime'].map(lambda stamp: stamp.isoformat()[11:-6])

In [None]:
# `df` is a second name for final_Df, not a copy: columns added through `df` also appear on final_Df.
df=final_Df

## CLUSTERING

In [None]:
# perform clustering with 2 clusters
# Build the clustering inputs once so fit and predict see exactly the same columns.
# NOTE(review): columns are dropped by position, so the selection depends on df's column order
# and count when this cell runs - confirm the indices against df.columns.
cluster_features = df.drop(df.columns[[0,2,3,6,9,10,11,12,13,14]], axis=1)
# init='random' starts from randomly chosen centroids; a fixed seed makes the labels repeatable.
places_cluster = KMeans(n_clusters=2, init='random', random_state=1)
places_cluster.fit(cluster_features)
y_kmeans = places_cluster.predict(cluster_features)

In [None]:
# Silhouette coefficient (Euclidean distance) of the fitted 2-cluster labels,
# evaluated on the same column subset the model was fitted on.
silhouette_inputs = df.drop(df.columns[[0,2,3,6,9,10,11,12,13,14]], axis=1)
metrics.silhouette_score(silhouette_inputs, places_cluster.labels_, metric='euclidean')

In [None]:
# perform k means for k = 1 through 14 (range(1, 15) stops before 15)
k_rng = range(1,15)
# The feature frame is identical for every k, so build it once instead of once per model.
kmeans_features = df.drop(df.columns[[0,2,3,6,9,10,11,12,13,14]], axis=1)
est = [KMeans(n_clusters=k).fit(kmeans_features) for k in k_rng]

In [None]:
# `silhouette_score` is plotted below but is not computed in any visible cell, so derive it here:
# one coefficient per fitted model in `est`, skipping the k=1 model (the coefficient needs at
# least two clusters) so the values line up with k_rng[1:].
scored_features = df.drop(df.columns[[0,2,3,6,9,10,11,12,13,14]], axis=1)
silhouette_score = [metrics.silhouette_score(scored_features, model.labels_, metric='euclidean')
                    for model in est[1:]]

plt.figure(figsize=(7, 8))
plt.subplot(211)
plt.title('Using the elbow method to inform k choice')
plt.plot(k_rng[1:], silhouette_score, 'b*-')
plt.xlim([1,15])
plt.grid(True)
plt.xlabel('Number of clusters (k)')
plt.ylabel('Silhouette Coefficient')
# Circle the k=2 point, which is the first entry of silhouette_score.
plt.plot(2,silhouette_score[0], 'o', markersize=12, markeredgewidth=1.5,
markerfacecolor='None', markeredgecolor='r')

In [8]:
#~100 categories turned to ~10 instead
unique_categories = list(df.category.unique())
# Fine-grained category shortName -> coarse bucket.
# 'Grocery Store' is mapped to 'Store' like the other grocery categories
# ('Organic Grocery', 'Supermarket') rather than to an empty label.
# NOTE(review): 'Caf\xc3\xa9' spells the UTF-8 bytes of the accented name; it presumably relies on
# the categories having been UTF-8 encoded as Python 2 byte strings - confirm on the target runtime.
mapping = {"Building":"Office","Gym / Fitness":"Gym","Cocktail":"Bar","Mexican":"Restaurant","Event Space":"Events",
           "Dive Bar":"Bar","City":"Other","Cycle Studio":"Gym","Tech Startup":"Office","Non-Profit":"Office",
            "Plaza":"Other","Cineplex":"Movies","Steakhouse":"Restaurant","Lounge":"Bar","Bowling Alley":"Other",
            "Chinese":"Restaurant","Asian":"Restaurant","Italian":"Restaurant","Yogurt":"Restaurant","Arepas":"Restaurant",
            "Sushi":"Restaurant","Speakeasy":"Bar","New American":"Restaurant","Nightclub":"Bar","Wine Bar":"Bar",
            "Music Venue":"Events","Coffee Shop":"Coffee","Concert Hall":"Events","Food Truck":"Restaurant","Park":"Other",
            "American":"Restaurant","Mall":"Store","Street Food Gathering":"Restaurant","Japanese":"Restaurant",
            "Food Court":"Restaurant","Beer Garden":"Bar","Southern / Soul":"Restaurant","Theater":"Movies",
            "Gastropub":"Bar","Tacos":"Restaurant","Apparel":"Store","Pizza":"Restaurant","Hawaiian":"Restaurant",
            'Peruvian':'Restaurant','Other Event':'Other','Movie Theater':'Movies','Ice Cream':'Restaurant',
            'Souvlaki':'Restaurant','Hotel':'Other','Food & Drink':'Restaurant','Bubble Tea':'Restaurant','Art Gallery':'Museum','Noodles':'Restaurant',
            'Grocery Store':'Store','Burgers':'Restaurant','City Hall':'Other','Bakery':'Restaurant',
            'Breakfast':'Restaurant','Thai':'Restaurant','Sandwiches':'Restaurant','Electronics':'Store',
            'Neighborhood':'Other',
            'Rock Club':'Other','Gay Bar':'Bar','Karaoke':'Bar','Festival':'Events','Warehouse Store':'Store',
            'Salon / Barbershop':'Salon','Indian':'Restaurant','Supermarket':'Store','Department Store':'Store',
            'Art Museum':'Museum','Museum':'Museum',"Women's Store":'Store','Tea Room':'Restaurant',
            'Residential':'Other','Hotel Bar':'Bar','Diner':'Restaurant','Convention Center':'Events',
            'Synagogue':'Other','Smoothie Shop':'Restaurant','Fried Chicken':'Restaurant','Sporting Goods':'Store',
            'French':'Restaurant','Organic Grocery':'Store','Donuts':'Restaurant',
            'Yoga Studio':'Gym','Ramen':'Restaurant','Dance Studio':'Gym','BBQ':'Restaurant','Vietnamese':'Restaurant',
            'Wings':'Restaurant','Whisky Bar':'Bar','Cantonese':'Restaurant','Coworking Space':'Other'
          ,'Sports Bar':'Bar','Office':'Office','Brewery':'Bar','Train Station':'Other','Pub':'Bar'
          ,'Winery':'Bar','Beer Store':'Bar','Caf\xc3\xa9':'Coffee', "Bar":"Bar"}

# Fail up front with the complete list of categories that have no mapping, instead of a bare
# KeyError that names only the first one encountered.
unmapped_categories = [name for name in unique_categories if name not in mapping]
if unmapped_categories:
    raise KeyError('No mapped category defined for: %r' % (unmapped_categories,))

# Every category is known to be in `mapping` at this point, so a plain dict lookup per value suffices.
df['mapped_category'] = df['category'].map(mapping)
# Integer code per coarse bucket. The '' entry is kept so data labelled with the old
# empty bucket can still be encoded.
mapped_category_ints = {'Other':1, 'Office':2, 'Gym':3, 'Bar':4, 'Restaurant':5, 'Events':6, 'Movies':7,
       'Coffee':8, 'Store':9, 'Museum':10, '':11, 'Salon':12}
df['mapped_category_ints'] = df['mapped_category'].map(mapped_category_ints)

In [None]:
#look at in excel
# The context manager saves and closes the workbook when the block exits,
# so no explicit save call is needed.
with pd.ExcelWriter('output1.xlsx') as writer:
    df.to_excel(writer, sheet_name='Sheet1')

In [11]:
# Call .decode('utf-8') on every 'category' value (they were UTF-8 encoded in the cleaning cell).
# The column is overwritten in place, so this is meant to run once per load.
# NOTE(review): re-running it on already-decoded values may fail - confirm.
df['category']=df['category'].map(lambda x: x.decode('utf-8'))

In [24]:
from sklearn.cross_validation import cross_val_score

In [115]:
# Load the modelling table from CSV.
# NOTE(review): this is an absolute path to a .csv, whereas the notebook itself only writes
# 'output1.xlsx'. Later cells also read columns ('week', 'Districts') that no visible cell
# creates - presumably the file was prepared outside the notebook. Confirm its provenance.
output = pd.read_csv('/hw/output1.csv')

In [117]:
# Encode the weekday name as an integer, counting from Sunday = 0 through Saturday = 6.
weekday_order = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
weekday_codes = {day: code for code, day in enumerate(weekday_order)}
output['mapped_week'] = output['week'].map(weekday_codes)

In [124]:
# Combined day/time feature: the plain sum of the time_bucket and mapped_week codes.
# NOTE(review): different (time_bucket, mapped_week) pairs can add up to the same value -
# confirm this is the intended encoding.
output['time_week'] = output['time_bucket'] + output['mapped_week']

In [121]:
# Display the first row of `output`.
output.head(1)

Unnamed: 0.1,Unnamed: 0,category,checkin_count,datetime,lat,lifetime_checkins,lifetime_unique_users,lng,postal_code,Districts,...,venue_name,time_bucket,simple_time,mapped_category,mapped_category_ints,predictions,week,simple_time_1,mapped_week,time_week
0,0,City,18,2015-07-28 19:22:00,37.773836,88323,31292,-122.419624,94103,Mission,...,City of San Francisco,19,afternoon,Other,1,1,Tuesday,0,2,2


In [128]:
from sklearn.grid_search import GridSearchCV

# Model inputs are the three time-derived columns; the target is the 'Districts' column.
features = ['time_bucket', 'mapped_week', 'time_week']
X = output[features]
y = output['Districts']

# Hyper-parameter grid: tree depth 1-2, both split criteria, 1-2 features considered per split.
depth_range = range(1, 3)
criterion_range = ['gini', 'entropy']
max_feature_range = range(1,3)
param_grid = dict(max_depth=depth_range, criterion=criterion_range, max_features=max_feature_range)

# Search over a fresh decision tree instead of `ctree`, which is only created in a later cell
# and would raise NameError when the notebook is run top to bottom.
grid = GridSearchCV(tree.DecisionTreeClassifier(), param_grid, cv=5, scoring='accuracy', verbose=0)
grid.fit(X, y)

GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best'),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'max_features': [1, 2], 'criterion': ['gini', 'entropy'], 'max_depth': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, score_func=None,
       scoring='accuracy', verbose=0)

In [126]:
# Train a decision tree with default settings on X/y, then display its per-feature
# importances (one value per column of X, in column order).
ctree = tree.DecisionTreeClassifier().fit(X, y)
ctree.feature_importances_

array([ 0.37442683,  0.37721474,  0.24835844])

In [129]:
# Mean accuracy of the decision tree across 10 cross-validation folds.
tree_fold_accuracy = cross_val_score(ctree, X, y, cv=10, scoring='accuracy')
tree_fold_accuracy.mean()

0.14376418187572071

In [103]:
from sklearn import svm, linear_model, datasets
# Support-vector classifier created with no arguments, trained on the same X/y as the decision tree.
clf = svm.SVC()
clf.fit(X,y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=0.0,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [123]:
# Mean accuracy of the support-vector classifier across 10 cross-validation folds.
svc_fold_accuracy = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
svc_fold_accuracy.mean()

0.27452536028245211

In [113]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 100 trees with a fixed seed; oob_score=True asks for the
# out-of-bag accuracy estimate to be recorded during fitting.
rfclf = RandomForestClassifier(
    n_estimators=100,
    max_features='auto',
    oob_score=True,
    random_state=1
)
rfclf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=1, verbose=0, warm_start=False)

In [93]:
# Out-of-bag score recorded by the fitted forest (requested via oob_score=True).
rfclf.oob_score_

0.46024799416484319

In [114]:
# Mean accuracy of the random forest across 10 cross-validation folds.
# NOTE(review): the recorded out-of-bag score (~0.46) is far above the recorded mean here (~0.16);
# the two estimates disagree, possibly because of how rows are ordered across folds - TODO confirm.
forest_fold_accuracy = cross_val_score(rfclf, X, y, cv=10, scoring='accuracy')
forest_fold_accuracy.mean()

0.15840052727642884