In [1]:
import pickle
import pandas as pd
import datetime
import numpy as np

In [2]:
top_brands = pd.read_csv('data/top_brands_by_category.csv')
top_brands.head()

Unnamed: 0,SUB_CATEGORY,BRANDS,Total_Spend,Total_Visits,Unique_PLACEKEY_Count
0,"Cosmetics, Beauty Supplies, and Perfume Stores",Sephora,1913415.32,702927,15
1,"Cosmetics, Beauty Supplies, and Perfume Stores",ULTA Beauty,1339844.15,404546,20
2,Full-Service Restaurants,Olive Garden,1058844.15,125651,15
3,Full-Service Restaurants,The Cheesecake Factory,974090.35,473745,5
4,Warehouse Clubs and Supercenters,Target,41977753.97,2053582,50


In [3]:
focal_brands = top_brands['BRANDS'].to_list()
focal_brands

['Sephora',
 'ULTA Beauty',
 'Olive Garden',
 'The Cheesecake Factory',
 'Target',
 'Walmart',
 'Anthropologie',
 "Victoria's Secret"]

In [4]:
brands_catalog = pd.read_csv('data/catalog.tsv', sep='\t', header=None, names=['ID', 'Name', 'Genre', 'Type', 'Classification', 'Status'])
brands_catalog['Name_Standard'] = brands_catalog['Name'].apply(lambda x: x.strip().lower()) # For comparison with visitation_data.csv
brands_catalog.head()

Unnamed: 0,ID,Name,Genre,Type,Classification,Status,Name_Standard
0,1,+44 (Band),ART,Interscope-Geffen-A&M,Alternative,Inactive,+44 (band)
1,2,10 Years,ART,Unsigned,Rock,Active,10 years
2,3,12 Stones,ART,Concord Records,Rock,Active,12 stones
3,4,13th Floor Elevators,ART,International Artists,Rock,Inactive,13th floor elevators
4,5,2 Pistols,ART,Universal Motown Records,Hip Hop/Rap,Active,2 pistols


In [5]:
visitation_data = pd.read_csv('data/revision_visits_revenue_2019.csv')
visitation_data['brand_standard'] = visitation_data['brand'].apply(lambda x: x.strip().lower()) # For comparison with catalog.tsv
visitation_data['date'] = visitation_data['date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d').date())
visitation_data.head()

Unnamed: 0,date,PLACEKEY,visits_by_day,spend_by_day,brand,lat,lon,brand_standard
0,2019-06-01,zzw-222@62j-sgj-q2k,5,0.0,Orangetheory Fitness,42.350592,-71.153024,orangetheory fitness
1,2019-06-02,zzw-222@62j-sgj-q2k,1,0.0,Orangetheory Fitness,42.350592,-71.153024,orangetheory fitness
2,2019-06-03,zzw-222@62j-sgj-q2k,6,859.0,Orangetheory Fitness,42.350592,-71.153024,orangetheory fitness
3,2019-06-04,zzw-222@62j-sgj-q2k,6,30.0,Orangetheory Fitness,42.350592,-71.153024,orangetheory fitness
4,2019-06-05,zzw-222@62j-sgj-q2k,8,193.0,Orangetheory Fitness,42.350592,-71.153024,orangetheory fitness


In [6]:
visitation_data_merged = pd.merge(left=visitation_data, right=brands_catalog, how='left', left_on='brand_standard', right_on='Name_Standard')
visitation_data_merged = visitation_data_merged.drop(['brand_standard', 'Name_Standard'], axis=1)
visitation_data_merged.head()

Unnamed: 0,date,PLACEKEY,visits_by_day,spend_by_day,brand,lat,lon,ID,Name,Genre,Type,Classification,Status
0,2019-06-01,zzw-222@62j-sgj-q2k,5,0.0,Orangetheory Fitness,42.350592,-71.153024,67926.0,Orangetheory Fitness,BRN,Retail,Gyms & Health Clubs,
1,2019-06-02,zzw-222@62j-sgj-q2k,1,0.0,Orangetheory Fitness,42.350592,-71.153024,67926.0,Orangetheory Fitness,BRN,Retail,Gyms & Health Clubs,
2,2019-06-03,zzw-222@62j-sgj-q2k,6,859.0,Orangetheory Fitness,42.350592,-71.153024,67926.0,Orangetheory Fitness,BRN,Retail,Gyms & Health Clubs,
3,2019-06-04,zzw-222@62j-sgj-q2k,6,30.0,Orangetheory Fitness,42.350592,-71.153024,67926.0,Orangetheory Fitness,BRN,Retail,Gyms & Health Clubs,
4,2019-06-05,zzw-222@62j-sgj-q2k,8,193.0,Orangetheory Fitness,42.350592,-71.153024,67926.0,Orangetheory Fitness,BRN,Retail,Gyms & Health Clubs,


In [7]:
social_data = pd.read_csv('data/social.csv')
social_data['date'] = social_data['date'].apply(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d').date())
social_data.head()

Unnamed: 0,id,num_review_fb,num_review_ig,num_review_tw,date,brand
0,9,5286.0,13737.0,18.0,2019-01-01,2Pac
1,11,1526.0,2559.0,210.0,2019-01-01,30 Seconds To Mars
2,12,20.0,604.0,17.0,2019-01-01,311 (Band)
3,23,1.0,1.0,0.0,2019-01-01,6ixth Sense
4,26,50.0,507.0,0.0,2019-01-01,8Ball & MJG


### Testing for a Brand

In [8]:
focal_brand = focal_brands[0]
print(focal_brand)

Sephora


In [9]:
focal_visits = visitation_data_merged[visitation_data_merged['brand'] == focal_brand]
focal_visits.head()

Unnamed: 0,date,PLACEKEY,visits_by_day,spend_by_day,brand,lat,lon,ID,Name,Genre,Type,Classification,Status
20070,2019-06-01,22f-222@62j-shx-fcq,78,221.06,Sephora,42.360375,-71.056208,6780.0,Sephora,BRN,Retail,Cosmetics,
20071,2019-06-02,22f-222@62j-shx-fcq,49,573.31,Sephora,42.360375,-71.056208,6780.0,Sephora,BRN,Retail,Cosmetics,
20072,2019-06-03,22f-222@62j-shx-fcq,44,2963.59,Sephora,42.360375,-71.056208,6780.0,Sephora,BRN,Retail,Cosmetics,
20073,2019-06-04,22f-222@62j-shx-fcq,60,1120.24,Sephora,42.360375,-71.056208,6780.0,Sephora,BRN,Retail,Cosmetics,
20074,2019-06-05,22f-222@62j-shx-fcq,63,921.26,Sephora,42.360375,-71.056208,6780.0,Sephora,BRN,Retail,Cosmetics,


In [10]:
focal_social_data  = social_data[social_data['id'] == int(focal_visits['ID'].iloc[0])]
focal_social_data.head()

Unnamed: 0,id,num_review_fb,num_review_ig,num_review_tw,date,brand
1955,6780,8.0,36141.0,0.0,2019-01-01,Sephora
36793,6780,3655.0,49529.0,33.0,2019-01-02,Sephora
72773,6780,43.0,17887.0,26.0,2019-01-03,Sephora
108924,6780,10.0,33298.0,30.0,2019-01-04,Sephora
166063,6780,0.0,18389.0,15.0,2019-01-05,Sephora


In [11]:
focal_visits.loc[:,'visits_past_60_days'] = focal_visits.groupby(by=['lat', 'lon'], group_keys=False).apply(lambda x: x.sort_values('date').rolling(60)['visits_by_day'].sum().shift(1))

  focal_visits.loc[:,'visits_past_60_days'] = focal_visits.groupby(by=['lat', 'lon'], group_keys=False).apply(lambda x: x.sort_values('date').rolling(60)['visits_by_day'].sum().shift(1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  focal_visits.loc[:,'visits_past_60_days'] = focal_visits.groupby(by=['lat', 'lon'], group_keys=False).apply(lambda x: x.sort_values('date').rolling(60)['visits_by_day'].sum().shift(1))


In [12]:
focal_visits.loc[:,'total_visits_across_stores'] = focal_visits.groupby(by=['date'])['visits_past_60_days'].transform('sum')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  focal_visits.loc[:,'total_visits_across_stores'] = focal_visits.groupby(by=['date'])['visits_past_60_days'].transform('sum')


In [18]:
focal_visits.loc[:,'proportion_of_visits'] = focal_visits['visits_past_60_days']/focal_visits['total_visits_across_stores']

In [64]:
focal_visits.head()

Unnamed: 0,date,PLACEKEY,visits_by_day,spend_by_day,brand,lat,lon,ID,Name,Genre,Type,Classification,Status,visits_past_60_days,total_visits_across_stores,proportion_of_visits
20070,2019-06-01,22f-222@62j-shx-fcq,78,221.06,Sephora,42.360375,-71.056208,6780.0,Sephora,BRN,Retail,Cosmetics,,2920.0,145706.0,0.02004
20071,2019-06-02,22f-222@62j-shx-fcq,49,573.31,Sephora,42.360375,-71.056208,6780.0,Sephora,BRN,Retail,Cosmetics,,2960.0,146600.0,0.020191
20072,2019-06-03,22f-222@62j-shx-fcq,44,2963.59,Sephora,42.360375,-71.056208,6780.0,Sephora,BRN,Retail,Cosmetics,,2976.0,146863.0,0.020264
20073,2019-06-04,22f-222@62j-shx-fcq,60,1120.24,Sephora,42.360375,-71.056208,6780.0,Sephora,BRN,Retail,Cosmetics,,2971.0,147062.0,0.020202
20074,2019-06-05,22f-222@62j-shx-fcq,63,921.26,Sephora,42.360375,-71.056208,6780.0,Sephora,BRN,Retail,Cosmetics,,2975.0,146865.0,0.020257


In [65]:
focal_social_data.head()

Unnamed: 0,id,num_review_fb,num_review_ig,num_review_tw,date,brand
1955,6780,8.0,36141.0,0.0,2019-01-01,Sephora
36793,6780,3655.0,49529.0,33.0,2019-01-02,Sephora
72773,6780,43.0,17887.0,26.0,2019-01-03,Sephora
108924,6780,10.0,33298.0,30.0,2019-01-04,Sephora
166063,6780,0.0,18389.0,15.0,2019-01-05,Sephora


In [67]:
merged_focal_data = pd.merge(left=focal_visits, right=focal_social_data, how='left', on='date')

In [72]:
merged_focal_data['localized_reviews_fb'] = merged_focal_data['num_review_fb'] * merged_focal_data['proportion_of_visits']
merged_focal_data['localized_reviews_ig'] = merged_focal_data['num_review_ig'] * merged_focal_data['proportion_of_visits']
merged_focal_data['localized_reviews_tw'] = merged_focal_data['num_review_tw'] * merged_focal_data['proportion_of_visits']

In [73]:
merged_focal_data.head()

Unnamed: 0,date,PLACEKEY,visits_by_day,spend_by_day,brand_x,lat,lon,ID,Name,Genre,...,total_visits_across_stores,proportion_of_visits,id,num_review_fb,num_review_ig,num_review_tw,brand_y,localized_reviews_fb,localized_reviews_ig,localized_reviews_tw
0,2019-06-01,22f-222@62j-shx-fcq,78,221.06,Sephora,42.360375,-71.056208,6780.0,Sephora,BRN,...,145706.0,0.02004,6780.0,13.0,23540.0,7.0,Sephora,0.260525,471.749962,0.140282
1,2019-06-02,22f-222@62j-shx-fcq,49,573.31,Sephora,42.360375,-71.056208,6780.0,Sephora,BRN,...,146600.0,0.020191,6780.0,2.0,29906.0,8.0,Sephora,0.040382,603.831924,0.161528
2,2019-06-03,22f-222@62j-shx-fcq,44,2963.59,Sephora,42.360375,-71.056208,6780.0,Sephora,BRN,...,146863.0,0.020264,6780.0,0.0,11853.0,74.0,Sephora,0.0,240.186623,1.49952
3,2019-06-04,22f-222@62j-shx-fcq,60,1120.24,Sephora,42.360375,-71.056208,6780.0,Sephora,BRN,...,147062.0,0.020202,6780.0,44.0,15565.0,75.0,Sephora,0.888904,314.44979,1.515177
4,2019-06-05,22f-222@62j-shx-fcq,63,921.26,Sephora,42.360375,-71.056208,6780.0,Sephora,BRN,...,146865.0,0.020257,6780.0,643.0,17600.0,145.0,Sephora,13.025057,356.517891,2.937221


In [13]:
with open('data/travel_time.pkl', 'rb') as file:
    travel_time = pickle.load(file)

In [14]:
travel_time

{('zzw-224@62k-p96-s5z', 'zzw-223@62k-ns4-pn5'): '20 mins',
 ('zzw-224@62k-p96-s5z', 'zzy-222@62k-pd8-975'): '20 mins',
 ('zzw-224@62k-p96-s5z', '237-222@62k-p8v-z4v'): '16 mins',
 ('zzw-224@62k-p96-s5z', '222-222@62k-p8v-2p9'): '12 mins',
 ('zzw-224@62k-p96-s5z', '229-222@62k-p76-d9z'): '14 mins',
 ('zzw-224@62k-p96-s5z', '23j-222@62k-p8p-kxq'): '15 mins',
 ('zzw-224@62k-p96-s5z', 'zzw-22v@62k-p73-2p9'): '15 mins',
 ('zzw-224@62k-p96-s5z', 'zzw-22c@62k-p76-cwk'): '13 mins',
 ('zzw-224@62k-p96-s5z', '222-222@62j-qt2-yy9'): '11 mins',
 ('zzw-224@62k-p96-s5z', '22g-222@62j-sgs-vzz'): '26 mins',
 ('zzw-224@62k-p96-s5z', 'zzy-222@62j-qsv-hnq'): '10 mins',
 ('zzw-224@62k-p96-s5z', 'zzw-222@62k-p94-ygk'): '10 mins',
 ('zzw-224@62k-p96-s5z', '225-222@62k-p9b-k9f'): '5 mins',
 ('zzw-224@62k-p96-s5z', '223-222@62k-p6v-8d9'): '15 mins',
 ('zzw-224@62k-p96-s5z', 'zzw-222@62k-p74-mff'): '11 mins',
 ('zzw-224@62k-p96-s5z', 'zzw-224@62k-p9f-835'): '15 mins',
 ('zzw-224@62k-p96-s5z', 'zzw-222@62k-p9p