This program reads the filtered data and performs the calculations for one focal brand

In [1]:
import pickle
import datetime
import pandas as pd
import numpy as np
from tqdm import tqdm
# Turn off pandas' chained-assignment check for the whole notebook (pandas default is 'warn').
pd.set_option('mode.chained_assignment', None)

In [2]:
# Restore the pickled `top_brands` object from disk.
# pickle.load can execute code embedded in the file, so only load trusted pickles.
with open('top_brands.pickle', mode='rb') as top_brands_handle:
    top_brands = pickle.load(top_brands_handle)

# Preview the first rows as the cell output.
top_brands.head()

Unnamed: 0,SUB_CATEGORY,BRANDS,Total_Spend,Total_Visits,Unique_PLACEKEY_Count
0,"Cosmetics, Beauty Supplies, and Perfume Stores",Sephora,1913415.32,702927,15
1,"Cosmetics, Beauty Supplies, and Perfume Stores",ULTA Beauty,1339844.15,404546,20
2,Full-Service Restaurants,Olive Garden,1058844.15,125651,15
3,Full-Service Restaurants,The Cheesecake Factory,974090.35,473745,5
4,Warehouse Clubs and Supercenters,Target,41977753.97,2053582,50


In [3]:
# Restore the pickled `brand_catalog_visits` object from disk.
# pickle.load can execute code embedded in the file, so only load trusted pickles.
with open('brand_catalog_visits.pickle', mode='rb') as visits_handle:
    brand_catalog_visits = pickle.load(visits_handle)

# Preview the first rows as the cell output.
brand_catalog_visits.head()

Unnamed: 0,date,PLACEKEY,visits_by_day,spend_by_day,brand_visitation,lat,lon,brand_standard,ID,Name,Genre,Type,Classification,Status,Name_Standard
0,2019-06-01,zzw-222@62j-sgj-q2k,5,0.0,Orangetheory Fitness,42.350592,-71.153024,orangetheory fitness,67926.0,Orangetheory Fitness,BRN,Retail,Gyms & Health Clubs,,orangetheory fitness
1,2019-06-02,zzw-222@62j-sgj-q2k,1,0.0,Orangetheory Fitness,42.350592,-71.153024,orangetheory fitness,67926.0,Orangetheory Fitness,BRN,Retail,Gyms & Health Clubs,,orangetheory fitness
2,2019-06-03,zzw-222@62j-sgj-q2k,6,859.0,Orangetheory Fitness,42.350592,-71.153024,orangetheory fitness,67926.0,Orangetheory Fitness,BRN,Retail,Gyms & Health Clubs,,orangetheory fitness
3,2019-06-04,zzw-222@62j-sgj-q2k,6,30.0,Orangetheory Fitness,42.350592,-71.153024,orangetheory fitness,67926.0,Orangetheory Fitness,BRN,Retail,Gyms & Health Clubs,,orangetheory fitness
4,2019-06-05,zzw-222@62j-sgj-q2k,8,193.0,Orangetheory Fitness,42.350592,-71.153024,orangetheory fitness,67926.0,Orangetheory Fitness,BRN,Retail,Gyms & Health Clubs,,orangetheory fitness


In [4]:
# Restore the pickled `social_data` object from disk.
# pickle.load can execute code embedded in the file, so only load trusted pickles.
with open('social_data.pickle', mode='rb') as social_handle:
    social_data = pickle.load(social_handle)

# Preview the first rows as the cell output.
social_data.head()

Unnamed: 0,id,num_review_fb,num_review_ig,num_review_tw,date,brand
1406,5308,866.0,1005.0,6.0,2019-01-01,Acura
1416,5343,2.0,1087.0,0.0,2019-01-01,Aldo
1422,5352,2.0,1.0,0.0,2019-01-01,Allen Edmonds
1434,5381,1136.0,6.0,16.0,2019-01-01,Amtrak
1436,5387,551.0,2926.0,2.0,2019-01-01,Anthropologie


### Calculating the localized number of reviews for all brands having social data available

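Function that performs the Step 2 calculations for a single brand: rolling visit totals per store, each store's share of the brand's visits, and the brand's review counts scaled by that share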

In [5]:
def _rolling_visits_before(visits_by_date, window):
    """Sum each store's previous `window` rows of visits, excluding the current row.

    Parameters
    ----------
    visits_by_date : pandas.DataFrame
        Visit rows already sorted by 'date'; must contain 'lat', 'lon' and
        'visits_by_day'. A store is identified by its ('lat', 'lon') pair.
    window : int
        Number of preceding rows to add up.

    Returns
    -------
    pandas.Series
        Indexed like `visits_by_date`. A row is NaN until its store has
        `window` earlier rows.
    """
    return (
        visits_by_date
        .groupby(['lat', 'lon'])['visits_by_day']
        # shift(1) moves each window total down one row so the current row's
        # own visits are not part of its "past" total.
        .transform(lambda daily_visits: daily_visits.rolling(window).sum().shift(1))
    )


def _localized_reviews(visit_proportions, visit_dates, brand_social_data, review_column):
    """Multiply each row's visit proportion by the brand's review count on that row's date.

    Parameters
    ----------
    visit_proportions : pandas.Series
        Per-row share of the brand's visits.
    visit_dates : pandas.Series
        Date of each row, indexed like `visit_proportions`.
    brand_social_data : pandas.DataFrame
        Social rows of the brand; must contain 'date' and `review_column`.
    review_column : str
        Column of `brand_social_data` holding the review count to distribute.

    Returns
    -------
    pandas.Series
        Indexed like `visit_proportions`. Rows whose date has no social row are NaN.
    """
    def scale_by_brand_reviews(proportions_on_date):
        # Inside transform() the group key (here: the date) is available as `.name`.
        same_day = brand_social_data.loc[
            brand_social_data['date'] == proportions_on_date.name, review_column
        ]
        # When several social rows share the date, the first one is used.
        brand_reviews = same_day.iloc[0] if len(same_day) != 0 else np.nan
        return proportions_on_date * brand_reviews

    # transform() returns a result indexed exactly like its input, so no
    # post-processing of the group level is needed.
    return visit_proportions.groupby(visit_dates).transform(scale_by_brand_reviews)


def get_brand_visit_data(brand_id):
    """Return one brand's visit rows with rolling-visit and localized-review columns.

    Reads the notebook-level frames `brand_catalog_visits` (filtered on 'ID')
    and `social_data` (filtered on 'id'); neither is modified.

    Parameters
    ----------
    brand_id
        Value compared against `brand_catalog_visits['ID']` and `social_data['id']`.

    Returns
    -------
    pandas.DataFrame
        A copy of the brand's rows from `brand_catalog_visits`, in their
        original order, with these columns added:

        - 'visits_past_60_days' / 'visits_past_3_days': per-store sum of the
          previous 60 / 3 rows of 'visits_by_day' (current row excluded).
        - 'total_visits_across_stores_60_days': sum of 'visits_past_60_days'
          over all of the brand's rows sharing the same date.
        - 'proportion_of_visits_60_days': the row's share of that total.
        - 'localized_fb_reviews_60_days', 'localized_ig_reviews_60_days',
          'localized_tw_reviews_60_days': the brand's review count on that date
          multiplied by the row's share; NaN when no social row matches the date.

    Notes
    -----
    NOTE(review): the rolling windows count rows, not calendar days. They only
    span 60 / 3 days if every store has exactly one row per consecutive day --
    confirm against the upstream filtering step.
    NOTE(review): stores are grouped by ('lat', 'lon'), not by 'PLACEKEY';
    verify that no two places share the same coordinates.
    """
    # Copy so the new columns are written to an independent frame rather than
    # to a filtered view of the notebook-level data.
    brand_visit_data = brand_catalog_visits[brand_catalog_visits['ID'] == brand_id].copy()
    brand_social_data = social_data[social_data['id'] == brand_id]

    # The rolling sums need chronological order within each store. The results
    # are assigned back by index label, so the returned row order is unchanged.
    visits_by_date = brand_visit_data.sort_values('date')
    brand_visit_data['visits_past_60_days'] = _rolling_visits_before(visits_by_date, 60)
    brand_visit_data['visits_past_3_days'] = _rolling_visits_before(visits_by_date, 3)

    brand_visit_data['total_visits_across_stores_60_days'] = (
        brand_visit_data.groupby('date')['visits_past_60_days'].transform('sum')
    )
    brand_visit_data['proportion_of_visits_60_days'] = (
        brand_visit_data['visits_past_60_days']
        / brand_visit_data['total_visits_across_stores_60_days']
    )

    # (social review column, output column) for each platform.
    review_columns = (
        ('num_review_fb', 'localized_fb_reviews_60_days'),
        ('num_review_ig', 'localized_ig_reviews_60_days'),
        ('num_review_tw', 'localized_tw_reviews_60_days'),
    )
    for review_column, localized_column in review_columns:
        brand_visit_data[localized_column] = _localized_reviews(
            brand_visit_data['proportion_of_visits_60_days'],
            brand_visit_data['date'],
            brand_social_data,
            review_column,
        )

    return brand_visit_data

In [6]:
# Every brand id that appears in the social data gets processed.
valid_brand_ids = social_data['id'].unique().tolist()

# Collect the per-brand frames first and concatenate once at the end:
# concatenating inside the loop re-copies the accumulated frame on every iteration.
brand_frames = [get_brand_visit_data(brand_id) for brand_id in tqdm(valid_brand_ids)]

# pd.concat raises on an empty list, so keep the result as None when there are no brands.
brand_visit_local_reviews = pd.concat(brand_frames, axis=0) if brand_frames else None

100%|██████████| 427/427 [03:11<00:00,  2.24it/s]


In [7]:
# Persist the combined per-brand result to disk as a pickle.
with open('brand_visit_local_reviews.pickle', mode='wb') as output_handle:
    pickle.dump(brand_visit_local_reviews, output_handle)