In [1]:
import pickle
import pandas as pd
import datetime
import numpy as np

### Getting all the focal brands

In [2]:
# Load the top-brands-by-category table and preview its first five rows.
top_brands = pd.read_csv(filepath_or_buffer='data/top_brands_by_category.csv')
top_brands.head(n=5)

Unnamed: 0,SUB_CATEGORY,BRANDS,Total_Spend,Total_Visits,Unique_PLACEKEY_Count
0,"Cosmetics, Beauty Supplies, and Perfume Stores",Sephora,1913415.32,702927,15
1,"Cosmetics, Beauty Supplies, and Perfume Stores",ULTA Beauty,1339844.15,404546,20
2,Full-Service Restaurants,Olive Garden,1058844.15,125651,15
3,Full-Service Restaurants,The Cheesecake Factory,974090.35,473745,5
4,Warehouse Clubs and Supercenters,Target,41977753.97,2053582,50


In [3]:
# Plain Python list of the names held in the BRANDS column, in file order.
focal_brands = top_brands.loc[:, 'BRANDS'].tolist()
focal_brands

['Sephora',
 'ULTA Beauty',
 'Olive Garden',
 'The Cheesecake Factory',
 'Target',
 'Walmart',
 'Anthropologie',
 "Victoria's Secret"]

### Getting the brands catalog

In [4]:
# The catalog is read with header=None, so the column names are supplied here.
brands_catalog = pd.read_csv(
    'data/catalog.tsv',
    sep='\t',
    header=None,
    names=['ID', 'Name', 'Genre', 'Type', 'Classification', 'Status'],
)
# Normalised join key (trimmed, lower-cased) for matching catalog names against
# the standardised brand names of the visits data. The vectorised .str accessor
# propagates a missing name as NaN, whereas calling x.strip() row by row would
# raise AttributeError on the first NaN.
brands_catalog['Name_Standard'] = brands_catalog['Name'].str.strip().str.lower()
brands_catalog.head()

Unnamed: 0,ID,Name,Genre,Type,Classification,Status,Name_Standard
0,1,+44 (Band),ART,Interscope-Geffen-A&M,Alternative,Inactive,+44 (band)
1,2,10 Years,ART,Unsigned,Rock,Active,10 years
2,3,12 Stones,ART,Concord Records,Rock,Active,12 stones
3,4,13th Floor Elevators,ART,International Artists,Rock,Inactive,13th floor elevators
4,5,2 Pistols,ART,Universal Motown Records,Hip Hop/Rap,Active,2 pistols


### Getting the brand visitation data

In [5]:
# Load the per-day visits/spend rows and derive the columns used further down.
brands_visits = (
    pd.read_csv('data/revision_visits_revenue_2019.csv')
    .assign(
        # Normalised join key (trimmed, lower-cased) for matching against the
        # catalog's Name_Standard. The .str accessor is vectorised and passes a
        # missing brand through as NaN instead of raising AttributeError.
        brand_standard=lambda df: df['brand'].str.strip().str.lower(),
        # Parse the whole column in one call; .dt.date keeps the values as
        # datetime.date objects, exactly what the per-row strptime produced.
        # A missing date becomes NaT rather than raising TypeError.
        date=lambda df: pd.to_datetime(df['date'], format='%Y-%m-%d').dt.date,
    )
    # Keep the original spelling under a name that cannot be confused with the
    # catalog's own name column once the two tables are joined.
    .rename(columns={'brand': 'brand_visitation'})
)
brands_visits.head()

Unnamed: 0,date,PLACEKEY,visits_by_day,spend_by_day,brand_visitation,lat,lon,brand_standard
0,2019-06-01,zzw-222@62j-sgj-q2k,5,0.0,Orangetheory Fitness,42.350592,-71.153024,orangetheory fitness
1,2019-06-02,zzw-222@62j-sgj-q2k,1,0.0,Orangetheory Fitness,42.350592,-71.153024,orangetheory fitness
2,2019-06-03,zzw-222@62j-sgj-q2k,6,859.0,Orangetheory Fitness,42.350592,-71.153024,orangetheory fitness
3,2019-06-04,zzw-222@62j-sgj-q2k,6,30.0,Orangetheory Fitness,42.350592,-71.153024,orangetheory fitness
4,2019-06-05,zzw-222@62j-sgj-q2k,8,193.0,Orangetheory Fitness,42.350592,-71.153024,orangetheory fitness


In [6]:
# Number of distinct standardised brand names; dropna=False keeps the count
# equal to len(unique()), which would include NaN as one value.
print(
    'Unique Brands having visitation data: ',
    brands_visits['brand_standard'].nunique(dropna=False),
)

Unique Brands having visitation data:  999


Find all brands that have visitation data and are also present in the catalog (i.e., have a valid ID that can be used to fetch the social data).

In [7]:
# Left join: every visit row is kept, and rows whose standardised brand name has
# no catalog counterpart receive NaN in the catalog columns.
# NOTE(review): if several catalog rows share the same Name_Standard, each
# matching visit row is repeated once per catalog row - confirm the key is unique.
brand_catalog_visits = brands_visits.merge(
    brands_catalog,
    how='left',
    left_on='brand_standard',
    right_on='Name_Standard',
)
brand_catalog_visits.head()

Unnamed: 0,date,PLACEKEY,visits_by_day,spend_by_day,brand_visitation,lat,lon,brand_standard,ID,Name,Genre,Type,Classification,Status,Name_Standard
0,2019-06-01,zzw-222@62j-sgj-q2k,5,0.0,Orangetheory Fitness,42.350592,-71.153024,orangetheory fitness,67926.0,Orangetheory Fitness,BRN,Retail,Gyms & Health Clubs,,orangetheory fitness
1,2019-06-02,zzw-222@62j-sgj-q2k,1,0.0,Orangetheory Fitness,42.350592,-71.153024,orangetheory fitness,67926.0,Orangetheory Fitness,BRN,Retail,Gyms & Health Clubs,,orangetheory fitness
2,2019-06-03,zzw-222@62j-sgj-q2k,6,859.0,Orangetheory Fitness,42.350592,-71.153024,orangetheory fitness,67926.0,Orangetheory Fitness,BRN,Retail,Gyms & Health Clubs,,orangetheory fitness
3,2019-06-04,zzw-222@62j-sgj-q2k,6,30.0,Orangetheory Fitness,42.350592,-71.153024,orangetheory fitness,67926.0,Orangetheory Fitness,BRN,Retail,Gyms & Health Clubs,,orangetheory fitness
4,2019-06-05,zzw-222@62j-sgj-q2k,8,193.0,Orangetheory Fitness,42.350592,-71.153024,orangetheory fitness,67926.0,Orangetheory Fitness,BRN,Retail,Gyms & Health Clubs,,orangetheory fitness


In [8]:
# Keep only the visit rows that found a catalog entry (non-missing ID).
brand_catalog_visits = brand_catalog_visits.loc[brand_catalog_visits['ID'].notna()]

In [9]:
# Distinct catalog names that survived the ID filter; dropna=False matches the
# len(unique()) way of counting.
print(
    'Unique Brands having visitation data and Valid ID from catalog: ',
    brand_catalog_visits['Name_Standard'].nunique(dropna=False),
)

Unique Brands having visitation data and Valid ID from catalog:  435


### Getting the social data

In [10]:
# Load the per-day social review counts.
social_data = pd.read_csv('data/social.csv')
# Parse the whole column in one vectorised call instead of one strptime per row.
# .dt.date keeps the values as datetime.date objects, as before; a missing date
# becomes NaT rather than raising TypeError, while malformed strings still raise.
social_data['date'] = pd.to_datetime(social_data['date'], format='%Y-%m-%d').dt.date
social_data.head()

Unnamed: 0,id,num_review_fb,num_review_ig,num_review_tw,date,brand
0,9,5286.0,13737.0,18.0,2019-01-01,2Pac
1,11,1526.0,2559.0,210.0,2019-01-01,30 Seconds To Mars
2,12,20.0,604.0,17.0,2019-01-01,311 (Band)
3,23,1.0,1.0,0.0,2019-01-01,6ixth Sense
4,26,50.0,507.0,0.0,2019-01-01,8Ball & MJG


Get the unique IDs of all brands that have visitation data and a corresponding ID in the brands catalog.

In [11]:
# The ID column is float after the left join, so the distinct values are cast
# to int64 before being collected into a list.
brand_catalog_visits_unique_id = list(
    brand_catalog_visits['ID'].unique().astype(np.int64)
)

Keep only the social data for brands that have both visitation data and a catalog ID.

In [12]:
# Restrict the social rows to the IDs collected from the matched visit data.
social_data = social_data.loc[social_data['id'].isin(brand_catalog_visits_unique_id)]

In [13]:
# Distinct brand IDs left in the social data; dropna=False matches the
# len(unique()) way of counting.
print(
    'Unique brands having social data, visitation data and Valid ID from catalog: ',
    social_data['id'].nunique(dropna=False),
)

Unique brands having social data, visitation data and Valid ID from catalog:  427


In [14]:
# Number of social-data rows remaining after the ID filter.
social_data.shape[0]

140288

Save the key DataFrames as pickle files so they can be reloaded later.

In [15]:
# Persist the top-brands table; the context manager closes the file on exit.
with open('top_brands.pickle', mode='wb') as pickle_out:
    pickle.dump(top_brands, pickle_out)

In [16]:
# Persist the visit rows that carry a catalog ID; the file is closed on exit.
with open('brand_catalog_visits.pickle', mode='wb') as pickle_out:
    pickle.dump(brand_catalog_visits, pickle_out)

In [17]:
# Persist the filtered social data; the file is closed on exit.
with open('social_data.pickle', mode='wb') as pickle_out:
    pickle.dump(social_data, pickle_out)