In [1]:
import pandas as pd
import numpy as np
import os
import sys
from io import StringIO
import csv

parent = os.path.dirname(os.getcwd())  # workaround to import from utils/
sys.path.append(parent)

from utils.db_utils import df_from_snowflake

In [8]:
# SQL template for the HeroDB keyword download. The `{market}` placeholder is
# filled in with str.format() by load_herodb() before the query is sent.
#   * `amenity` CTE: sums google_search_volume per keyword for the given market
#     over the fixed year_month window written in the WHERE clause.
#   * `master` CTE: keyword master rows for the given market that are neither
#     blacklisted nor amenity-blacklisted.
#   * Final select: inner join on keyword, so only master keywords that have
#     search-volume rows in the window are returned.
# NOTE(review): BETWEEN is inclusive on both ends; if year_month holds
# first-of-month dates, '2022-11-01'..'2023-11-01' covers 13 monthly rows while
# the alias says sv_last12m -- confirm the intended window.
# NOTE(review): `{market}` is substituted as plain text (no bind parameter), so
# only trusted market codes should be passed in.
herodb_query="""
WITH
    amenity as (
        SELECT
             keyword
            , sum(google_search_volume) as sv_last12m
       FROM STAGE.reporting_marketing.hero_db_amenities_search_volume_monthly
       WHERE TRUE
         --AND year_month >= dateadd(month, -12, (select max(year_month) from STAGE.reporting_marketing.hero_db_amenities_search_volume_monthly))
         AND year_month BETWEEN '2022-11-01' AND '2023-11-01'
         AND MARKET = '{market}'
       GROUP BY 1
   ),
    master as (
        SELECT
            object_type
            , template
            , keyword
            , object_type_keyword
            , location_id
            , location_name
            , word_form
            , amenity_keyword
            , prefix
        FROM STAGE.derived_marketing.hero_db_keywords_master
        WHERE
            TRUE
            AND is_blacklisted = 'False'
            AND is_blacklisted_amenity = 'False'
            AND MARKET='{market}'
    )
select
    master.*
,   amenity.sv_last12m
from master
inner join amenity using (keyword)
where
    TRUE
;
"""


In [9]:
#Load SEM files
def load_sem(market:str, data_dir:str='/Users/datnguyen/Downloads'):
    """Load the SEM search-term export for one market, aggregated per search term.

    Reads ``<data_dir>/sem_<market>.csv`` (UTF-16, tab-delimited, first two
    rows skipped), converts the percentage and currency text columns to floats,
    lower-cases and strips the search term, and collapses the campaign-level
    rows into one row per search term.

    Parameters
    ----------
    market : str
        Market code used in the file name, e.g. ``'DE'``.
    data_dir : str, optional
        Folder that contains the export. Defaults to the location that was
        previously hard-coded, so existing calls behave as before.

    Returns
    -------
    pd.DataFrame
        Columns ``Search term``, ``Impr.``, ``Clicks``, ``CR``, ``ROAS``,
        ``Bookings``, ``Revenue``. Counts and revenue are summed; ``CR`` and
        ``ROAS`` are fractions (percent / 100) averaged over the rows that have
        a value.

    Raises
    ------
    ValueError
        If a numeric column contains text that cannot be converted.
    """
    sem_path = os.path.join(data_dir, 'sem_' + market + '.csv')
    df = pd.read_csv(sem_path,
                     delimiter='\t',
                     encoding='utf-16',
                     #nrows=10000,
                     skiprows=2,
                     decimal='.',
                     thousands=',',
                     encoding_errors='ignore',
                     on_bad_lines='skip',
                     na_values=' --',
                     usecols=['Search term', 'Impr.', 'Clicks', 'CR', 'ROAS', 'Bookings', 'Revenue'],
                     dtype={'Search term': str,
                            'Impr.': int,
                            'Clicks': int,
                            'CR': str,
                            'ROAS': str,
                            'Bookings': float,
                            'Revenue': str}
                     )
    print('Search terms of SEM data of ' + market + ': ' + str(len(df)))

    # ' --' placeholders arrive as NaN. Only the summed numeric columns are
    # zero-filled; filling the text columns with 0 would mix ints into string
    # columns, and the .str accessor turns such values back into NaN anyway.
    df['Bookings'] = df['Bookings'].fillna(0)

    # Percent columns: '12.5%' -> 0.125. `thousands=','` is not applied to
    # columns read as text, so separators are stripped here as well.
    for pct_col in ('CR', 'ROAS'):
        pct_text = (df[pct_col]
                    .str.replace('%', '', regex=False)
                    .str.replace(',', '', regex=False))
        df[pct_col] = pct_text.astype(float) / 100

    # Revenue: drop the currency sign (the real euro sign and its mis-decoded
    # form) and thousands separators; missing revenue counts as 0 in the sum.
    revenue_text = df['Revenue']
    for token in ('€', 'â‚¬', ','):
        revenue_text = revenue_text.str.replace(token, '', regex=False)
    df['Revenue'] = revenue_text.astype(float).fillna(0)

    df['Search term'] = df['Search term'].str.lower().str.strip()   #simple transform before matching

    #Groupby Search term because records were on Campaign level
    # NOTE(review): CR and ROAS are unweighted means over campaign rows, so a
    # low-volume campaign weighs as much as a high-volume one -- confirm this is
    # intended rather than a ratio of the summed columns.
    df_agg = (df.groupby(['Search term'])
                .agg({'Impr.': 'sum',
                      'Clicks': 'sum',
                      'CR': 'mean',
                      'ROAS': 'mean',
                      'Bookings': 'sum',
                      'Revenue': 'sum'})
                .reset_index())
    print('Search terms of SEM after deduplication: ' + str(len(df_agg)))
    return df_agg


#Load HeroDB
def load_herodb(market:str):
    """Download the HeroDB keyword rows for one market.

    Fills the market into the `herodb_query` template, runs it through
    `df_from_snowflake`, and lower-cases/strips the KEYWORD column so it can be
    matched against the SEM search terms. Prints the number of rows fetched.
    """
    market_sql = herodb_query.format(market=market)
    keywords_df = df_from_snowflake(query=market_sql)

    #simple transform before matching
    normalized_keyword = keywords_df['KEYWORD'].str.lower().str.strip()
    keywords_df['KEYWORD'] = normalized_keyword

    row_count = len(keywords_df)
    print('Keywords from HeroDB download is: ' + str(row_count))
    return keywords_df
       

def merge_df(df_herodb:pd.DataFrame, df_sem:pd.DataFrame):
    """Outer-join HeroDB keywords with SEM search terms.

    Rows are matched where HeroDB's KEYWORD equals SEM's 'Search term';
    unmatched rows from either side are kept with NaN in the other side's
    columns.
    """
    join_options = {
        'how': 'outer',
        'left_on': 'KEYWORD',
        'right_on': 'Search term',
    }
    return df_herodb.merge(df_sem, **join_options)


def write_df(df:pd.DataFrame, market:str, data_dir:str='/Users/datnguyen/Downloads'):
    """Write the matched frame to ``<data_dir>/herodb_sem_matching_<market>.csv``.

    The file is tab-separated, UTF-8 encoded and includes the DataFrame index
    as its first column. Prints the number of rows written.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to export.
    market : str
        Market code used in the output file name, e.g. ``'DE'``.
    data_dir : str, optional
        Output folder. Defaults to the location that was previously
        hard-coded, so existing calls behave as before.
    """
    out_path = os.path.join(data_dir, 'herodb_sem_matching_' + market + '.csv')
    df.to_csv(out_path,
              sep='\t',
              encoding='utf-8'
              )
    print(str(len(df)) + ' records have been written')

def make_file(market):
    """Run the full matching pipeline for one market and return the result.

    Loads the SEM export, downloads the HeroDB keywords, outer-joins the two,
    writes the joined frame to disk and hands it back to the caller.
    """
    # SEM is loaded first, then HeroDB, so the progress prints keep their order.
    sem_terms = load_sem(market=market)
    herodb_keywords = load_herodb(market=market)

    combined = merge_df(df_herodb=herodb_keywords, df_sem=sem_terms)
    write_df(df=combined, market=market)
    return combined



In [10]:
#DE
merged_DE = make_file(market='DE')



Search terms of SEM data of DE: 10181274
Search terms of SEM after deduplication: 3268297
Keywords from HeroDB download is: 32424


  values = values.astype(str)


3277963 records have been written
HeroDB Keywords that matched: 9623


In [11]:
# Rows where both sides of the outer join are present, i.e. the keyword was found in SEM.
print('HeroDB Keywords that matched: ' +
      str(len(merged_DE.dropna(subset=['Search term', 'KEYWORD']))))

HeroDB Keywords that matched: 22801


In [13]:
merged_DE[merged_DE['KEYWORD']=='ferienhaus kroatien mit pool']

Unnamed: 0,KEYWORD,OBJECT_TYPE,TEMPLATE,OBJECT_TYPE_KEYWORD,LOCATION_ID,LOCATION_NAME,WORD_FORM,AMENITY_KEYWORD,PREFIX,SV_LAST12M,Search term,Impr.,Clicks,CR,ROAS,Bookings,Revenue
24259,ferienhaus kroatien mit pool,Amenities KWs DE,[accommodation kw] [location kw] [prefix] [ame...,ferienhaus,5460aeaaa3139,kroatien,singular,pool,mit,284400.0,ferienhaus kroatien mit pool,84082.0,19710.0,2.885133,1.008281,129.74,32991.62


In [15]:
#US
merged_US = make_file(market='US')


Search terms of SEM data of US: 17007575
Search terms of SEM after deduplication: 5605498
Keywords from HeroDB download is: 10946


  values = values.astype(str)


5613775 records have been written


In [16]:
# Rows where both sides of the outer join are present, i.e. the keyword was found in SEM.
print('HeroDB Keywords that matched: ' +
      str(len(merged_US.dropna(subset=['Search term', 'KEYWORD']))))

HeroDB Keywords that matched: 2689


In [17]:
merged_US[merged_US['KEYWORD']=='cabin pet friendly pigeon forge']

Unnamed: 0,KEYWORD,OBJECT_TYPE,TEMPLATE,OBJECT_TYPE_KEYWORD,LOCATION_ID,LOCATION_NAME,WORD_FORM,AMENITY_KEYWORD,PREFIX,SV_LAST12M,Search term,Impr.,Clicks,CR,ROAS,Bookings,Revenue
3253,cabin pet friendly pigeon forge,Amenities KWs US,[accommodation kw] [amenity kw] [location kw],cabin,5460aea288f77,pigeon forge,singular,pet friendly,,126500.0,cabin pet friendly pigeon forge,5.0,0.0,0.0,0.0,0.0,0.0


# Quick look into matching results

In [None]:
#HeroDB keywords without SEM data
len(merged_DE[merged_DE['Search term'].isna()])

In [None]:
len(merged_DE)

In [None]:
merged_DE

In [None]:
merged_DE[(merged_DE['Search term']=="ferienhaus kroatien mit pool")]

In [None]:
herodb_de = load_herodb('DE')

In [None]:
herodb_de[herodb_de['KEYWORD']=='ferienhaus kroatien mit pool']

# Appendix

In [None]:
#An approach to re-encode the raw SEM export: read it as UTF-16 text, drop the
#two leading rows, and save the remaining rows as a tab-delimited UTF-8 file.
# newline='' lets the csv module handle line endings inside quoted fields itself;
# the `with` block closes the input even if parsing fails part-way through.
with open('/Users/datnguyen/Downloads/sem_de.csv', 'r', encoding='utf-16', newline='') as file:
    # The export is tab-delimited (load_sem reads it with delimiter='\t'); the
    # default comma delimiter would split rows on commas inside the values.
    csvreader = csv.reader(file, delimiter='\t')

    # ignore first 2 rows (a default avoids StopIteration on a shorter file)
    next(csvreader, None)
    next(csvreader, None)

    # store other rows
    rows = list(csvreader)

with open('/Users/datnguyen/Downloads/sem_de_test.csv', 'w', encoding='UTF-8', newline='') as f:
    writer = csv.writer(f, delimiter='\t')
    # write multiple rows
    writer.writerows(rows)