In [1]:
import ast
import itertools
import json
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import sklearn
import tensorflow as tf
from sklearn.preprocessing import OrdinalEncoder

from utils.config import load_config

tf.config.run_functions_eagerly(True)

In [2]:
def save_pickle(filename, file):
    """Serialize an object to disk with pickle.

    Args:
        filename: Path of the file to write; opened in binary write mode,
            so an existing file is overwritten.
        file: The Python object to serialize (despite the name, this is the
            payload, not a file handle).
    """
    # The context manager closes the handle even when pickle.dump raises,
    # which the previous open()/close() pair did not guarantee.
    with open(filename, "wb") as filehandler:
        pickle.dump(file, filehandler)

def load_pickle(filename):
    """Load and return the object stored in a pickle file.

    Args:
        filename: Path of the pickle file to read (opened in binary mode).

    Returns:
        The deserialized Python object.

    Note:
        Unpickling can execute arbitrary code; only load files from a
        trusted source.
    """
    # The context manager closes the handle even when pickle.load raises.
    with open(filename, 'rb') as file:
        return pickle.load(file)

In [3]:
# Read the CSV at `data_path` into `df`, then display its column labels.
data_path = "../problem_merged_data.csv"
df = pd.read_csv(data_path)
df.keys()

Index(['group', 'item', 'channel', 'date', 'bid', 'budget', 'engagements',
       'page_views', 'clicks', 'active_days', 'media_spend', 'media_cpc',
       'cpe', 'headline', 'storySummary', 'IABCategory', 'targetGeo',
       'targetInterest', 'targetAge', 'targetOs', 'targetDevices',
       'targetGender', 'targetLanguages', 'CATEGORY_1'],
      dtype='object')

In [4]:
# Print the headline and story summary of the row at position 200 as a sample.
print(df["headline"].iloc[200], df["storySummary"].iloc[200])

Access Our AFib Guide Access Cleveland Clinic's free Afib guide to learn about treatment options from the nation's #1 heart program. Get answers today.


### Check Target Geo + Interest

In [5]:
# check if len total == len list
# if there is overlap they will not be equal (!) - an item that appears in multiple places
list_geotype = list(df["targetGeo"])

# GeoTypes + Names
# GEO TYPES: {'COUNTRY', 'CITY', 'BUCKET', 'DMA', 'POSTALCODE', 'REGION'}
def get_geo_types(elem):
    """Return the distinct ``geoType`` values found in one ``targetGeo`` cell.

    The cell is treated as a ``;``-separated sequence of serialized dict
    records. Chunks of length <= 1 (such as the empty tail left by a trailing
    ``;``) are skipped.

    Args:
        elem: Raw ``targetGeo`` string.

    Returns:
        list: The unique ``geoType`` values, in arbitrary (set) order.

    Raises:
        KeyError: If a record has no ``geoType`` key.
        ValueError, SyntaxError: If a chunk is neither valid JSON nor a
            Python literal.
    """
    geo_types = set()

    for chunk in elem.split(';'):
        if len(chunk) <= 1:
            continue
        # Parse the record as data rather than eval()-ing it: eval would run
        # arbitrary code coming from the CSV and fails on JSON literals such
        # as true/false/null. literal_eval keeps Python-repr style records
        # (single quotes) working.
        try:
            record = json.loads(chunk)
        except json.JSONDecodeError:
            record = ast.literal_eval(chunk)
        geo_types.add(record['geoType'])

    return list(geo_types)

In [6]:
# Flatten the per-row geo types into a single list, then show the distinct values.
_list_all_geotypes = [
    geo_type
    for elem in list_geotype
    for geo_type in get_geo_types(elem)
]

print(set(_list_all_geotypes))

{'COUNTRY', 'DMA', 'BUCKET', 'POSTALCODE', 'CITY', 'REGION'}


In [7]:
# Geo target types that occur in the `targetGeo` records.
TYPE_KEYS = ['COUNTRY', 'CITY', 'BUCKET', 'DMA', 'POSTALCODE', 'REGION']

# Name field for each geo type, index-aligned with TYPE_KEYS
# ('regionName' appears twice because both DMA and REGION map to it).
_NAME_KEYS = ['countryName', 'cityName', 'bucketName', 'regionName', 'postalCodeName', 'regionName']

# geoType -> key holding the human-readable name for that type.
DICT_TYPE_KEYS = dict(zip(TYPE_KEYS, _NAME_KEYS))

# Name fields collected by get_geo_names, in the order they are rendered as text.
INFO_NAMES = ['postalCodeName',  'cityName', 'regionName', 'bucketName', 'countryName']

In [8]:
def get_geo_names(elem, info_names=None):
    """Collect the name fields of every record in one ``targetGeo`` cell.

    The cell is treated as a ``;``-separated sequence of serialized dict
    records. Chunks of length <= 1 (such as the empty tail left by a trailing
    ``;``) are skipped.

    Args:
        elem: Raw ``targetGeo`` string.
        info_names: Keys to collect, in output order. Defaults to the
            module-level ``INFO_NAMES``.

    Returns:
        dict: One entry per key in ``info_names`` (same order), mapping to the
        list of values found for that key across all records, in record
        order. Keys that never occur map to an empty list.

    Raises:
        ValueError, SyntaxError: If a chunk is neither valid JSON nor a
            Python literal.
    """
    if info_names is None:
        info_names = INFO_NAMES

    names_by_key = {key: [] for key in info_names}

    for chunk in elem.split(';'):
        if len(chunk) <= 1:
            continue
        # Parse the record as data rather than eval()-ing it: eval would run
        # arbitrary code coming from the CSV and fails on JSON literals such
        # as true/false/null. literal_eval keeps Python-repr style records
        # (single quotes) working.
        try:
            record = json.loads(chunk)
        except json.JSONDecodeError:
            record = ast.literal_eval(chunk)

        for key in info_names:
            if key in record:
                names_by_key[key].append(record[key])

    return names_by_key
    

def gen_text_geo(elem):
    """Render one ``targetGeo`` cell as a lowercase, space-separated phrase.

    For every name field that has at least one value, the field label (the
    key with ``"Name"`` removed, e.g. ``regionName`` -> ``region``) is emitted
    followed by all of its values, e.g.
    ``"region illinois indiana country united states united states"``.

    Args:
        elem: Raw ``targetGeo`` string, as accepted by ``get_geo_names``.

    Returns:
        str: The lowercased phrase; an empty string when no names are found.
    """
    words = []

    for key, names in get_geo_names(elem).items():
        if not names:
            continue
        words.append(key.replace("Name", ""))
        words.extend(f"{name}" for name in names)

    return " ".join(words).lower()

In [9]:
# Preview the generated geo text for two entries near the end of list_geotype.
for idx, elem in enumerate(list_geotype[-100:-98]):
    print(idx, gen_text_geo(elem))

0 region illinois indiana iowa kentucky michigan minnesota missouri nebraska north dakota ohio south dakota wisconsin country united states united states united states united states united states united states united states united states united states united states united states united states
1 region illinois indiana iowa kentucky michigan minnesota missouri nebraska north dakota ohio south dakota wisconsin country united states united states united states united states united states united states united states united states united states united states united states united states


In [10]:
# Do all rows have descriptions? Missing cells are replaced by the "UNK" sentinel.
# fillna returns a new frame, so `df` itself is left untouched.
df_filt = df.fillna(value="UNK")

# Peek at one raw targetInterest cell.
temp = df_filt["targetInterest"].iloc[100]
temp

'{"channelId":"YAHOO","channelName":"Yahoo","interestId":"oath3p:52948934","interestName":"Hypertension Propensity - Reach Tier 1 [3rd Party Data > Adstra (ALC) > Syndicated > Health & Wellness > Disease Propensity by Type/Rx Use]","interestDescription":"Individuals likely to have a Cardiovascular condition, such as Hypertension, that is treated with a Prescription/Rx medication, who fall into Reach Tier 1.","interestType":"oath3p"};'

In [11]:
def gen_target_interest(elem):
    """Build the interest-description phrase for one ``targetInterest`` cell.

    The cell is treated as a ``;``-separated sequence of serialized dict
    records. Only the first chunk longer than one character is used; any
    further records in the cell are ignored.

    Args:
        elem: Raw ``targetInterest`` string, or the ``'UNK'`` sentinel for a
            missing value.

    Returns:
        str: ``"interest description: <interestDescription>"`` for the first
        record, or ``"unknown interest description"`` when ``elem`` is
        ``'UNK'`` or contains no record at all.

    Raises:
        KeyError: If the first record has no ``interestDescription`` key.
        ValueError, SyntaxError: If the first record is neither valid JSON
            nor a Python literal.
    """
    unknown = "unknown interest description"

    if elem == 'UNK':
        return unknown

    for chunk in elem.split(';'):
        if len(chunk) <= 1:
            continue
        # Parse the record as data rather than eval()-ing it: eval would run
        # arbitrary code coming from the CSV, and the old ":true}" patch only
        # covered booleans at the very end of an object.
        try:
            record = json.loads(chunk)
        except json.JSONDecodeError:
            # Fallback for Python-repr style records; keep the historical
            # true/false patch so previously accepted input still parses.
            patched = chunk.replace(":true}", ":True}").replace(":false}", ":False}")
            record = ast.literal_eval(patched)
        # NOTE(review): only the first record is described, as before —
        # confirm that ignoring the remaining interests is intended.
        return "interest description: " + record["interestDescription"]

    # No record found (e.g. empty string or just ";"): previously this fell
    # through and returned None, which breaks downstream text cleaning.
    return unknown

In [12]:
# Run gen_target_interest on the sample cell stored in `temp` and display the result.
_temp_text = gen_target_interest(temp)
_temp_text

'interest description: Individuals likely to have a Cardiovascular condition, such as Hypertension, that is treated with a Prescription/Rx medication, who fall into Reach Tier 1.'

#### Vocabulary

In [13]:
import re
from nltk.corpus import stopwords


def remove_special_characters(text):
    """Strip every character that is not an ASCII letter, a digit or whitespace.

    Args:
        text: Input string.

    Returns:
        str: ``text`` with all other characters (punctuation, symbols,
        non-ASCII letters) removed.
    """
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)

def convert_to_lowercase(text):
    """Return ``text`` converted to lowercase.

    Args:
        text: Input string.

    Returns:
        str: The lowercased string.
    """
    return text.lower()

def clean_all(text):
    """Normalize text: drop special characters, then lowercase.

    Args:
        text: Input string.

    Returns:
        str: The result of ``remove_special_characters`` followed by
        ``convert_to_lowercase``.
    """
    return convert_to_lowercase(remove_special_characters(text))

In [14]:
# Build the vocabulary from every text source: generated geo text, generated
# interest text, and the raw headline / story summary columns.
_vocab_texts = itertools.chain(
    map(gen_text_geo, df_filt["targetGeo"]),
    map(gen_target_interest, df_filt["targetInterest"]),
    df_filt["headline"],
    df_filt["storySummary"],
)

_vocab_tokens = set()
for text in _vocab_texts:
    _vocab_tokens.update(clean_all(text).split(" "))

# Splitting on a single space yields "" for consecutive/edge spaces. Remove it
# explicitly instead of slicing off the first sorted entry, which would drop a
# real token whenever "" happens not to be present.
_vocab_tokens.discard("")

_list_vocab = sorted(_vocab_tokens)

print(len(_list_vocab))

1110


In [15]:
# Write the vocabulary, one token per line. The tokens are de-duplicated and
# sorted so the file order is stable across runs: iterating a bare set of
# strings is not reproducible between interpreter sessions.
_list_vocab = sorted(set(_list_vocab))

# Make sure the output directory exists so a fresh checkout can run this cell.
os.makedirs('aux_data', exist_ok=True)
with open('aux_data/vocab_file.txt', 'w') as f:
    f.writelines(f"{token}\n" for token in _list_vocab)
