In [29]:
import pandas as pd
import numpy as np
import dateutil.parser
import json
import re
import string 
import math
%matplotlib inline

In [30]:
# Load master data
# Read the Facebook ads CSV export into the working DataFrame `df`.
path = "~/Documents/thesis/data/csv/FacebookAds.csv"
df = pd.read_csv(filepath_or_buffer=path)

## Scrub df of missing or unwanted values

In [31]:
# Load data into pd.df
# (re-read here so this cell can be re-run on its own and always starts from the raw file)
path = "~/Documents/thesis/data/csv/FacebookAds.csv"
df = pd.read_csv(path)

# Remove rows with no AdText or no CreationDate value
df = df.dropna(subset=['AdText', 'CreationDate'])

# Remove immediately unwanted columns
# Either too unavailable or not useful
df = df.drop(columns=[
    'EndDate', 'Behaviors', 'PeopleWhoMatch', 'Placements', 'pages',
    'FriendsOfConnections', 'ExcludedConnections', 'Gender', 'Generation',
    'Politics', 'CustomAudience', 'SourceFile', 'SourceZip', 'Language'
])

# Reset index after dropping rows so labels run 0..n-1 again
df = df.reset_index(drop=True)

# Fix float columns: a missing value is recorded as 0.0
integer_cols = ['Clicks', 'Impressions', 'AdSpend']
for col in integer_cols:
    df[col] = df[col].fillna(0.0)

# Fix AdText column: strip URLs and a fixed set of boilerplate phrases
_URL_PATTERN = re.compile(r'http\S+')
_AD_TEXT_BOILERPLATE = [
    '?????? ??? ????? ? ??????????',
    'Subscribe to our channel:',
    'Follow my Facebook:',
    'Follow me on Instagram:',
    'Follow me on Twitter:',
]


def _clean_ad_text(text):
    """Return `text` with URLs and the known boilerplate phrases removed."""
    cleaned = _URL_PATTERN.sub('', text)
    for phrase in _AD_TEXT_BOILERPLATE:
        cleaned = cleaned.replace(phrase, '')
    return cleaned


# Series.map keeps the existing index, so this does not depend on the
# index labels matching row positions.
df['AdText'] = df['AdText'].map(_clean_ad_text)

# Fix some string cols: cast to str and label missing values 'unavailable'
# (astype(str) renders NaN as the literal string 'nan')
string_cols = ['LandingPage', 'Location', 'Interests', 'AdSpendCurrency']
for string_col in string_cols:
    df[string_col] = df[string_col].astype(str).replace('nan', 'unavailable')

# No second AdText null filter is needed here: null rows were dropped above
# and _clean_ad_text always returns a string.

## Create new columns for better covariates

In [32]:
# AccountGroup
# Look up each ad's group in group_keys.json, keyed by the AdID rendered as a
# string; ads without an entry are labelled 'Unavailable'.
with open('/Users/drewnleonard/Documents/thesis/data/json/group_keys.json') as f:
    group_keys = json.load(f)

AccountGroup = [
    group_keys.get(str(ad_id), 'Unavailable')
    for ad_id in df['AdID']
]
df['AccountGroup'] = AccountGroup

In [33]:
# CreationDateFormatted
# Parse each raw CreationDate into a datetime.date. Values that cannot be
# parsed fall back to a fixed placeholder date, also stored as a
# datetime.date so the column holds a single type.
_FALLBACK_CREATION_DATE = dateutil.parser.parse('2015-10-31').date()

CreationDateFormatted = []
for CreationDate in df['CreationDate']:
    try:
        # The last 7 characters are dropped before parsing
        # (presumably a trailing timezone suffix — TODO confirm against the raw data).
        new_date = dateutil.parser.parse(CreationDate[:-7]).date()
    except (TypeError, ValueError, OverflowError):
        # TypeError: value is not sliceable/parsable text;
        # ValueError/OverflowError: dateutil could not interpret the string.
        new_date = _FALLBACK_CREATION_DATE
    CreationDateFormatted.append(new_date)
df['CreationDateFormatted'] = CreationDateFormatted

In [34]:
# CreationDateInteger
# Turn each formatted creation date into an integer by taking the text before
# the first whitespace and removing the dashes (e.g. '2015-10-31' -> 20151031).
CreationDateInteger = [
    int(str(creation_date).split()[0].replace('-', ''))
    for creation_date in df['CreationDateFormatted']
]
df['CreationDateInteger'] = CreationDateInteger

In [35]:
# AgeAverage
# Average of every integer found in the Age string (e.g. '18 - 34' -> 26.0).
# Rows whose Age is missing or contains no digits get NaN instead of
# aborting the whole cell.
AgeAverage = []
for age_string in df['Age']:
    try:
        ages = [float(token) for token in re.findall(r'\d+', age_string)]
    except TypeError:
        # Age is not text (e.g. a NaN float from a missing CSV cell)
        ages = []
    if ages:
        AgeAverage.append(sum(ages) / float(len(ages)))
    else:
        AgeAverage.append(float('nan'))
df['AgeAverage'] = AgeAverage

In [36]:
# AgeAverageBin
# Bucket AgeAverage on quantile edges, dropping duplicate edges.
# NOTE(review): 4 quantiles are requested but only 3 labels are supplied, so
# this presumably relies on exactly one duplicate edge being dropped for this
# data set — confirm before reusing on other data.
age_bin_labels = ["LowAge", "MidAge", "HighAge"]
df['AgeAverageBin'] = pd.qcut(
    df['AgeAverage'],
    q=4,
    labels=age_bin_labels,
    duplicates='drop',
)

In [37]:
# AdSpendBin
# Split AdSpend into three quantile-based buckets.
ad_spend_labels = ["low", "mid", "high"]
df['AdSpendBin'] = pd.qcut(df['AdSpend'], q=3, labels=ad_spend_labels)

In [38]:
# ClicksBin
# Split Clicks into three quantile-based buckets.
clicks_labels = ["low", "mid", "high"]
df['ClicksBin'] = pd.qcut(df['Clicks'], q=3, labels=clicks_labels)

In [39]:
# ImpressionsBin
# Split Impressions into three quantile-based buckets.
impressions_labels = ["low", "mid", "high"]
df["ImpressionsBin"] = pd.qcut(df['Impressions'], q=3, labels=impressions_labels)

In [40]:
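# For each InterestSet in Advertisements:
#   For each InterestGroup and its InterestKeywords:
#     If an InterestKeyword occurs in the InterestSet, count one hit for that InterestGroup
#   Label the InterestSet with the InterestGroup that has the most hits ('mixed' if none)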

In [41]:
# Collapse each ad's free-text Interests value into a single topic label.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
interests_path = "/Users/drewnleonard/Documents/thesis/data/json/interest_groups_gold.json"
with open(interests_path) as f:
    # Iterated below as topic -> list of keywords
    interest_keywords = json.load(f)


def _dominant_topic(interest_text, keywords_by_topic):
    """Return the topic with the most keyword hits in `interest_text`.

    A keyword counts as a hit when it occurs as a substring of
    `interest_text`. Returns None when no keyword of any topic matches.
    Ties go to the first topic reached in iteration order.
    """
    hits = {}
    # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only)
    for topic, keywords in keywords_by_topic.items():
        count = sum(1 for keyword in keywords if keyword in interest_text)
        if count:
            hits[topic] = count
    if not hits:
        return None
    return max(hits, key=hits.get)


# Map every distinct (lower-cased) interest string to its dominant topic
interest_map = {}
for unique_interest in df['Interests'].unique():
    unique_interest = unique_interest.lower()
    topic = _dominant_topic(unique_interest, interest_keywords)
    if topic is not None:
        interest_map[unique_interest] = topic

# Replace the raw interest text with its topic; anything unmatched is 'mixed'.
# Assigning a list is positional, so this does not depend on the index labels.
df['Interests'] = [
    interest_map.get(interest.lower(), 'mixed')
    for interest in df['Interests']
]


In [42]:
# Write the cleaned, feature-augmented frame to CSV without the index column.
df.to_csv(path_or_buf='~/Documents/thesis/data/csv/fb_gold.csv', index=False)

In [None]:
# Preview the first 100 rows of the final frame.
df.iloc[:100]