In [1]:
import pandas as pd
import numpy as np
import dateutil.parser
import json
import re
import string 
import math
from difflib import SequenceMatcher
from urlparse import urlparse
%matplotlib inline
from __future__ import division
import pickle

In [2]:
# Read the master ads CSV into a DataFrame
path = "~/Documents/thesis/data/csv/FacebookAds.csv"
df = pd.read_csv(filepath_or_buffer=path)

## Scrub df of missing or unwanted values

In [3]:
# Load data into pd.df (reloaded from disk so this cell can be re-run on its own)
path = "~/Documents/thesis/data/csv/FacebookAds.csv"
df = pd.read_csv(path)

# Remove rows with null AdText values
df = df[pd.notnull(df['AdText'])]

# Remove immediately unwanted columns
# Either too unavailable or not useful
df = df.drop(columns=[
    'EndDate', 'Behaviors', 'PeopleWhoMatch', 'Placements', 'pages',
    'FriendsOfConnections', 'ExcludedConnections', 'Gender', 'Generation',
    'Politics', 'CustomAudience', 'SourceFile', 'SourceZip', 'Language'
])

# Remove rows with no CreationDate value
df = df[pd.notnull(df['CreationDate'])]

# Reset index after dropping rows so the labels run 0..n-1 again
df = df.reset_index(drop=True)

# Fix float columns: a missing metric is recorded as 0.0
integer_cols = ['Clicks','Impressions','AdSpend']
df[integer_cols] = df[integer_cols].fillna(0.0)

# Fix AdText column: strip URLs and a fixed set of boilerplate phrases
ad_text_boilerplate = [
    '?????? ??? ????? ? ??????????',
    'Subscribe to our channel:',
    'Follow my Facebook:',
    'Follow me on Instagram:',
    'Follow me on Twitter:',
]


def _clean_ad_text(text):
    """Return `text` with every 'http...' token and boilerplate phrase removed."""
    text = re.sub(r'http\S+', '', text)
    for phrase in ad_text_boilerplate:
        text = text.replace(phrase, '')
    return text


# map() writes by row alignment, so it does not depend on the index labels
df['AdText'] = df['AdText'].map(_clean_ad_text)

# Fix some string cols: missing values become the sentinel 'unavailable'
string_cols = ['LandingPage','Location','Interests','AdSpendCurrency']
for string_col in string_cols:
    df[string_col] = df[string_col].fillna('unavailable').astype(str)

# No second null-AdText filter is needed here: _clean_ad_text returns a
# string for every remaining row, so none of them can be null.

## Create new columns for better covariates

In [4]:
# Image number 1..n (N images)

# Locations of the two pickled lookup tables:
#   ad id   -> file id
#   file id -> ad number
ad_id_file_id_map_path = '/Users/drewnleonard/Documents/thesis/data/pickle/ad_id_file_id_map_pruned.pickle'
file_id_ad_n_map_path = '/Users/drewnleonard/Documents/thesis/data/pickle/ad_id_map_pruned.pickle'

# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(ad_id_file_id_map_path, 'rb') as f:
    ad_id_file_id_map = pickle.load(f)

with open(file_id_ad_n_map_path, 'rb') as f:
    file_id_ad_n_map = pickle.load(f)

# Resolve every ad to its ad number via the two-step lookup
ad_numbers = []
for raw_ad_id in df['AdID']:
    lookup_key = str(raw_ad_id)

    if lookup_key in ad_id_file_id_map:
        # ad id -> file id -> ad number (stored as an integer)
        mapped_file_id = ad_id_file_id_map[lookup_key]
        ad_numbers.append(int(file_id_ad_n_map[mapped_file_id]))
    else:
        # The ad id is absent from the ad id -> file id map
        ad_numbers.append('Unavailable')

df['survey_number'] = ad_numbers

In [5]:
# AccountGroup: look up each ad id in the group-key JSON
with open('/Users/drewnleonard/Documents/thesis/data/json/group_keys.json') as f:
    group_keys = json.load(f)

# Ads whose (stringified) id has no entry are labelled 'Unavailable'
AccountGroup = [
    group_keys.get(str(raw_ad_id), 'Unavailable')
    for raw_ad_id in df['AdID']
]
df['AccountGroup'] = AccountGroup

In [6]:
# AccountGroup from landing pages
# URL paths that are skipped when inferring an account group from LandingPage
exclude_landing_pages = [
    '/',
    '/10718-take-part-in-black-pride-survey/',
    '/16383-st-louis-killer-cop-caught-in-on-camera/',
    '/4678-police-says-run-them-over-and-goes-scot-free/',
    '/6189-black-woman-found-dead-in-jail-cell-after-arguing-with-detention-officers/',
    '/6411-texas-police-officer-found-not-guilty-for-killing-a-black-woman/',
    '/6948-officers-violently-beat-and-arrest-teen-just-for-asking-question/',
    '/6980-baltimore-cop-drag-black-teen-from-home-without-warrant/',
    '/6984-man-set-free-after-10-years-in-prison-because-police-Iied-in-court/',
    '/7167-blm-members-arrested-for-counter-protest-against-white-power-rally-in-georgic/',
    '/7474-black-families-embark-on-homeschooling-because-of-racial-bias-and-safety-concerns/',
    '/7660-utah-school-defends-white-teachers-use-of-the-n-word-in-class/',
    '/7868-officer-puts-his-gun-he-used-in-killing-black-teen-on-auction/',
    '/8347-ohio-authorities-keep-their-eyes-shut-at-kkk-style-death/',
    '/9599-orlando-black-victims-need-support/',
    '/AdTargeting',
    '/events/1140553372667995/',
    '/events/1486230091674577/',
    '/events/I748220542079708/',
    '/us-news/2015/nov/19/syrian-refugees-in-america-fact-from-fiction-congress',
]

def similar(a, b):
    """Return the difflib SequenceMatcher ratio for sequences `a` and `b`."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

# Candidate group names, built once. The loop below only ever overwrites
# 'Unavailable' entries with names taken from this collection, so rebuilding
# it for every row is unnecessary. Sorting makes the choice between equally
# similar groups reproducible from run to run.
candidate_groups = sorted(set(df['AccountGroup']))

for n, e in df.iterrows():

    # Only rows still lacking an account group need the landing-page fallback
    if e['AccountGroup'] != 'Unavailable':
        continue

    landing_path = urlparse(e['LandingPage']).path

    if landing_path in exclude_landing_pages:
        continue

    # Reduce the URL path to a comparable name: drop digits and slashes,
    # and turn hyphens into spaces
    landing_path = ''.join(ch for ch in landing_path if not ch.isdigit())
    landing_path = landing_path.replace('/', '')
    landing_path = landing_path.replace('-', ' ')

    # Keep the most similar group, provided its ratio exceeds 0.7;
    # each similarity is computed exactly once
    best_name = ''
    best_score = 0
    for group in candidate_groups:
        score = similar(group, landing_path)
        if score > 0.7 and score > best_score:
            best_name = group
            best_score = score

    if best_name != '':
        df.at[n, 'AccountGroup'] = best_name

In [7]:
# AccountGroup clustering
group_cluster_path = '/Users/drewnleonard/Documents/thesis/data/csv/group_clusters.csv'
group_cluster_df = pd.read_csv(group_cluster_path, names = ["g1_id", "g1_name", "g2_id", "g2_name"])

# Map each group name to its cluster label. Rows are processed in file order,
# so if a name appears in both columns the last occurrence wins.
cluster_map = {}
for g1_name, g2_name in zip(group_cluster_df['g1_name'], group_cluster_df['g2_name']):
    # A missing cell is read as NaN, which is truthy; test for null explicitly
    # so that NaN never becomes a key of the map.
    if pd.notnull(g1_name) and g1_name:
        cluster_map[g1_name] = "cluster_1"
    if pd.notnull(g2_name) and g2_name:
        cluster_map[g2_name] = "cluster_2"

# Look up the cluster of every ad's account group; groups that are not in
# the map are labelled 'Unavailable'
cluster_list = [
    cluster_map.get(account_group_name, 'Unavailable')
    for account_group_name in df['AccountGroup']
]

df['AccountGroupCluster'] = cluster_list

In [8]:
# CreationDateFormatted
# Date used when a CreationDate value cannot be parsed. It is a plain `date`
# so the column holds one type throughout.
# NOTE(review): why 2015-10-31 was chosen is not visible here -- confirm.
fallback_creation_date = dateutil.parser.parse('2015-10-31').date()

CreationDateFormatted = []
for CreationDate in df['CreationDate']:
    try:
        # Drop the last 7 characters before parsing, then keep only the date part
        new_date = dateutil.parser.parse(CreationDate[:-7]).date()
    except (ValueError, TypeError, OverflowError):
        # Unparseable or non-string value: fall back instead of failing
        new_date = fallback_creation_date
    CreationDateFormatted.append(new_date)
df['CreationDateFormatted'] = CreationDateFormatted

In [9]:
# CreationDateInteger: the date part of each value as a YYYYMMDD integer
CreationDateInteger = [
    int(str(formatted_date).split()[0].replace('-', ''))
    for formatted_date in df['CreationDateFormatted']
]
df['CreationDateInteger'] = CreationDateInteger

In [10]:
# AgeAverage: mean of all integers found in the Age string
AgeAverage = []
for age_string in df['Age']:
    # A null Age has no digits to extract
    digit_runs = [] if pd.isnull(age_string) else re.findall(r'\d+', age_string)
    age_values = [float(digits) for digits in digit_runs]

    if age_values:
        AgeAverage.append(sum(age_values) / float(len(age_values)))
    else:
        # No numbers found: record NaN rather than dividing by zero
        AgeAverage.append(float('nan'))
df['AgeAverage'] = AgeAverage

In [11]:
# AgeAverageBin: quantile-bin AgeAverage into labelled age bands
# NOTE(review): four quantiles are requested but only three labels are given;
# this presumably relies on duplicates='drop' removing exactly one repeated
# bin edge -- confirm against the pandas qcut documentation.
age_bin_labels = ["LowAge","MidAge","HighAge"]
df['AgeAverageBin'] = pd.qcut(df['AgeAverage'], q=4, labels=age_bin_labels, duplicates='drop')

In [12]:
# AdSpendBin: AdSpend cut into three quantile bins (low / mid / high)
ad_spend_bin_labels = ["low","mid","high"]
df['AdSpendBin'] = pd.qcut(df['AdSpend'], q=3, labels=ad_spend_bin_labels)

In [13]:
# ClicksBin: Clicks cut into three quantile bins (low / mid / high)
clicks_bin_labels = ["low","mid","high"]
df['ClicksBin'] = pd.qcut(df['Clicks'], q=3, labels=clicks_bin_labels)

In [14]:
# ImpressionsBin: Impressions cut into three quantile bins (low / mid / high)
impressions_bin_labels = ["low","mid","high"]
df["ImpressionsBin"] = pd.qcut(df['Impressions'], q=3, labels=impressions_bin_labels)

In [15]:
# Label interests
interests_path = "/Users/drewnleonard/Documents/thesis/data/json/interest_groups_gold.json"
with open(interests_path) as f:
    interest_keywords = json.load(f)

# Map every distinct (lower-cased) interest string to its dominant topic,
# i.e. the topic with the most keywords occurring in that string.
interest_map = {}

for unique_interest in df['Interests'].unique():

    # Put unique interest in lower case
    unique_interest = unique_interest.lower()

    # Count keyword hits per topic; .items() works on Python 2 and Python 3
    topic_hits = {}
    for topic, keywords in interest_keywords.items():
        for keyword in keywords:
            if keyword in unique_interest:
                topic_hits[topic] = topic_hits.get(topic, 0) + 1

    # Interests with no keyword hit stay out of the map.
    # On a tie, max() keeps the first topic in topic_hits iteration order.
    if topic_hits:
        interest_map[unique_interest] = max(topic_hits, key=topic_hits.get)

# Replace each interest string by its topic, or 'mixed' when none matched.
# The list is assigned positionally, so this does not rely on the index
# labels being 0..n-1.
df['Interests'] = [
    interest_map.get(interest_text.lower(), 'mixed')
    for interest_text in df['Interests']
]


In [16]:
# df.to_csv('~/Documents/thesis/data/csv/fb_gold.csv', index=False)

In [20]:
# Display the first 100 rows of df as this cell's output
df.head(100)

Unnamed: 0,AdID,AdText,Clicks,Impressions,Age,CreationDate,LandingPage,Location,Interests,AdSpend,...,survey_number,AccountGroup,AccountGroupCluster,CreationDateFormatted,CreationDateInteger,AgeAverage,AgeAverageBin,AdSpendBin,ClicksBin,ImpressionsBin
0,374,Join us because we care. Black matters.,0.0,137.0,18 - 65+,06/10/15 02:59:53 AM PDT,https://www.facebook.com/Black-Matters-1579673...,United States: Baltimore (+20 km) Maryland; St...,unavailable,44.87,...,493,black matters,cluster_1,2015-06-10,20150610,41.5,MidAge,mid,low,mid
1,655,NOT EVERY BOY WANTS TO BE A SOLDIER. A beautif...,35.0,452.0,18 - 65+,06/23/15 07:04:01 AM PDT,https://www.facebook.com/LGBT-United-839497472...,Living In: United States,unavailable,184.81,...,2252,lgbt united,Unavailable,2015-06-23,20150623,41.5,MidAge,mid,mid,mid
2,664,"""People can tolerate two homosexuals they see ...",26.0,374.0,18 - 65+,06/23/15 07:02:40 AM PDT,https://www.facebook.com/LGBT-United-839497472...,Living In: United States,unavailable,99.95,...,Unavailable,lgbt united,Unavailable,2015-06-23,20150623,41.5,MidAge,mid,mid,mid
3,79,,0.0,31.0,18 - 65+,06/09/15 03:50:21 AM PDT,https://www.facebook.com/pages/L-for-life/9949...,United States,unavailable,33.59,...,1326,l for life,cluster_1,2015-06-09,20150609,41.5,MidAge,mid,low,low
4,325,California... knows how to party California......,4.0,326.0,18 - 65+,06/10/15 07:34:52 AM PDT,https://www.facebook.com/Black-Matters-1579673...,"United States: Baltimore Maryland; Ferguson, S...",unavailable,45.94,...,Unavailable,black matters,cluster_1,2015-06-10,20150610,41.5,MidAge,mid,mid,mid
5,326,"Since 2010, over 350 of our lives have been ta...",517.0,1478.0,18 - 65+,06/12/15 03:13:16 AM PDT,https://www.facebook.com/Black-Matters-1579673...,"United States Baltimore Maryland: Ferguson, St...",unavailable,99.97,...,Unavailable,black matters,cluster_1,2015-06-12,20150612,41.5,MidAge,mid,high,mid
6,327,"'Just like Trayvon Martin, race mattered for A...",7.0,125.0,18 - 65+,06/11/15 06:51:30 AM PDT,https://www.facebook.com/Black-Matters-1579673...,"United States: Baltimore Maryland; Ferguson, S...",unavailable,34.77,...,Unavailable,black matters,cluster_1,2015-06-11,20150611,41.5,MidAge,mid,mid,mid
7,328,Race war started by Texas teacher A Texas four...,17.0,168.0,18 - 65+,06/11/15 07:03:58 AM PDT,https://www.facebook.com/Black-Matters-1579673...,"United States: Baltimore Maryland; Ferguson, S...",unavailable,31.54,...,Unavailable,black matters,cluster_1,2015-06-11,20150611,41.5,MidAge,mid,mid,mid
8,329,The image of 1938 shows several African Americ...,18.0,482.0,18 - 65+,06/15/15 07:21:33 AM PDT,https://www.facebook.com/Black-Matters-1579673...,"United States: Baltimore Maryland; Ferguson, S...",unavailable,90.65,...,Unavailable,black matters,cluster_1,2015-06-15,20150615,41.5,MidAge,mid,mid,mid
9,330,American Racists On The Road The racist...,24.0,524.0,18 - 65+,06/15/15 07:22:00 AM PDT,https://www.facebook.com/Black-Matters-1579673...,"United States: Baltimore Maryland; Ferguson, S...",unavailable,88.45,...,Unavailable,black matters,cluster_1,2015-06-15,20150615,41.5,MidAge,mid,mid,mid
