In [4]:
import pandas as pd
import numpy as np
import dateutil.parser
import json
import re
import string 
import math
%matplotlib inline

In [5]:
# Load the master CSV (FacebookAds.csv) into the working DataFrame `df`.
path = "~/Documents/thesis/data/csv/FacebookAds.csv"
df = pd.read_csv(filepath_or_buffer=path)

## Scrub df of missing or unwanted values

In [6]:
# Re-read the master CSV and prune it in a single chain.
path = "~/Documents/thesis/data/csv/FacebookAds.csv"

# Columns removed immediately: either too unavailable or not useful.
unwanted_columns = [
    'EndDate', 'Behaviors', 'PeopleWhoMatch', 'Placements', 'pages',
    'FriendsOfConnections', 'ExcludedConnections', 'Gender', 'Generation',
    'Politics', 'CustomAudience', 'SourceFile', 'SourceZip', 'Language'
]

df = (
    pd.read_csv(path)
    # Remove rows with null AdText values
    .dropna(subset=['AdText'])
    # Remove the unwanted columns listed above
    .drop(columns=unwanted_columns)
    # Remove rows with no CreationDate value
    .dropna(subset=['CreationDate'])
    # Renumber the index 0..n-1 after the row filters
    .reset_index(drop=True)
)

# Fix float columns: a missing Clicks / Impressions / AdSpend value becomes 0.0.
# fillna runs column-wise, so there is no per-row Python loop and no
# math.isnan() call that would raise TypeError on a non-numeric cell.
integer_cols = ['Clicks','Impressions','AdSpend']
df[integer_cols] = df[integer_cols].fillna(0.0)
            
# Fix AdText column: remove URLs and a fixed set of boilerplate phrases.
# Phrases are removed in exactly this order, after the URL pass.
_AD_TEXT_BOILERPLATE = (
    '?????? ??? ????? ? ??????????',
    'Subscribe to our channel:',
    'Follow my Facebook:',
    'Follow me on Instagram:',
    'Follow me on Twitter:',
)


def _scrub_ad_text(text):
    """Return `text` without `http...` tokens and without the boilerplate phrases."""
    scrubbed = re.sub(r'http\S+', '', text)
    for phrase in _AD_TEXT_BOILERPLATE:
        scrubbed = scrubbed.replace(phrase, '')
    return scrubbed


# Positional assignment, one cleaned string per row.
df['AdText'] = [_scrub_ad_text(text) for text in df['AdText']]

# Fix some string cols: cast each one to str, then label missing entries.
# Any entry that reads as the text 'nan' after the cast is replaced by
# 'unavailable'; every other entry is kept as is.
string_cols = ['LandingPage','Location','Interests','AdSpendCurrency']
for string_col in string_cols:
    as_text = df[string_col].astype(str)
    df[string_col] = as_text.where(as_text != 'nan', 'unavailable')

# Remove rows with null AdText values.
# The index is not reset here, so remaining rows keep their existing labels.
df = df.dropna(subset=['AdText'])

## Create new columns for better covariates

In [7]:
# CreationDateFormatted: calendar date parsed from each CreationDate string.

# Value used when a CreationDate cannot be parsed. It is converted to a
# date (not left as a datetime) so the column holds a single type.
fallback_creation_date = dateutil.parser.parse('2015-10-31').date()

CreationDateFormatted = []
for CreationDate in df['CreationDate']:
    try:
        # The last 7 characters are cut off before parsing
        # (presumably a trailing timezone suffix -- TODO confirm).
        new_date = dateutil.parser.parse(CreationDate[:-7]).date()
    except Exception:
        # Best-effort parsing: any failure falls back to the default date.
        # Unlike a bare `except:`, this lets KeyboardInterrupt / SystemExit
        # propagate.
        new_date = fallback_creation_date
    CreationDateFormatted.append(new_date)
df['CreationDateFormatted'] = CreationDateFormatted

In [8]:
# AgeAverage: mean of all integers found in each row's Age string.


def _average_age(age_string):
    """Return the mean of the digit groups in `age_string`.

    Returns NaN when the value is null or contains no digits, instead of
    failing with TypeError / ZeroDivisionError.
    """
    if pd.isnull(age_string):
        return np.nan
    ages = [float(token) for token in re.findall(r'\d+', age_string)]
    if not ages:
        return np.nan
    # `ages` holds floats, so this is true division on Python 2 as well.
    return sum(ages) / len(ages)


AgeAverage = [_average_age(age_string) for age_string in df['Age']]
df['AgeAverage'] = AgeAverage

In [9]:
# AgeAverageBin: bucket AgeAverage into three labelled quantile bins.
# The split asks for 4 quantiles but supplies 3 labels; pd.qcut only accepts
# that when duplicates='drop' removes exactly one repeated bin edge.
# Inspect the surviving edges first so a mismatch is reported clearly.
age_bin_labels = ["LowAge","MidAge","HighAge"]
age_bin_edges = pd.qcut(df['AgeAverage'], 4, retbins=True, duplicates='drop')[1]
if len(age_bin_edges) - 1 != len(age_bin_labels):
    raise ValueError(
        "AgeAverage quartiles produced {0} bins (edges: {1}) but {2} labels "
        "are defined; adjust the number of quantiles or the labels.".format(
            len(age_bin_edges) - 1, list(age_bin_edges), len(age_bin_labels)
        )
    )
df['AgeAverageBin'] = pd.qcut(df['AgeAverage'], 4, labels=age_bin_labels, duplicates='drop')

In [10]:
# AdSpendBin: quantile-based split of AdSpend into three labelled buckets.
spend_bucket_names = ["low","mid","high"]
df['AdSpendBin'] = pd.qcut(x=df['AdSpend'], q=3, labels=spend_bucket_names)

In [11]:
# ClicksBin: quantile-based split of Clicks into three labelled buckets.
click_bucket_names = ["low","mid","high"]
df['ClicksBin'] = pd.qcut(x=df['Clicks'], q=3, labels=click_bucket_names)

In [12]:
# ImpressionsBin: quantile-based split of Impressions into three labelled buckets.
impression_bucket_names = ["low","mid","high"]
df["ImpressionsBin"] = pd.qcut(x=df['Impressions'], q=3, labels=impression_bucket_names)

In [13]:
# Load the JSON that maps each topic to a list of keywords.
# NOTE(review): this is an absolute, user-specific path, while the CSV paths
# in this notebook are '~'-relative -- consider making it configurable.
interests_path = "/Users/drewnleonard/Documents/thesis/data/json/interest_groups_gold.json"
with open(interests_path) as f:
    interest_keywords = json.load(f)

# interest_map: lower-cased interest string -> {topic: number of that topic's
# keywords that occur as substrings of the interest string}.
interest_map = {}

for unique_interest in df['Interests'].unique():

    # Matching is done against the lower-cased interest
    # (assumes the keywords in the JSON are lower case -- TODO confirm).
    unique_interest = unique_interest.lower()

    # Topics found for this interest, with their keyword hit counts
    found_topics = {}

    # .items() rather than .iteritems() so this runs on Python 2 and 3.
    for topic, keywords in interest_keywords.items():
        for keyword in keywords:
            if keyword in unique_interest:
                found_topics[topic] = found_topics.get(topic, 0) + 1

    # Recorded for every interest, including those that matched no keyword
    # (their entry is an empty dict).
    interest_map[unique_interest] = found_topics


In [14]:
def _dominant_topic(topics):
    """Return the name of the topic with the highest count in `topics`.

    `topics` maps topic name -> hit count. Ties keep the topic seen first
    in iteration order (only a strictly greater count replaces the current
    best). Returns 'mixed' when no topic has a positive count or the
    winning name is the empty string.

    A value that is not a dict is returned unchanged, so running this cell
    a second time (when the map already holds topic names) is a no-op
    instead of an error.
    """
    if not isinstance(topics, dict):
        return topics

    best_name, best_count = '', 0
    for topic_name, topic_count in topics.items():
        if topic_count > best_count:
            best_name, best_count = topic_name, topic_count

    return best_name if best_name != '' else 'mixed'


# Collapse each interest's topic counts to a single topic name, in place.
# Iterating over a snapshot of the keys keeps this valid on Python 2 and 3.
for interest in list(interest_map):
    interest_map[interest] = _dominant_topic(interest_map[interest])


In [15]:
# Replace each row's interest string with the entry stored for it in
# interest_map (looked up by the lower-cased string).
# Series.map keeps the frame's own index labels, so this is correct even when
# the index is not a clean 0..n-1 range (an enumerate() position passed to
# df.at is only right in that case). An interest missing from interest_map
# still raises KeyError.
df['Interests'] = df['Interests'].map(lambda interest: interest_map[interest.lower()])

In [16]:
# Write the processed frame to fb_gold.csv, without the index column.
gold_csv_path = '~/Documents/thesis/data/csv/fb_gold.csv'
df.to_csv(gold_csv_path, index=False)

In [None]:
# # InterestsGroups

# # Load mapped interests groups json file
# interests_path = "/Users/drewnleonard/Documents/thesis/data/json/interest_groups.json"
# with open(interests_path) as f:
#     interests_groups_map = json.load(f)

# for k, v in interests_groups_map.iteritems():
#     v = [e.lower() for e in v]
#     interests_groups_map[k] = set(v)

# interests_group_master = {}
    
# for n, interests in enumerate(df['Interests']):
    
#     # Continue if there are no available interests
#     if interests == 'Unavailable':
#         continue
    
#     # Parse interests into list
#     interests_list = interests.split(',')
    
#     found_interest_groups = {}
    
#     # Iterate over interests in list
#     for interest in interests_list:
        
#         interest = interest.lower()
        
#         # Iterate over mapped groups
#         for interest_group_title, interest_group_keywords in interests_groups_map.iteritems():
            
#             # For each mapped group, iterate over its keywords
#             for keyword in interest_group_keywords:
                    
#                     # If keyword is in interest, record that and break
#                     if keyword in interest:
                        
#                         if interest_group_title not in found_interest_groups:
#                             found_interest_groups[interest_group_title] = 0
                        
#                         # Increment keyword's value 
#                         found_interest_groups[interest_group_title] += 1
                        
#                         break
    
#     curr_group_title = "Unavailable"
#     curr_group_score = 0
    
#     for k, v in found_interest_groups.iteritems():
#         curr_group_title = k if v > curr_group_score else curr_group_title
        
#     df.at[n, 'Interests'] = curr_group_title
                               

In [None]:
# # Remove rows with null values
# remove_rows_for_cols = ['AdText','AdID','CreationDate']
# for col in remove_rows_for_cols:
#     df = df[pd.notnull(df[col])]

# # Reset index after removing rows
# df = df.reset_index(drop=True)

In [None]:
# # Remove rows without valid text
# for n, e in enumerate(df['AdText']):
#     if e and re.match(r'^[_\W]+$', e):
#         df.at[n, 'AdText'] = None

In [None]:
# # Remove rows with null AdText values
# for col in remove_rows_for_cols:
#     df = df[pd.notnull(df['AdText'])]

# # Reset index after removing rows
# df = df.reset_index(drop=True)

In [None]:
# # Scrub texts of social media phrases
# social_media_phrases = ["Subscribe to our channel:","Follow my Facebook:","Follow me on Instagram:","Follow me on Twitter:"]
# for n, e in enumerate(df['AdText']):
#     curr_e = e.lower()
#     curr_e = re.sub(r'http\S+', '', curr_e)
#     [
#         curr_e.replace(phrase, '') for phrase in social_media_phrases
#     ]
#     df.at[n, 'AdText'] = curr_e

In [None]:
# df = df.drop(columns=[
#     'EndDate', 'Behaviors', 'PeopleWhoMatch', 'Language','FriendsOfConnections', 
#     'ExcludedConnections', 'Gender', 'Generation',
#     'Politics', 'CustomAudience', 'SourceFile', 
#     'SourceZip', 'pages', 'Location'
# ])
# df = df.reset_index(drop=True)

In [None]:
# # Fix integer columns with null vals
# integer_cols = ['Clicks','Impressions','AdSpend']
# for integer_col in integer_cols:
#     df[integer_col] = df[integer_col].fillna(0.0)
    
# # Fix string columns with null vals
# string_cols = ["LandingPage", "Interests","AdSpendCurrency"]
# for string_col in string_cols:
#     df[string_col] = df[string_col].fillna("Unavailable")

In [None]:
# # Count null values as percentage for each column
# total_vals = len(df)
# for col in df.columns:
#     null_vals = df[col].isnull().sum()
#     print("{0}: {1}".format(col, float(null_vals)/total_vals))