In [53]:
from datetime import datetime
import pandas as pd
import math
import re

In [54]:
# Read the raw Facebook ads export into a DataFrame
path = "~/Documents/thesis/data/csv/FacebookAds.csv"
df = pd.read_csv(path)

# Keep only rows that have both an AdText and a CreationDate value
keep_rows = pd.notnull(df['AdText']) & pd.notnull(df['CreationDate'])

# Columns discarded up front: either too sparsely populated or not useful
unwanted_columns = [
    'EndDate', 'Behaviors', 'PeopleWhoMatch', 'Placements', 'pages',
    'FriendsOfConnections', 'ExcludedConnections', 'Gender', 'Generation',
    'Politics', 'CustomAudience', 'SourceFile', 'SourceZip', 'Language'
]

# Filter rows, drop the unwanted columns, then renumber the index 0..n-1
# (later cells address rows by position, so the index must be contiguous)
df = df[keep_rows].drop(columns=unwanted_columns).reset_index(drop=True)

In [33]:
# Fix float columns: a missing Clicks / Impressions / AdSpend value becomes 0.0.
# NOTE: the list is called `integer_cols`, but the fill value is the float 0.0
# and the columns are left as floats.
integer_cols = ['Clicks','Impressions','AdSpend']

# One vectorised fill instead of visiting every row with iterrows() and
# writing cell by cell through df.at. Only missing (NaN) cells are touched.
# NOTE(review): assumes these columns are numeric -- the previous per-cell
# math.isnan() check would have raised on a non-numeric value; confirm.
df[integer_cols] = df[integer_cols].fillna(0.0)

In [34]:
# Fix AdText column: strip URLs and social-media call-to-action boilerplate.

# Anything starting with "http" up to the next whitespace is treated as a URL.
_URL_PATTERN = re.compile(r'http\S+')

# Fixed phrases removed verbatim (case-sensitive), applied in this order.
_AD_TEXT_BOILERPLATE = (
    'Subscribe to our channel:',
    'Follow my Facebook:',
    'Follow me on Instagram:',
    'Follow me on Twitter:',
)


def _clean_ad_text(text):
    """Return `text` with URLs and the known boilerplate phrases removed.

    Surrounding whitespace left behind by a removal is not trimmed.
    """
    cleaned = _URL_PATTERN.sub('', text)
    for phrase in _AD_TEXT_BOILERPLATE:
        cleaned = cleaned.replace(phrase, '')
    return cleaned


# Assign the whole column at once rather than writing each cell with
# df.at[position, ...], which only works while the index is exactly 0..n-1.
df['AdText'] = df['AdText'].map(_clean_ad_text)

In [35]:
# Fix some string cols: missing values become the literal 'unavailable' and
# every remaining value is converted to str.
string_cols = ['LandingPage','Location','Interests','AdSpendCurrency']
for string_col in string_cols:
    # Fill missing values *before* stringifying, instead of stringifying first
    # and then searching row by row for the text 'nan'. This is vectorised and
    # does not rely on the index being 0..n-1.
    df[string_col] = df[string_col].fillna('unavailable').astype(str)

In [39]:
# Topic -> keywords used to classify the free-text Interests column.
#
# Keywords are matched as substrings of the *lower-cased* interest string, so
# every keyword here must itself be lower case. Each keyword in a topic's list
# that is found adds one to that topic's score, so a keyword must not be listed
# twice within the same topic (it would be counted twice).
interest_keywords = {
    'left wing': [
        'humanitarianism', 'i have a dream', 'anti-racism', 'social justice',
        'wellness', 'liberalism', 'malcolm x', 'human rights', 'democratic',
        'mother jones', 'bernie', 'libertarian'
    ],
    'right wing': [
        'veterans', 'secession', 'patriotism', 'republican', 'tea party',
        'southern', 'donald trump', 'vietnam', 'manufacturing', 'god',
        'ron paul', 'breitbart', 'texas', 'fox news', 'jesus', 'confederate',
        'cato institute', 'evangelicalism', 'right-wing', 'nationalism'
    ],
    'self defense': ['self-defense', 'self defense'],
    'illegal immigration': [
        'stop illegal immigration', 'deportation', 'illegal immigration',
        'immigration'
    ],
    'gun': ['2nd amendment', 'gun rights', 'bear arms', 'second amendment'],
    'black': [
        # was 'BlackNews.com': the capitals could never match lower-cased text
        'blacknews.com', 'black power', 'cop block', 'police brutality',
        'african american', 'african-american', 'black (color)',
        'pan-africanism', 'martin luther king', 'black', 'african', 'trayvon',
        # duplicate 'martin luther king' entry removed (it was counted twice)
        'racial', 'anti-racism', 'racism in the united states'
    ],
    'islam': ['islam', 'quran', 'muslims', 'muslim'],
    'police': [
        'law enforcement', 'the badge', 'blue line', 'police or safety',
        'police officer or safety'
    ],
    'hispanic': ['hispanic', 'mexican', 'latin', 'chicano', 'la raza'],
    'native american': ['native american', 'american indian'],
    # NOTE(review): 'software' under 'music' looks out of place -- confirm.
    'music': ['music', 'soundcloud', 'apple music', 'itunes', 'spotify','software'],
    'entertainment': [
        'buzzfeed', '9gag', 'entertainment', 'imgur', 'humour', 'funny',
        'reddit', 'games', 'meme'
    ],
    'lgbt': ['lgbt', 'gay', 'transgender', 'trans', 'homosexuality'],
    'mainstream':
    ['syria', 'army', 'hillary', 'government', 'obama', 'senate', 'tax', 'political party'],
    'prison': ['prison', 'incarceration', 'correctional', 'inmates'],
    # Matches the 'unavailable' placeholder written for missing Interests values
    'unavailable': ['unavailable']
}


In [45]:
# Run through interests
# Builds interest_map: lower-cased Interests string -> {topic: hit count},
# where the hit count is how many of that topic's keywords occur in the
# interest string as substrings. Topics with no hits are omitted.
interest_map = {}

for unique_interest in df['Interests'].unique():

    # Put unique interest in lower case so matching is case-insensitive
    unique_interest = unique_interest.lower()

    # Dict to store found topics and their keyword hit counts
    found_topics = {}

    # Iterate over set of topics with interest keywords.
    # .items() (not .iteritems()) so the cell runs on both Python 2 and 3.
    for topic, keywords in interest_keywords.items():

        # Keywords are lower-cased too, so a capitalised keyword still matches
        hits = sum(
            1 for keyword in keywords if keyword.lower() in unique_interest
        )
        if hits:
            found_topics[topic] = hits

    # Always recorded, even when empty (the old `for ... else` had no `break`,
    # so its else-branch ran unconditionally anyway)
    interest_map[unique_interest] = found_topics


In [46]:
# Collapse each interest's {topic: hit count} dict down to one topic name:
# the topic with the highest count, or 'mixed' when no topic scored at all.
# list(...) snapshots the items because interest_map is rewritten in the loop;
# .items() (not .iteritems()) keeps this working on both Python 2 and 3.
for interest, topics in list(interest_map.items()):

    # Entry already holds a topic name from an earlier run of this cell;
    # skip it so re-running the cell does not crash.
    if not isinstance(topics, dict):
        continue

    top_topic = 'mixed'

    if topics:
        # max() returns the first maximal item, so a tie goes to the topic
        # encountered first -- same as the former "strictly greater" scan.
        topic_name, topic_count = max(topics.items(), key=lambda item: item[1])
        if topic_count > 0:
            top_topic = topic_name

    interest_map[interest] = top_topic


In [47]:
# Replace every raw Interests string with the topic name stored for it in
# interest_map (keyed by the lower-cased string). The column is assigned in one
# go instead of writing df.at[position, ...] per row, which only works while
# the index is exactly 0..n-1. A value with no entry in interest_map still
# raises KeyError, e.g. if this cell is run a second time.
df['Interests'] = [interest_map[raw.lower()] for raw in df['Interests']]