In [23]:
from datetime import datetime
import pandas as pd
import math
import re

In [24]:
# Read the raw Facebook ads CSV and apply the first round of clean-up in a
# single chain, so the result never exists in a half-cleaned state.
path = "~/Documents/thesis/data/csv/FacebookAds.csv"

df = (
    pd.read_csv(path)
    # An ad without text is unusable: drop those rows first
    .dropna(subset=['AdText'])
    # Columns judged too sparsely populated or not useful
    .drop(columns=[
        'EndDate', 'Behaviors', 'PeopleWhoMatch', 'Placements', 'pages',
        'FriendsOfConnections', 'ExcludedConnections', 'Gender', 'Generation',
        'Politics', 'CustomAudience', 'SourceFile', 'SourceZip', 'Language'
    ])
    # Rows with no CreationDate value are dropped as well
    .dropna(subset=['CreationDate'])
    # Rows were removed above, so rebuild a contiguous 0..n-1 index
    .reset_index(drop=True)
)

In [25]:
# Fix numeric columns: a missing Clicks / Impressions / AdSpend value is
# treated as zero.
# NOTE: despite the name, these columns hold floats (NaN forces a float dtype).
integer_cols = ['Clicks','Impressions','AdSpend']

# Column-wise fillna replaces the previous row-by-row iterrows() scan: same
# result on numeric data, without a Python-level loop over every cell and
# without math.isnan() raising TypeError on a non-numeric value.
for col in integer_cols:
    df[col] = df[col].fillna(0.0)

In [26]:
# Fix AdText column: strip URLs and social-media call-to-action boilerplate.

# Phrases removed verbatim from every ad text
_AD_TEXT_BOILERPLATE = (
    'Subscribe to our channel:',
    'Follow my Facebook:',
    'Follow me on Instagram:',
    'Follow me on Twitter:',
)


def _clean_ad_text(text):
    """Return `text` without http(s) URLs and without the boilerplate phrases."""
    # URLs go first: everything from 'http' up to the next whitespace
    cleaned = re.sub(r'http\S+', '', text)
    for phrase in _AD_TEXT_BOILERPLATE:
        cleaned = cleaned.replace(phrase, '')
    return cleaned


# Series.map keeps each value on its own index label, so this no longer relies
# on the index being exactly 0..n-1 (the previous df.at[n, ...] loop used the
# enumerate() position as a label).
df['AdText'] = df['AdText'].map(_clean_ad_text)

In [30]:
# Fix some string cols: every value becomes a str and missing entries are
# labelled 'unavailable'.
string_cols = ['LandingPage','Location','Interests','AdSpendCurrency']
for string_col in string_cols:
    # Fill the missing values *before* casting. The previous version cast
    # first and then searched for the literal text 'nan', which depends on how
    # astype(str) happens to render a missing value and used the enumerate()
    # position as an index label.
    df[string_col] = df[string_col].fillna('unavailable').astype(str)

In [32]:
# Topic -> keywords. A keyword counts as a hit for its topic when it occurs as
# a substring of the *lower-cased* Interests text, so:
#   * every keyword must itself be lower case, otherwise it can never match;
#   * a keyword listed twice under one topic would be counted twice.
# The topic names end up as the labels written back to df['Interests'], so
# they are left exactly as they were (including the capitalised 'Government').
interest_keywords = {
    'left wing': [
        'humanitarianism', 'i have a dream', 'anti-racism', 'social justice',
        'wellness', 'liberalism', 'malcolm x', 'human rights', 'democratic',
        'mother jones', 'bernie', 'libertarian'
    ],
    'right wing': [
        'veterans', 'secession', 'patriotism', 'republican', 'tea party',
        'southern', 'donald trump', 'vietnam', 'manufacturing', 'god',
        'ron paul', 'breitbart', 'texas', 'fox news', 'jesus', 'confederate',
        'cato institute', 'evangelicalism', 'right-wing', 'nationalism'
    ],
    'self defense': ['self-defense', 'self defense'],
    'illegal immigration': [
        'stop illegal immigration', 'deportation', 'illegal immigration',
        'immigration'
    ],
    'gun': ['2nd amendment', 'gun rights', 'bear arms', 'second amendment'],
    'black': [
        # was 'BlackNews.com': mixed case never matched the lower-cased text
        'blacknews.com', 'black power', 'cop block', 'police brutality',
        'african american', 'african-american', 'black (color)',
        'pan-africanism', 'martin luther king', 'black', 'african', 'trayvon',
        # duplicate 'martin luther king' entry removed (it double-counted)
        'racial', 'anti-racism', 'racism in the united states'
    ],
    'islam': ['islam', 'quran', 'muslims', 'muslim'],
    'police': [
        'law enforcement', 'the badge', 'blue line', 'police or safety',
        'police officer or safety'
    ],
    'hispanic': ['hispanic', 'mexican', 'latin', 'chicano', 'la raza'],
    'native american': ['native american', 'american indian'],
    'music': ['music', 'soundcloud', 'apple music', 'itunes', 'spotify','software'],
    'entertainment': [
        'buzzfeed', '9gag', 'entertainment', 'imgur', 'humour', 'funny',
        'reddit', 'games', 'meme'
    ],
    'lgbt': ['lgbt', 'gay', 'transgender', 'trans', 'homosexuality'],
    'Government':
    ['syria', 'army', 'hillary', 'government', 'obama', 'senate', 'tax', 'political party'],
    'prison': ['prison', 'incarceration', 'correctional', 'inmates'],
    # Matches the placeholder used for rows whose Interests value was missing
    'unavailable': ['unavailable']
}


In [33]:
# Run through interests: map every distinct (lower-cased) Interests string to
# a dict of {topic: number of that topic's keywords found in the string}.
# Strings with no keyword hit at all map to an empty dict.
interest_map = {}

for unique_interest in df['Interests'].unique():

    # Matching is case-insensitive: compare everything in lower case
    unique_interest = unique_interest.lower()

    # topic -> number of keyword hits for this interest string
    found_topics = {}

    # .items() works on both Python 2 and 3 (.iteritems() is Python 2 only)
    for topic, keywords in interest_keywords.items():
        for keyword in keywords:
            # Plain substring test; the keyword is lower-cased too so that a
            # mixed-case keyword can still match
            if keyword.lower() in unique_interest:
                found_topics[topic] = found_topics.get(topic, 0) + 1

    # Recorded unconditionally (the former `for ... else` had no `break`, so
    # its else-branch always ran anyway)
    interest_map[unique_interest] = found_topics


In [34]:
# Collapse each interest's {topic: hit count} dict into one topic label:
# the topic with the most keyword hits, or 'mixed' when nothing matched.
# Only values of existing keys are reassigned, so iterating over .items()
# while updating is safe (.items() also works on Python 3, unlike .iteritems()).
for interest, topics in interest_map.items():

    # Already collapsed to a label (e.g. this cell was run twice): skip it
    # instead of crashing on a str that has no items
    if not isinstance(topics, dict):
        continue

    if topics:
        # max() returns the first maximal key in iteration order, i.e. the
        # same winner a strict `>` scan over the dict would pick
        interest_map[interest] = max(topics, key=topics.get)
    else:
        interest_map[interest] = 'mixed'


In [35]:
# Per-column non-null count for the rows whose Interests value is the
# 'unavailable' placeholder
df.loc[df['Interests'] == 'unavailable'].count()

AdID               787
AdText             787
Clicks             787
Impressions        787
Age                787
CreationDate       787
LandingPage        787
Location           787
Interests          787
AdSpend            787
AdSpendCurrency    787
dtype: int64

In [36]:
# Replace every raw Interests string by its topic label from interest_map.
# The lookup is a plain dict subscript, so a string missing from the map still
# raises KeyError instead of silently becoming NaN. Series.map keeps values on
# their own index labels, whereas the previous df.at[n, ...] loop used the
# enumerate() position as a label.
df['Interests'] = df['Interests'].map(lambda raw: interest_map[raw.lower()])

In [10]:
# Preview the first 100 cleaned rows
df.head(n=100)

Unnamed: 0,AdID,AdText,Clicks,Impressions,Age,CreationDate,LandingPage,Location,Interests,AdSpend,AdSpendCurrency
0,374,Join us because we care. Black matters.,0.0,137.0,18 - 65+,06/10/15 02:59:53 AM PDT,https://www.facebook.com/Black-Matters-1579673...,United States: Baltimore (+20 km) Maryland; St...,unavailable,44.87,RUB
1,655,NOT EVERY BOY WANTS TO BE A SOLDIER. A beautif...,35.0,452.0,18 - 65+,06/23/15 07:04:01 AM PDT,https://www.facebook.com/LGBT-United-839497472...,Living In: United States,unavailable,184.81,RUB
2,664,"""People can tolerate two homosexuals they see ...",26.0,374.0,18 - 65+,06/23/15 07:02:40 AM PDT,https://www.facebook.com/LGBT-United-839497472...,Living In: United States,unavailable,99.95,RUB
3,79,?????? ??? ????? ? ??????????,0.0,31.0,18 - 65+,06/09/15 03:50:21 AM PDT,https://www.facebook.com/pages/L-for-life/9949...,United States,unavailable,33.59,RUB
4,325,California... knows how to party California......,4.0,326.0,18 - 65+,06/10/15 07:34:52 AM PDT,https://www.facebook.com/Black-Matters-1579673...,"United States: Baltimore Maryland; Ferguson, S...",unavailable,45.94,RUB
5,326,"Since 2010, over 350 of our lives have been ta...",517.0,1478.0,18 - 65+,06/12/15 03:13:16 AM PDT,https://www.facebook.com/Black-Matters-1579673...,"United States Baltimore Maryland: Ferguson, St...",unavailable,99.97,RUB
6,327,"'Just like Trayvon Martin, race mattered for A...",7.0,125.0,18 - 65+,06/11/15 06:51:30 AM PDT,https://www.facebook.com/Black-Matters-1579673...,"United States: Baltimore Maryland; Ferguson, S...",unavailable,34.77,RUB
7,328,Race war started by Texas teacher A Texas four...,17.0,168.0,18 - 65+,06/11/15 07:03:58 AM PDT,https://www.facebook.com/Black-Matters-1579673...,"United States: Baltimore Maryland; Ferguson, S...",unavailable,31.54,RUB
8,329,The image of 1938 shows several African Americ...,18.0,482.0,18 - 65+,06/15/15 07:21:33 AM PDT,https://www.facebook.com/Black-Matters-1579673...,"United States: Baltimore Maryland; Ferguson, S...",unavailable,90.65,RUB
9,330,American Racists On The Road The racist...,24.0,524.0,18 - 65+,06/15/15 07:22:00 AM PDT,https://www.facebook.com/Black-Matters-1579673...,"United States: Baltimore Maryland; Ferguson, S...",unavailable,88.45,RUB
