In [1]:
import pandas as pd
import numpy as np
import unicodedata
import re
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

In [2]:
def basic_clean(somestring):
    '''
    Performs a first-pass normalization of raw text and returns the cleaned string.
    * Lower-cases the text
    * Decomposes unicode characters (NFKD) and discards anything that cannot be encoded as ascii,
      so accented letters lose their accent and other non-ascii symbols are dropped
    * Removes every character that is not a lower case letter, a digit, a straight apostrophe (')
      or whitespace
    '''
    lowered = somestring.lower()
    decomposed = unicodedata.normalize('NFKD', lowered)
    # Round-trip through ascii to throw away the combining marks / symbols left by NFKD.
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return re.sub(r"[^a-z0-9'\s]", '', ascii_only)

def tokenize(somestring):
    '''
    Tokenizes the given string with a freshly created nltk ToktokTokenizer.
    return_str=True is passed to the tokenizer, so the result comes back as a single string
    rather than a list of tokens.
    '''
    return nltk.tokenize.ToktokTokenizer().tokenize(somestring, return_str=True)

def lemmatize(somestring):
    '''
    Splits the string on whitespace, runs every word through an nltk WordNetLemmatizer,
    and returns the lemmatized words joined back together with single spaces.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in somestring.split())

def _english_stopwords():
    '''
    Returns the nltk english stopword list as a frozenset, loading it from the corpus only once.
    The result is remembered on the function object so later calls skip the corpus lookup.
    '''
    cached = getattr(_english_stopwords, '_cached', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cached = cached
    return cached


def remove_stopwords(string, keep_words=['no', 'not'], exclude_words=[]):
    '''
    This function takes in a string, removes stopwords from the string, and then returns the result.
    The function also allows for the optional arguments keep_words and exclude_words to modify the stopword list
    * string - the text to filter; it is split on whitespace and the surviving words are re-joined with single spaces
    * keep_words - a list of words to remove from the standard english stopwords list from nltk.corpus i.e. no or not,
    so that they are kept in the text
    * exclude_words - a list of words to add to the standard english stopwords list from nltk.corpus
    i.e. ['data', 'science'] to remove both words when dealing with articles only about data science
    * By default, keep_words includes no and not
    The default lists are only read (copied into sets), never modified.
    '''
    # This function is applied once per row, so the base stopword list is fetched through the cached
    # helper instead of being reloaded from the corpus on every call.
    stopword_set = (_english_stopwords() - set(keep_words)) | set(exclude_words)
    return ' '.join(word for word in string.split() if word not in stopword_set)


def wrangle(facebook_data_csv_location):
    '''
    This function requires that you download and unzip the file from the link below. File size is over 3 gigs
    https://www.propublica.org/datastore/dataset/political-advertisements-from-facebook
    Once unzipped, pass the file path to the function to:
    * Load the csv into a pandas dataframe
    * Drop the following columns - 'listbuilding_fundraising_proba', 'targetedness', 'targetings', 'targets',
    'targeting', 'images', 'lang', 'thumbnail', 'html', 'page_id', 'suppressed', 'entities', 'lower_page'
    * Create a socio_political_fb column that is True when paid_for_by is present and False when it is missing
    (used as the signal that facebook considers the ad to be about social or political topics)
    * Combine the advertiser and paid_for_by columns, with priority given to paid_for_by values, into an advert_by column
    * If advert_by is still null and the page is known, replace the null value with the page value
    * Fill remaining null values with 'unk' for the advert_by column
    * Drop any row that has a missing title value (47)
    * Drop the remaining columns that contain nulls
    * Strip html tags from the message column
    * Normalize the message column by conducting basic_clean, tokenization, removing stop words, and lemmatizing the text
    * Split the data into train, validate, and test sets (60% / 20% / 20%) that are stratified on socio_political_fb
    * Return train, validate, test
    '''
    unused_columns = ['listbuilding_fundraising_proba', 'targetedness', 'targetings', 'targets',
                      'targeting', 'images', 'lang', 'thumbnail', 'html', 'page_id', 'suppressed',
                      'entities', 'lower_page']

    df = pd.read_csv(facebook_data_csv_location)
    df.drop(columns=unused_columns, inplace=True)

    df['socio_political_fb'] = df['paid_for_by'].notnull()

    # Fallback chain paid_for_by -> advertiser -> page -> 'unk'. The result is assigned back to the
    # column: calling fillna(..., inplace=True) on df['advert_by'] acts on a temporary object and is
    # not guaranteed to update df.
    df['advert_by'] = df['paid_for_by'].fillna(df['advertiser']).fillna(df['page']).fillna('unk')

    df.dropna(subset=['title'], inplace=True)
    df.dropna(axis=1, inplace=True)

    # Remove each html tag individually. A greedy pattern such as '<.+"' runs from the first '<' to the
    # last '"' on a line and deletes the ad text between tags. regex=True is explicit because the
    # default of Series.str.replace differs between pandas versions. Tags become a space so the words
    # on either side do not fuse together.
    df['message'] = df['message'].str.replace(r'<[^>]+>', ' ', regex=True)
    df['message'] = df['message'].apply(basic_clean).apply(tokenize).apply(remove_stopwords).apply(lemmatize)

    # 20% held out for test, then 25% of the remaining 80% for validate.
    train_validate, test = train_test_split(df,
                                            stratify=df.socio_political_fb,
                                            test_size=.2,
                                            random_state=333)

    train, validate = train_test_split(train_validate,
                                       stratify=train_validate.socio_political_fb,
                                       test_size=.25,
                                       random_state=333)
    return train, validate, test

In [3]:
# Load the raw ad export and clean it for exploration (no train/validate/test split in this cell).
df = pd.read_csv('facebookads.csv')
df.drop(columns=['listbuilding_fundraising_proba', 'targetedness', 'targetings', 'targets',
                 'targeting', 'images', 'lang', 'thumbnail', 'html', 'page_id', 'suppressed',
                 'entities', 'lower_page'], inplace=True)
df['socio_political_fb'] = df['paid_for_by'].notnull()

# Fallback chain paid_for_by -> advertiser -> page -> 'unk', assigned back to the column rather than
# filled in place on df['advert_by'], which is not guaranteed to update df.
df['advert_by'] = df['paid_for_by'].fillna(df['advertiser']).fillna(df['page']).fillna('unk')

df.dropna(subset=['title'], inplace=True)
df.dropna(axis=1, inplace=True)

# Strip html tags one at a time; a greedy '<.+"' pattern would also delete the text between tags.
# regex=True is explicit because the default of Series.str.replace differs between pandas versions.
df['message'] = df['message'].str.replace(r'<[^>]+>', ' ', regex=True)
df['message'] = df['message'].apply(basic_clean).apply(tokenize).apply(remove_stopwords).apply(lemmatize)

df['title_cleaned'] = df['title'].apply(basic_clean)
# Built from advert_by (previously this was computed from the title column by mistake).
df['advert_by_cleaned'] = df['advert_by'].apply(basic_clean)


Unnamed: 0,id,political,not_political,title,message,created_at,updated_at,impressions,political_probability,socio_political_fb,advert_by
0,23844966472130541,1,0,Stop Republicans,election changing rapidly many democrat terrif...,2020-04-28 18:29:51.893295+00,2020-04-28 20:00:18.291895+00,1,0.999563,True,Stop Republicans
1,23844607318170064,0,1,Planned Parenthood,uncertain time shouldnt impact access dependab...,2020-05-03 21:21:17.022063+00,2020-05-03 23:00:14.598025+00,1,0.426290,True,Planned Parenthood Federation of America
2,23844646560350048,0,0,No Kid Hungry,givingtuesdaya happening may 5th response covi...,2020-05-03 21:50:12.097628+00,2020-05-03 23:00:14.717141+00,1,0.625368,True,No Kid Hungry
3,23844775979230727,1,0,Kirsten Gillibrand,want take back senate retire mitch mcconnell s...,2020-04-27 23:44:32.620687+00,2020-04-28 19:48:27.917779+00,3,1.000000,True,Gillibrand for Senate
4,23844529028560107,0,2,Learn How to Eliminate Discomfort Today -->,neck pain caused several factor effectively tr...,2020-03-28 15:38:36.16756+00,2020-04-28 19:52:56.791781+00,3,0.701307,False,unk
...,...,...,...,...,...,...,...,...,...,...,...
222181,23844404769680021,2,0,Shahid Buttar for Congress,housing homeless life working people lowincome...,2020-04-02 15:47:15.164735+00,2020-04-29 10:21:06.0655+00,4,0.999264,True,Shahid Buttar for Congress Committee
222182,6175667699907,0,0,Catholic Relief Services,cr ground preventing responding covid19 outbre...,2020-05-10 20:24:20.711342+00,2020-05-10 22:00:11.252752+00,1,0.687591,True,CATHOLIC RELIEF SERVICES - UNITED STATES CONFE...
222183,23844574679700670,2,0,Elizabeth Warren,running president one elizabeth favorite thing...,2020-04-29 23:30:26.64405+00,2020-04-30 14:00:06.087094+00,1,0.999989,True,"WARREN DEMOCRATS, INC."
222184,23844247053180740,0,1,Be a catalyst for change.,antioch mba feature mutual importance people p...,2020-03-23 23:00:46.971931+00,2020-04-29 23:30:55.279836+00,2,0.825052,False,unk


In [4]:
# Check the dtype of every remaining column before doing any type conversions.
df.dtypes

id                        object
political                  int64
not_political              int64
title                     object
message                   object
created_at                object
updated_at                object
impressions                int64
political_probability    float64
socio_political_fb          bool
advert_by                 object
dtype: object

In [6]:
# Parse both timestamp columns so they can be subtracted from each other.
for stamp_col in ('created_at', 'updated_at'):
    df[stamp_col] = pd.to_datetime(df[stamp_col])

# Whole days elapsed between created_at and updated_at (floor division by one day).
elapsed = df['updated_at'] - df['created_at']
df['known_ad_length'] = elapsed // pd.Timedelta('1d')

# Number of whitespace-separated words in the cleaned message.
message_words = df['message'].str.split()
df['message_length'] = message_words.str.len()
df

Unnamed: 0,id,political,not_political,title,message,created_at,updated_at,impressions,political_probability,socio_political_fb,advert_by,known_ad_length,message_length
0,23844966472130541,1,0,Stop Republicans,election changing rapidly many democrat terrif...,2020-04-28 18:29:51.893295+00:00,2020-04-28 20:00:18.291895+00:00,1,0.999563,True,Stop Republicans,0,51
1,23844607318170064,0,1,Planned Parenthood,uncertain time shouldnt impact access dependab...,2020-05-03 21:21:17.022063+00:00,2020-05-03 23:00:14.598025+00:00,1,0.426290,True,Planned Parenthood Federation of America,0,11
2,23844646560350048,0,0,No Kid Hungry,givingtuesdaya happening may 5th response covi...,2020-05-03 21:50:12.097628+00:00,2020-05-03 23:00:14.717141+00:00,1,0.625368,True,No Kid Hungry,0,35
3,23844775979230727,1,0,Kirsten Gillibrand,want take back senate retire mitch mcconnell s...,2020-04-27 23:44:32.620687+00:00,2020-04-28 19:48:27.917779+00:00,3,1.000000,True,Gillibrand for Senate,0,43
4,23844529028560107,0,2,Learn How to Eliminate Discomfort Today -->,neck pain caused several factor effectively tr...,2020-03-28 15:38:36.167560+00:00,2020-04-28 19:52:56.791781+00:00,3,0.701307,False,unk,31,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...
222181,23844404769680021,2,0,Shahid Buttar for Congress,housing homeless life working people lowincome...,2020-04-02 15:47:15.164735+00:00,2020-04-29 10:21:06.065500+00:00,4,0.999264,True,Shahid Buttar for Congress Committee,26,19
222182,6175667699907,0,0,Catholic Relief Services,cr ground preventing responding covid19 outbre...,2020-05-10 20:24:20.711342+00:00,2020-05-10 22:00:11.252752+00:00,1,0.687591,True,CATHOLIC RELIEF SERVICES - UNITED STATES CONFE...,0,19
222183,23844574679700670,2,0,Elizabeth Warren,running president one elizabeth favorite thing...,2020-04-29 23:30:26.644050+00:00,2020-04-30 14:00:06.087094+00:00,1,0.999989,True,"WARREN DEMOCRATS, INC.",0,42
222184,23844247053180740,0,1,Be a catalyst for change.,antioch mba feature mutual importance people p...,2020-03-23 23:00:46.971931+00:00,2020-04-29 23:30:55.279836+00:00,2,0.825052,False,unk,37,9


In [9]:
# Run the full normalization chain over the title and advertiser text,
# storing each result in a "<column>_cleaned" companion column.
for source_col in ('title', 'advert_by'):
    df[source_col + '_cleaned'] = (
        df[source_col]
        .apply(basic_clean)
        .apply(tokenize)
        .apply(remove_stopwords)
        .apply(lemmatize)
    )

In [13]:
# Preview the first five rows (head() defaults to 5).
# NOTE(review): the saved output below includes an `observed_ad` column that no visible cell creates,
# and the execution counts jump from 9 to 13 — confirm the cell that builds it is still in the notebook.
df.head()

Unnamed: 0,id,political,not_political,title,message,created_at,updated_at,impressions,political_probability,socio_political_fb,advert_by,known_ad_length,message_length,title_cleaned,advert_by_cleaned,observed_ad
0,23844966472130541,1,0,Stop Republicans,election changing rapidly many democrat terrif...,2020-04-28 18:29:51.893295+00:00,2020-04-28 20:00:18.291895+00:00,1,0.999563,True,Stop Republicans,0,51,stop republican,stop republican,1
1,23844607318170064,0,1,Planned Parenthood,uncertain time shouldnt impact access dependab...,2020-05-03 21:21:17.022063+00:00,2020-05-03 23:00:14.598025+00:00,1,0.42629,True,Planned Parenthood Federation of America,0,11,planned parenthood,planned parenthood federation america,-1
2,23844646560350048,0,0,No Kid Hungry,givingtuesdaya happening may 5th response covi...,2020-05-03 21:50:12.097628+00:00,2020-05-03 23:00:14.717141+00:00,1,0.625368,True,No Kid Hungry,0,35,no kid hungry,no kid hungry,0
3,23844775979230727,1,0,Kirsten Gillibrand,want take back senate retire mitch mcconnell s...,2020-04-27 23:44:32.620687+00:00,2020-04-28 19:48:27.917779+00:00,3,1.0,True,Gillibrand for Senate,0,43,kirsten gillibrand,gillibrand senate,1
4,23844529028560107,0,2,Learn How to Eliminate Discomfort Today -->,neck pain caused several factor effectively tr...,2020-03-28 15:38:36.167560+00:00,2020-04-28 19:52:56.791781+00:00,3,0.701307,False,unk,31,10,learn eliminate discomfort today,unk,-2


In [19]:
# Combine the cleaned message and the cleaned title into one text field, separated by a single space.
message_text = df['message']
title_text = df['title_cleaned']
df['full_ad'] = message_text + " " + title_text
df['full_ad']

0         election changing rapidly many democrat terrif...
1         uncertain time shouldnt impact access dependab...
2         givingtuesdaya happening may 5th response covi...
3         want take back senate retire mitch mcconnell s...
4         neck pain caused several factor effectively tr...
                                ...                        
222181    housing homeless life working people lowincome...
222182    cr ground preventing responding covid19 outbre...
222183    running president one elizabeth favorite thing...
222184    antioch mba feature mutual importance people p...
222185               little desperate together thank reader
Name: full_ad, Length: 222139, dtype: object

In [20]:
# Word count of the combined text, shown next to the message-only word count.
full_ad_words = df['full_ad'].str.split()
df['full_ad_length'] = full_ad_words.str.len()
df.loc[:, ['full_ad_length', 'message_length']]

Unnamed: 0,full_ad_length,message_length
0,53,51
1,13,11
2,38,35
3,45,43
4,14,10
...,...,...
222181,22,19
222182,22,19
222183,44,42
222184,11,9


In [25]:
# Swap the boolean flag for human-readable labels.
political_labels = {True: 'Political', False: 'Not Political'}
df['socio_political_fb'] = df['socio_political_fb'].replace(political_labels)