## Import Statements

In [1]:
import glob
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from collections import Counter
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

## Reading Data in

In [2]:
def time_of_day_creation(x):
    '''
    Maps an hour of the day onto the label of the six-hour window containing it.

    x: the hour to classify; the whole hours 0 through 23 are recognised.

    Returns one of 'created_12am-6am', 'created_6am-12pm', 'created_12pm-6pm'
    or 'created_6pm-12am'. Any value that matches none of those hours
    (for example a missing hour) yields the string 'NaN'.
    '''

    #Each label is paired with the block of hours it covers
    windows = (
        ('created_12am-6am', np.arange(0, 6)),
        ('created_6am-12pm', np.arange(6, 12)),
        ('created_12pm-6pm', np.arange(12, 18)),
        ('created_6pm-12am', np.arange(18, 24)),
    )

    for label, hours in windows:
        if x in hours:
            return label

    #Nothing matched, so fall back to the sentinel string
    return 'NaN'

In [3]:
def calculateRollupPosts(social_media_df, social_media_type, social_media_handle_field, artists_list_dir,
                         start_date='1/1/2016', end_date='6/30/2019'):
    '''
    Adds rolling post counts ('last_7_days' and 'last_30_days') to every post.

    social_media_df: dataframe of posts. Must contain a datetime column named 'created'
        and the column named by social_media_handle_field. It is not modified.
    social_media_type: name of the column in the master artist list CSV that holds the
        handles for this platform (e.g. 'facebook').
    social_media_handle_field: name of the column in social_media_df that holds the handle.
    artists_list_dir: path of the master artist list CSV.
    start_date, end_date: first and last day (inclusive) of the daily calendar the counts
        are computed over. The defaults reproduce the original fixed window.

    Returns a new dataframe with the rows and columns of social_media_df plus
    'last_7_days' and 'last_30_days'. Each holds the number of posts by the same handle
    in the 7 / 30 calendar days ending on (and including) the day of the post.
    Posts whose handle is not in the master list, or whose day falls outside
    start_date..end_date, get NaN in both columns.
    '''
    #One row per calendar day in the requested window
    time_range = pd.DataFrame({'Time Period': pd.date_range(start=start_date, end=end_date, freq='D')})

    #Missing or repeated handles would duplicate post rows in the final merge and
    #inflate the rolling sums, so keep each handle exactly once
    master_artists_list = (
        pd.read_csv(artists_list_dir, usecols=[social_media_type])
        .dropna(subset=[social_media_type])
        .drop_duplicates()
    )

    #Cross join (via a constant key) so that every handle has a row for every day
    artists_time_range = (
        master_artists_list.assign(join_key=0)
        .merge(time_range.assign(join_key=0), how='left', on='join_key')
        .drop(columns='join_key')
    )

    #Work on a copy so the caller's dataframe does not gain a 'Created_Day' column
    posts = social_media_df.assign(Created_Day=social_media_df['created'].dt.floor('d'))

    #Number of posts per handle per day
    df_aggregate = (
        posts.groupby([social_media_handle_field, 'Created_Day'], as_index=False)
        .agg({'created': 'count'})
        .rename(columns={'created': 'Posts In Day'})
    )

    #Attach the daily counts to the full calendar; days without posts count as 0
    df_merged = artists_time_range.merge(
        df_aggregate, how='left',
        left_on=[social_media_type, 'Time Period'],
        right_on=[social_media_handle_field, 'Created_Day'])
    df_merged['Posts In Day'] = df_merged['Posts In Day'].fillna(0)

    #The rolling windows are positional, so rows must be chronological within each handle
    df_merged = df_merged.sort_values([social_media_type, 'Time Period'])

    #transform keeps the original row index, so the results align on assignment
    posts_by_handle = df_merged.groupby(social_media_type)['Posts In Day']
    last_7_days = posts_by_handle.transform(lambda s: s.rolling(7, min_periods=1).sum())
    last_30_days = posts_by_handle.transform(lambda s: s.rolling(30, min_periods=1).sum())
    df_merged = df_merged.assign(last_7_days=last_7_days, last_30_days=last_30_days)
    df_merged = df_merged.drop(columns=[social_media_handle_field, 'Created_Day', 'Posts In Day'])

    #Bring the rolling counts back onto the individual posts
    result = posts.merge(
        df_merged, how='left',
        left_on=[social_media_handle_field, 'Created_Day'],
        right_on=[social_media_type, 'Time Period'])

    return result.drop(columns=[social_media_type, 'Time Period', 'Created_Day'])

In [4]:
def _dummies_with_fixed_columns(series, expected_columns, prefix=None):
    '''
    One-hot encodes series as 0/1 columns and guarantees that every name in
    expected_columns is present, even when that category never occurs in the data.
    Categories outside expected_columns still get their own column.
    '''
    dummies = pd.get_dummies(series, prefix=prefix, dtype='uint8')
    #get_dummies orders its columns alphabetically; keep that order after adding the missing ones
    all_columns = sorted(set(dummies.columns) | set(expected_columns))
    return dummies.reindex(columns=all_columns, fill_value=0).astype('uint8')


def read_and_clean_facebook(facebook_directory, artist_list_directory, artists_to_remove=None):
    '''
    Reads in all Facebook files matching a glob pattern of your choice (facebook_directory) into a pandas dataframe.
    Reads the master artist list in from the (artist_list_directory) of your choice.

    Filters the dataframe to necessary columns.
    Converts columns into appropriate data types.
    Creates new custom columns based on current columns.
    Creates dummy features on categorical columns.

    facebook_directory: glob pattern matching the Facebook CSV exports (e.g. 'FacebookData/*.csv').
    artist_list_directory: path of the master artist list CSV; it must have a 'facebook' column.
    artists_to_remove: optional list of artist handles to exclude from the result.
        When omitted, the artists not used in this version of the music analysis are removed.

    Returns a dataframe that can then be exported, used for Exploratory Data Analysis, modeling, or
    combination with other social media platform data that shares the same format (Twitter, Instagram, Youtube).

    Raises ValueError when no file matches facebook_directory.
    '''

    #Loads the Facebook files (in a deterministic order) and concatenates them into a dataframe
    files = sorted(glob.glob(facebook_directory))
    if not files:
        raise ValueError('No Facebook files found matching: {}'.format(facebook_directory))
    df = pd.concat([pd.read_csv(file, encoding="ISO-8859-1") for file in files], ignore_index=True)

    #Field descriptions are available here: https://github.com/CrowdTangle/API/wiki/Post

    #Reduce the number of features in the dataframe (copy so later assignments do not touch a view)
    selected_columns = ['User Name', 'Page Likes at Posting', 'Created', 'Type',
                        'Likes', 'Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad',
                        'Angry', 'Thankful', 'Message', 'Score']
    df = df[selected_columns].copy()

    #Convert the date of posting to a timezone-naive datetime
    df['Created'] = pd.to_datetime(df['Created']).dt.tz_localize(None)

    #Convert Message Description to a string; a missing message is empty text, not the word 'nan'
    df['Description'] = df['Message'].fillna('').astype(str)

    #Create custom feature that counts the number of characters in a post
    df['description_length'] = df['Description'].str.len()

    #Create custom feature that counts the number of hashtags in a post
    df['hashtag_count'] = df['Description'].str.count('#')

    #Creates a custom feature that calculates the Social Engagement Score for Facebook:
    #total interactions divided by followers. A follower count of 0 gives NaN rather than inf.
    interaction_columns = ['Likes', 'Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad', 'Angry', 'Thankful']
    followers = df['Page Likes at Posting'].replace(0, np.nan)
    df['social_engagement_score'] = df[interaction_columns].sum(axis=1, skipna=False) / followers

    #Collapse the Facebook post types: Status becomes Text, every video variant becomes Video
    video_types = ['Native Video', 'YouTube', 'Video', 'Live Video Complete', 'Vine',
                   'Live Video', 'Live Video Scheduled']
    df.loc[df['Type'] == 'Status', 'Type'] = "Text"
    df.loc[df['Type'].isin(video_types), 'Type'] = "Video"

    #Creates dummy features for the Type column (always the same four columns, as 0/1)
    type_dummies = _dummies_with_fixed_columns(
        df['Type'], ['type_Link', 'type_Photo', 'type_Text', 'type_Video'], prefix='type')
    df = pd.concat([df, type_dummies], axis=1)

    #Creating time of day buckets (e.g. 0-6am, 6-12pm, 12-6, 6-12am)
    df['time_of_day'] = df['Created'].dt.hour.map(time_of_day_creation)

    #Creates dummy features for the time of day column (always the same four columns, as 0/1)
    time_dummies = _dummies_with_fixed_columns(
        df['time_of_day'],
        ['created_12am-6am', 'created_6am-12pm', 'created_12pm-6pm', 'created_6pm-12am'])
    df = pd.concat([df, time_dummies], axis=1)

    #Creates new feature to calculate the time between this post and the previous post, by user
    df = df.sort_values(by=['User Name', 'Created'])
    df['time_since_last_post'] = df.groupby('User Name')['Created'].diff()

    #Dropping columns no longer required
    df = df.drop(columns=['Message', 'Likes', 'Comments', 'Shares', 'Love', 'Wow', 'Haha',
                          'Sad', 'Angry', 'Thankful', 'time_of_day', 'Type'])

    #Rename the columns to align with format used in master dataframe (Youtube+Twitter+Instagram+Facebook)
    df = df.rename(columns={'Description': 'description', 'Score': 'crowdtangle_score',
                            'Page Likes at Posting': 'count_of_followers', 'User Name': 'artist_name',
                            'Created': 'created', 'type_Video': 'type_video', 'type_Photo': 'type_photo',
                            'type_Link': 'type_link', 'type_Text': 'type_text'
                            })

    #Placeholder columns
    df['within_week_release'] = 0
    df['within_month_release'] = 0

    #Creates features to count the number of posts in last 7,30 days
    df = calculateRollupPosts(df, 'facebook', 'artist_name', artist_list_directory)

    #Drops artists that are not used in this version of music analysis
    if artists_to_remove is None:
        artists_to_remove = ['carlosvives', 'mirandalambert', 'bradpaisley', 'thetimmcgraw',
                             'chrisyoungmusic', 'lukecombs', 'willienelsonofficial']
    df = df[~df['artist_name'].isin(artists_to_remove)]

    return df

In [5]:
#Build the cleaned Facebook dataframe from the raw CSV exports and the master artist list
df_out = read_and_clean_facebook(
    facebook_directory='FacebookData/*.csv',
    artist_list_directory='artist_list_directory/master_artists_list.csv',
)

### @Gaurij, you only need to copy code from above for integration into the master file

In [6]:
#Preview the first rows of the cleaned output, transposed so each column is shown as a row
df_out.head().T

Unnamed: 0,0,1,2,3,4
artist_name,21Savage,21Savage,21Savage,21Savage,21Savage
count_of_followers,,,,,
created,2015-05-18 15:24:23,2015-05-25 12:00:00,2015-05-25 18:26:12,2015-08-14 12:00:00,2015-09-05 12:00:00
crowdtangle_score,-354.2,-16.5,-62.3,-12.26,-24.89
description,Slaughter Tape coming 5.25.15,"The Slaughter Tape released May 25, 2015.",s/o The FADER for fuckin wit me http://www.the...,21 Savage with OPB & Slaughter Gang members be...,21 Savage performs during Speakerfoxxx's set a...
description_length,29,41,107,75,100
hashtag_count,0,0,0,0,0
social_engagement_score,,,,,
type_link,0,0,1,0,0
type_photo,0,1,0,1,1


In [7]:
#Saves the cleaned Facebook dataframe to disk as a pickle file.
df_out.to_pickle('facebook_7yrs_cleaned.pkl')

### Test Import of OLD cleaned export (CSV)

In [8]:
#Reads an older cleaned Facebook export from CSV into df_in
df_in = pd.read_csv('TestData/facebook_7yrs_cleaned.csv')

In [17]:
#Preview the first rows of the older export, transposed so each column is shown as a row
df_in.head().T

Unnamed: 0,0,1,2,3,4
fb_handle_artist_name,21Savage,21Savage,21Savage,21Savage,21Savage
created,2015-05-18 15:24:23,2015-05-25 12:00:00,2015-05-25 18:26:12,2015-08-14 12:00:00,2015-09-05 12:00:00
description,Slaughter Tape coming 5.25.15,"The Slaughter Tape released May 25, 2015.",s/o The FADER for fuckin wit me http://www.the...,21 Savage with OPB & Slaughter Gang members be...,21 Savage performs during Speakerfoxxx's set a...
crowdtangle_score,-354.2,-16.5,-62.3,-12.26,-24.89
count_of_followers,,,,,
type_link,0,0,1,0,0
type_photo,0,1,0,1,1
type_text,1,0,0,0,0
type_video,0,0,0,0,0
description_length,29,41,107,75,100
