This notebook combines all the raw social media data and creates a single formatted notebook to be used for further processing. The output of the notebook is a pickle file.

## Import Statements

In [1]:
# Imports
import glob
import pandas as pd
from collections import Counter
import numpy as np
import time
import datetime
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.preprocessing import OneHotEncoder
import warnings
from datetime import datetime, date
warnings.filterwarnings('ignore')

## Helper Functions

In [2]:
def createFileFileLocationDict():
    '''
        This function creates a dictionary that stores the mapping between 
        the file and its location. It assumes the base directory for the file
        path to be same as the current directory from where the python notebook is
        being run
    '''
    filemap = {}
    filemap['master_artist_list'] = './Data/RawData/Artists/master_artists_list.csv'
    filemap['facebook_data'] = './Data/RawData/Facebook/*.csv'
    filemap['twitter_data'] = './Data/RawData/Twitter/*.csv'
    filemap['instagram_data'] = './Data/RawData/Instagram/*.csv'
    filemap['youtube_data'] = './Data/RawData/Youtube/youtube_to_combine.csv'
    filemap['music_brainz_data'] = './Data/RawData/MusicBrainz/musicbrainz_data.csv'
    
    return filemap

In [3]:
def readMasterArtistFile(master_artist_file_location):
    '''
        This function takes the master artist file location
        as a parameter and returns a dataframe with the 
        contents of the master artists file
    '''
    # Read master artists file and store in data frame
    artists = pd.read_csv(master_artist_file_location,encoding="ISO-8859-1")
    
    return artists

In [4]:
def time_of_day_creation(x):
    '''
    Takes in an integer and returns one of four time slots:
    ['12am-6am', '6am-12pm', '12pm-6pm', '6pm-12am']
    '''
    
    slots_out = ['created_12am-6am', 'created_6am-12pm', 'created_12pm-6pm', 'created_6pm-12am']
    slot0 = np.arange(0,6)
    slot1 = np.arange(6,12)
    slot2 = np.arange(12,18)
    slot3 = np.arange(18,24)

    if x in slot0:
        return slots_out[0]
    elif x in slot1:
        return slots_out[1]
    elif x in slot2:
        return slots_out[2]
    elif x in slot3:
        return slots_out[3]
    else:
        return 'NaN'

In [5]:
def calculateRollupPosts(social_media_df,social_media_type, social_media_handle_field, master_artist_file):
    '''
        This function takes 3 parameters:
        1. The dataframe containing raw data of a social platform
        2. The type pf social media account (facebook, twitter, instagram or youtube)
        3. The field name that stores the social media handle of the platform
        
        The function computes the aggregated number of posts by the artist on the platform in the last 7 days and last 30 days
        
        The funcion outputs the modified dataframe with the aggregated number of posts
    '''
    
    time_range = pd.date_range(start='1/1/2016', end='6/30/2019', freq='d').to_frame(index=False)
    time_range.rename(columns={0: 'Time Period'}, inplace=True)
    master_artists_list = pd.read_csv(master_artist_file, usecols=[social_media_type])
    time_range['join_key'] = 0
    master_artists_list['join_key'] = 0
    artists_time_range = master_artists_list.merge(time_range, how='left', on = 'join_key')
    artists_time_range.drop('join_key',1, inplace=True)
    artists_time_range.rename(columns={0: 'Time Period'}, inplace=True)
    social_media_df['Created_Day'] = social_media_df.created.dt.floor('d')
    df_aggregate = social_media_df.groupby([social_media_handle_field, 'Created_Day'], as_index=False).agg({"created": "count"})
    df_aggregate.rename(columns={'created': 'Posts In Day'}, inplace=True)
    df_merged = artists_time_range.merge(df_aggregate, how='left', left_on=[social_media_type,'Time Period'], right_on=[social_media_handle_field,'Created_Day'])
    df_merged['Posts In Day'] = df_merged['Posts In Day'].fillna(0)
    df_merged['last_7_days'] = df_merged[['Posts In Day']].groupby(df_merged[social_media_type]).apply(lambda g: g.rolling(7, min_periods=1).sum())
    df_merged['last_30_days'] = df_merged[['Posts In Day']].groupby(df_merged[social_media_type]).apply(lambda g: g.rolling(30, min_periods=1).sum())
    df_merged.drop(columns=[social_media_handle_field, 'Created_Day','Posts In Day'], inplace=True)
    social_media_df = social_media_df.merge(df_merged, how='left', right_on=[social_media_type,'Time Period'], left_on=[social_media_handle_field,'Created_Day'])
    social_media_df.drop(columns=[social_media_type, 'Time Period','Created_Day'], inplace=True)
    
    return social_media_df

In [6]:
def read_and_clean_facebook(facebook_directory, master_artist_file):
    '''
    Reads in all Facebook files from Data/RawData/Facebook folder into pandas dataframe.
    Reads the master artist list from Data/RawData/Artists folder.
    
    Filters the dataframe to necessary columns. 
    Converts columns into appropriate data types.
    Creates new custom columns based on current columns.
    Creates dummy features on categorical columns.
    
    Returns a dataframe that can then be exported, used for Exploratory Data Analysis, modeling, or 
    combination with other social media platform data that shares the same format (Twitter, Instagram, Youtube). 
    '''
    
    #Loads the Facebook files into a list of files and concatenates into a dataframe
    files = glob.glob(facebook_directory)
    df_list = [pd.read_csv(file, encoding="ISO-8859-1") for file in files]
    df = pd.concat(df_list)
    
    #Field descriptions are available here: https://github.com/CrowdTangle/API/wiki/Post
    
    #Reduce the number of features in the dataframe
    selected_columns = ['User Name', 'Page Likes at Posting', 'Created', 'Type', 
                        'Likes', 'Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad', 
                        'Angry', 'Thankful', 'Message', 'Score']
    df = df[selected_columns]
    
    #Convert the date of posting to datetime
    df['Created'] = pd.to_datetime(df['Created']).dt.tz_localize(None)

    #Convert Message Description to a string
    df['Description'] = df['Message'].astype(str)
    
    #Create custom feature that counts the number of characters in a post
    df['description_length'] = df['Description'].apply(len)
    
    #Create custom feature that counts the number of hashtags in a post
    df['hashtag_count'] = df['Description'].apply(lambda x : x.count('#'))
    
    #Creates a custom feature that calculates the Social Engagement Score for Facebook
    df['social_engagement_score'] = (df['Likes'] + df['Comments'] + df['Shares'] + df['Love'] + df['Wow'] + df['Haha'] + df['Sad'] + df['Angry'] + df['Thankful']) / df['Page Likes at Posting']
    
    #These are the types of Facebook posts
    types = ['Photo', 'Link', 'Status', 'Native Video', 'YouTube', 'Video', 'Live Video Complete', 'Vine', 'Live Video', 'Live Video Scheduled']
    #Combine Photo and Album into one type
    df.loc[df.Type == 'Status', 'Type'] = "Text"
    df.loc[df.Type.isin(['Native Video', 'YouTube', 'Video', 'Live Video Complete', 'Vine', 'Live Video', 'Live Video Scheduled']),  'Type'] = "Video"
    
    #Creates dummy features for the Type column
    df = pd.concat([df,pd.get_dummies(df['Type'],prefix='type')],axis=1)
    
    #Creating time of day buckets (e.g. 0-6am, 6-12pm, 12-6, 6-12am)
    df['time_of_day'] = df['Created'].apply(lambda x : time_of_day_creation(x.hour))
        
    #Creates dummy features for the time of day column
    df = pd.concat([df,pd.get_dummies(df['time_of_day'])],axis=1)
      
    #Creates new feature to calculate the time between this post and the previous post, by user
    df.sort_values(by=['User Name','Created'], inplace=True)
    df['time_since_last_post'] = df.groupby('User Name')['Created'].diff()
    
    #Dropping columns no longer required
    df.drop(['Message', 'Likes', 'Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad', 'Angry', 'Thankful','time_of_day', 'Type'],axis=1,inplace=True)    
    
    #Rename the columns to align with format used in master dataframe (Youtube+Twitter+Instagram+Facebook)
    df.rename(columns={'Description':'description', 'Score': 'crowdtangle_score',
                           'Page Likes at Posting': 'count_of_followers', 'User Name':'artist_name',
                           'Created': 'created', 'type_Video': 'type_video','type_Photo':'type_photo',
                           'type_Link':'type_link', 'type_Text':'type_text'
                           },inplace=True)
    
    #Placeholder columns
    df['within_week_release'] = 0 
    df['within_month_release'] = 0 
    
    # Columns to indicate type of social platform
    df['Facebook'] = 1
    df['Instagram'] = 0
    df['Twitter'] = 0
    df['Youtube'] = 0
    
    #Creates features to count the number of posts in last 7,30 days 
    df = calculateRollupPosts(df,'facebook','artist_name',master_artist_file)
    
    #Drops artists that are not used in this version of music analysis
    artists_to_remove = ['carlosvives', 'mirandalambert', 'bradpaisley', 'thetimmcgraw',
                     'chrisyoungmusic', 'lukecombs', 'willienelsonofficial']
    df = df[~df['artist_name'].isin(artists_to_remove)]
    
    df.reset_index(inplace=True, drop=True)
    
    return df

In [7]:
def read_and_clean_instagram(instagram_directory, master_artist_file):
    '''
    Reads in all Instagram files from Data/RawData/Instagram into a pandas dataframe.
    Reads the master artist list from Data/RawData/Artists folder
    
    Filters the dataframe to necessary columns. 
    Converts columns into appropriate data types.
    Creates new custom columns based on current columns.
    Creates dummy features on categorical columns.
    
    Returns a dataframe that can then be exported, used for Exploratory Data Analysis, modeling, or 
    combination with other social media platform data that shares the same format (Twitter, Facebook, Youtube). 
    '''
    
    #Loads the Instagram files into a list of files and concatenates into a dataframe
    files = glob.glob(instagram_directory)
    df_list = [pd.read_csv(file,encoding="ISO-8859-1") for file in files]
    df = pd.concat(df_list)
    
    #Field descriptions are available here: https://github.com/CrowdTangle/API/wiki/Post
    
    #Reduce the number of features in the dataframe
    selected_columns = ['User Name', 'Followers at Posting', 'Created', 'Type', 
                        'Likes', 'Comments', 'Views', 'Description','Score']
    df = df[selected_columns]
    
    #Convert the date of posting to datetime
    df['Created'] = pd.to_datetime(df['Created']).dt.tz_localize(None)
    
    #Convert Description to a string
    df['Description'] = df['Description'].astype(str)
    
    #Create custom feature that counts the number of characters in a post
    df['description_length'] = df['Description'].apply(len)
    
    #Create custom feature that counts the number of hashtags in a post
    df['hashtag_count'] = df['Description'].apply(lambda x : x.count('#'))
    
    #Creates a custom feature that calculates the Social Engagement Score for Instagram
    df['social_engagement_score'] = (df['Likes'] + df['Comments']) / df['Followers at Posting']
    
    #These are the three types of Instagram posts
    types = ['Photo', 'Video', 'Album']
    #Combine Photo and Album into one type
    df.loc[df.Type == 'Album', 'Type'] = "Photo"
    
    #Creates dummy features for the Type column
    df = pd.concat([df,pd.get_dummies(df['Type'],prefix='type')],axis=1)
    
    #Creating time of day buckets (e.g. 0-6am, 6-12pm, 12-6, 6-12am)
    df['time_of_day'] = df['Created'].apply(lambda x : time_of_day_creation(x.hour))
        
    #Creates dummy features for the time of day column
    df = pd.concat([df,pd.get_dummies(df['time_of_day'])],axis=1)
      
    #Creates new feature to calculate the time between this post and the previous post, by user
    df.sort_values(by=['User Name','Created'], inplace=True)
    df['time_since_last_post'] = df.groupby('User Name')['Created'].diff()
    
    #Dropping columns no longer required
    df.drop(['Likes','Comments','Views','time_of_day','Type'],axis=1,inplace=True)    
    
    #Rename the columns to align with format used in master dataframe (Youtube+Twitter+Instagram+Facebook)
    df.rename(columns={'Description':'description', 'Score': 'crowdtangle_score',
                           'Followers at Posting': 'count_of_followers', 'User Name':'artist_name',
                           'Created': 'created', 'type_Video': 'type_video','type_Photo':'type_photo'
                           },inplace=True)
    
    #Placeholder columns
    df['type_text'] = 0
    df['type_link'] = 0
    df['within_week_release'] = 0 
    df['within_month_release'] = 0 
    
    # Columns to indicate type of social platform
    df['Facebook'] = 0
    df['Instagram'] = 1
    df['Twitter'] = 0
    df['Youtube'] = 0
    
    #Creates features to count the number of posts in last 7,30 days 
    df = calculateRollupPosts(df,'instagram','artist_name',master_artist_file)
    
    #Drops artists that are not used in this version of music analysis
    artists_to_remove = ['carlosvives', 'mirandalambert', 'bradpaisley', 'thetimmcgraw',
                     'chrisyoungmusic', 'lukecombs', 'willienelsonofficial']
    df = df[~df['artist_name'].isin(artists_to_remove)]
    
    df.reset_index(inplace=True, drop=True)
    
    return df

In [8]:
def read_and_clean_twitter(twitter_directory, master_artist_file):
    '''
    Reads in all Twitter files from Data/RawData/Twitter into a pandas dataframe.
    Reads the master artist list from Data/RawData/Artists folder
    
    Filters the dataframe to necessary columns. 
    Converts columns into appropriate data types.
    Creates new custom columns based on current columns.
    Creates dummy features on categorical columns.
    
    Returns a dataframe that can then be exported, used for Exploratory Data Analysis, modeling, or 
    combination with other social media platform data that shares the same format (Twitter, Facebook, Youtube). 
    '''
    
    #Loads the Instagram files into a list of files and concatenates into a dataframe
    files = glob.glob(twitter_directory)
    df_list = [pd.read_csv(file,encoding="ISO-8859-1") for file in files]
    df = pd.concat(df_list)
    
    #Field descriptions are available here: https://github.com/CrowdTangle/API/wiki/Post
    
    #Reduce the number of features in the dataframe
    selected_columns = ['User Name', 'Followers at Posting', 'Created', 'Type', 
                        'Likes', 'Retweets','Message','Score']
    df = df[selected_columns]
    df = df.dropna()
    df = df.reset_index(drop = True)
    
    #Convert the date of posting to datetime
    df['Created'] = pd.to_datetime(df['Created']).dt.tz_localize(None)
    
    #Reset the index
    df.reset_index(inplace=True)
    
    #Number of characters in message custom feature
    df['message_length'] = df['Message'].apply(len)
    
    #Number of hashtags custom feature
    df['hashtag_count'] = df.apply(lambda x: x['Message'].count('#'), axis=1)
    
    # Reclassify some of the post types
    df.loc[df['Type'] == 'Vine', 'Type'] = 'Video'
    df.loc[df['Type'] == 'Native Video', 'Type'] = 'Video'
    
    #OHE for Type Field
    df = pd.concat([df,pd.get_dummies(df['Type'],prefix='type')],axis=1)
    
    #Creating time of day buckets (e.g. 0-6am, 6-12pm, 12-6, 6-12am)
    df['time_of_day'] = df['Created'].apply(lambda x : time_of_day_creation(x.hour))

    #OHE for time of day field
    df = pd.concat([df,pd.get_dummies(df['time_of_day'],prefix='tod')],axis=1)

    df.sort_values(by=['User Name','Created'], inplace=True)
    
    #Time between prior and current post
    df['time_since_last_post'] = df.groupby('User Name')['Created'].diff()
    
    df['within_week_release'] = 0
    df['within_month_release'] = 0
    
    # Columns to indicate type of social platform
    df['Facebook'] = 0
    df['Instagram'] = 0
    df['Twitter'] = 1
    df['Youtube'] = 0
    
    # Calculating social engagement score
    df['social_engagement_score'] = df.apply(lambda x: (x['Likes'] + x['Retweets']) / x['Followers at Posting'], axis=1)
    
    df = df.drop(['Type','Likes', 'Retweets','time_of_day','index'], axis=1)
    
    # Rename columns
    df.rename(columns={"Created": "created",
                            "Message": "description",
                            "Score":"crowdtangle_score", 
                            "Followers at Posting" : "count_of_followers",
                            "tod_12am-6am" : "created_12am-6am",
                            "tod_6am-12pm" : "created_6am-12pm",
                            "tod_12pm-6pm" : "created_12pm-6pm",
                            "tod_6pm-12am" : "created_6pm-12am",
                            "User Name" : "artist_name",
                            "message_length":"description_length",
                            "type_Photo":"type_photo",
                            "type_Tweet":"type_text",
                            "type_Video":"type_video",
                            "type_Link":"type_link"
                           }, inplace=True)
    
    # Calculate number of posts by artist on the platform in last 7 and 30 days
    df = calculateRollupPosts(df,'twitter','artist_name',master_artist_file)
    
    #Removing the 7 artists
    artists_to_remove = ['carlosvives', 'mirandalambert', 'BradPaisley', 'TheTimMcGraw',
                     'ChrisYoungMusic', 'lukecombs', 'WillieNelson']
    df = df[~df['artist_name'].isin(artists_to_remove)]
    
    df.reset_index(inplace=True, drop=True)
    
    return df

In [9]:
def read_and_clean_youtube(youtube_directory):
    '''
        This function reads the cleaned version of the youtube file
        and computes the social enagagement score for Youtube
    '''
    
    youtube_df = pd.read_csv(youtube_directory,encoding="ISO-8859-1")
    youtube_df = youtube_df.drop("Unnamed: 0",axis=1)
    youtube_df = youtube_df.drop("Unnamed: 0.1",axis=1)
    youtube_df = youtube_df.drop("hour",axis=1)
    
    youtube_df.rename(columns={"artist_name_x":"artist_name"}, inplace=True)
    
    # Columns to indicate type of social platform
    youtube_df['Facebook'] = 0
    youtube_df['Instagram'] = 0
    youtube_df['Twitter'] = 0
    youtube_df['Youtube'] = 1
    
    #Removing the 7 artists
    artists_to_remove = ['carlosvives', 'mirandalambert', 'BradPaisley', 'TheTimMcGraw',
                     'ChrisYoungMusic', 'lukecombs', 'WillieNelson']
    youtube_df = youtube_df[~youtube_df['artist_name'].isin(artists_to_remove)]
    
    youtube_df.reset_index(inplace=True, drop=True)
    
    return youtube_df

In [10]:
# Get the social platform name for each dataframe row
def getSocialPlatform(x):
    '''
        Function to return the social platform of the post
    '''
    if x['Facebook'] == 1:
        return 'Facebook'
    elif x['Instagram'] == 1:
        return 'Instagram'
    elif x['Twitter'] == 1:
        return 'Twitter'
    else:
        return 'Youtube'

In [11]:
from datetime import datetime
def days_between(d1, d2):
    '''
        Function to find days between 2 datetimes
    '''
    d1 = datetime.strptime(d1, "%Y-%m-%d")
    d2 = datetime.strptime(d2, "%Y-%m-%d")
    return abs((d2 - d1).days)

In [12]:
# Function to find the minimum difference of days w.r.t. all release dates of an artist
def find_min_diff(artist_name,created,artist_dict):
    '''
        Function to find the minimum difference of days w.r.t. all release dates of an artist
    '''
    lst = artist_dict[artist_name]
    min_days = min([days_between(x.strftime('%Y-%m-%d'),created.strftime('%Y-%m-%d')) for x in lst])
    return min_days

In [13]:
def calculate_youtube_score(df):
    '''
        Calculates the social engagement score for YouTube
        This function takes the combined social data dataframe
        and updates the social engagement field for YouTube data
    '''
    df = df.sort_values(by=['artist_name_x','created'])
    for i in range(1,len(df)):
        if (i+1)%200==0:
            print(str(i+1)+ ' has been completed!')
        facebook = 0
        fflag = False
        twitter = 0
        tflag = False
        ins = 0
        insflag = False
        if df['Youtube'][i] == 1:
            for j in range(i-1,-1,-1):
                if df['artist_name_x'][i]!=df['artist_name_x'][j]:
                    break
                if fflag & tflag & insflag:
                    break
                if (not fflag) & (df['Facebook'][j]==1):
                    fflag=True
                    facebook = df["count_of_followers"][j]
                if (not tflag) & (df['Twitter'][j]==1):
                    tflag=True
                    twitter = df["count_of_followers"][j]
                if (not insflag) & (df['Instagram'][j]==1):
                    insflag=True
                    ins = df["count_of_followers"][j]
            cal = np.max([facebook,twitter,ins])
            if np.isnan(np.max([facebook,twitter,ins])):
                cal = 0
            if cal != 0:
                print('Calculated: '+ str(cal))
            df['count_of_followers'][i]=cal
            try:
                df['social_engagement_score'][i]=df['social_engagement_score'][i]/df['count_of_followers'][i]
            except:
                df['social_engagement_score'][i] = 0
    return df

## Main Function

In [14]:
from datetime import datetime
def createCombinedSocialMediaFile():
    '''
        This is the main function of the notebook that creates a combined
        social media data pickle file
    '''
    
    print("Starting the process to combine the social media data...")
    
    # Create a dictionary to store file names and their locations
    file_file_location_dict = createFileFileLocationDict()
    
    print("Reading master artists file...")
    
    # Read the master artists file
    artists = readMasterArtistFile(file_file_location_dict['master_artist_list'])
    
    print("Reading and cleaning Facebook Data...")
    # Read and clean facebook data and then merge it with master artist file
    fb_data = read_and_clean_facebook(file_file_location_dict['facebook_data'], file_file_location_dict['master_artist_list'])
    fb_data = fb_data.merge(artists, left_on='artist_name', right_on='facebook')
    print('Number of facebook records' + str(fb_data.shape))
    
    print("Reading and cleaning Instagram Data...")
    # Read and clean instagram data and then merge it with master artist file
    instagram_data = read_and_clean_instagram(file_file_location_dict['instagram_data'], file_file_location_dict['master_artist_list'])
    instagram_data = instagram_data.merge(artists, left_on='artist_name', right_on='instagram')
    print('Number of instagram records' + str(instagram_data.shape))
    
    print("Reading and cleaning Twitter Data...")
    # Read and clean twitter data and then merge it with master artist file
    twitter_data = read_and_clean_twitter(file_file_location_dict['twitter_data'], file_file_location_dict['master_artist_list'])
    twitter_data = twitter_data.merge(artists, left_on='artist_name', right_on='twitter')
    print('Number of twitter records' + str(twitter_data.shape))
    
    print("Reading and cleaning Youtube Data...")
    # Read and clean youtube data and then merge it with combined social data dataframe
    youtube_data = read_and_clean_youtube(file_file_location_dict['youtube_data']) 
    youtube_data = youtube_data.merge(artists, left_on='artist_name', right_on='youtube')
    print('Number of youtube records' + str(youtube_data.shape))
    
    print("Combining Social Media Data...")
    # Combine all the data
    df = pd.concat([fb_data,instagram_data,twitter_data,youtube_data])
    
    # Remove records prior to 2016
    start_date = pd.Timestamp('2016-01-01')
    end_date = pd.Timestamp('2019-06-30')
    df['created'] = pd.to_datetime(df['created'])
    df = df[df['created'].between(start_date, end_date)]
    print('Number of records in dataframe :' + str(df.shape[0]))
    
    # Reset index
    df.reset_index(inplace=True, drop=True)
    
    print("Calculating Social Engagement Score for YouTube")
    # calculate youtube social engagement score here
    df = calculate_youtube_score(df)
    
    # Create social platform column
    df['social_platform'] = df.apply(lambda x: getSocialPlatform(x), axis = 1)
    
    # Filtering outliers in FB data
    df = df[~((df['Facebook'] == 1) & (df['social_engagement_score'] > 1))]
    
    print("Integrating MusicBrainz Data...")
    # Get the mean & standard deviation of social engagement scores per platform and store the results in 3 dictionaries
    dict_platform_mean = df.groupby('social_platform')['social_engagement_score'].mean().to_dict()
    dict_platform_std = df.groupby('social_platform')['social_engagement_score'].std().to_dict()
    
    # Normalize the social engagement scores
    df['normalized_social_engagement_score'] = df.apply(lambda x: (x['social_engagement_score'] - dict_platform_mean[x['social_platform']]) / dict_platform_std[x['social_platform']],axis=1)
    
    # Reading music Brainz data and removing unwanted column
    df_musicbrainz = pd.read_csv(file_file_location_dict['music_brainz_data'],encoding="ISO-8859-1")
    df_musicbrainz = df_musicbrainz.drop("Unnamed: 0",axis=1)
    
    # Converting the type of release_date to datettime for date comparisons
    df_musicbrainz['release_date'] = pd.to_datetime(df_musicbrainz['release_date'])
    
    # Filter the release dates before Jan 1st 2006
    df_musicbrainz = df_musicbrainz[df_musicbrainz['release_date'] > date(2006,1,1)]
    
    # Convert master data 'created' column to datetime for date comparisons
    df['created'] = pd.to_datetime(df['created']).dt.tz_localize(None)
    
    # Filter records prior to 2016
    df = df[df['created'] > date(2006,1,1)]

    artist_dict = df_musicbrainz.groupby('artist_name')['release_date'].apply(list).to_dict()
    
    print("Computing Number of Days To Track Release...")
    df['num_days_to_track_release'] = df.apply(lambda x: find_min_diff(x['artist_name_y'],x['created'],artist_dict),axis=1)
    
    return df

In [15]:
df = createCombinedSocialMediaFile()

Starting the process to combine the social media data...
Reading master artists file...
Reading and cleaning Facebook Data...
Number of facebook records(311188, 39)
Reading and cleaning Instagram Data...
Number of instagram records(425000, 39)
Reading and cleaning Twitter Data...
Number of twitter records(324396, 39)
Reading and cleaning Youtube Data...
Number of youtube records(34536, 39)
Combining Social Media Data...
Number of records in dataframe :693237
Calculating Social Engagement Score for YouTube
200 has been completed!
400 has been completed!
600 has been completed!
800 has been completed!
1000 has been completed!
1200 has been completed!
1400 has been completed!
1600 has been completed!
1800 has been completed!
2000 has been completed!
2200 has been completed!
2400 has been completed!
2600 has been completed!
2800 has been completed!
3000 has been completed!
3200 has been completed!
3400 has been completed!
3600 has been completed!
3800 has been completed!
4000 has been comp

179400 has been completed!
179600 has been completed!
179800 has been completed!
180000 has been completed!
180200 has been completed!
180400 has been completed!
180600 has been completed!
180800 has been completed!
181000 has been completed!
181200 has been completed!
181400 has been completed!
181600 has been completed!
181800 has been completed!
182000 has been completed!
182200 has been completed!
182400 has been completed!
182600 has been completed!
182800 has been completed!
183000 has been completed!
183200 has been completed!
183400 has been completed!
183600 has been completed!
183800 has been completed!
184000 has been completed!
184200 has been completed!
184400 has been completed!
184600 has been completed!
184800 has been completed!
185000 has been completed!
185200 has been completed!
185400 has been completed!
185600 has been completed!
185800 has been completed!
186000 has been completed!
186200 has been completed!
186400 has been completed!
186600 has been completed!
1

378200 has been completed!
378400 has been completed!
378600 has been completed!
378800 has been completed!
379000 has been completed!
379200 has been completed!
379400 has been completed!
379600 has been completed!
379800 has been completed!
380000 has been completed!
380200 has been completed!
380400 has been completed!
380600 has been completed!
380800 has been completed!
381000 has been completed!
381200 has been completed!
381400 has been completed!
381600 has been completed!
381800 has been completed!
382000 has been completed!
382200 has been completed!
382400 has been completed!
382600 has been completed!
382800 has been completed!
383000 has been completed!
383200 has been completed!
383400 has been completed!
383600 has been completed!
383800 has been completed!
384000 has been completed!
384200 has been completed!
384400 has been completed!
384600 has been completed!
384800 has been completed!
385000 has been completed!
385200 has been completed!
385400 has been completed!
3

479000 has been completed!
479200 has been completed!
479400 has been completed!
479600 has been completed!
479800 has been completed!
480000 has been completed!
480200 has been completed!
480400 has been completed!
480600 has been completed!
480800 has been completed!
481000 has been completed!
481200 has been completed!
481400 has been completed!
481600 has been completed!
481800 has been completed!
482000 has been completed!
482200 has been completed!
482400 has been completed!
482600 has been completed!
482800 has been completed!
483000 has been completed!
483200 has been completed!
483400 has been completed!
483600 has been completed!
483800 has been completed!
484000 has been completed!
484200 has been completed!
484400 has been completed!
484600 has been completed!
484800 has been completed!
485000 has been completed!
485200 has been completed!
485400 has been completed!
485600 has been completed!
485800 has been completed!
486000 has been completed!
486200 has been completed!
4

674800 has been completed!
675000 has been completed!
675200 has been completed!
675400 has been completed!
675600 has been completed!
675800 has been completed!
676000 has been completed!
676200 has been completed!
676400 has been completed!
676600 has been completed!
676800 has been completed!
677000 has been completed!
677200 has been completed!
677400 has been completed!
677600 has been completed!
677800 has been completed!
678000 has been completed!
678200 has been completed!
678400 has been completed!
678600 has been completed!
678800 has been completed!
679000 has been completed!
679200 has been completed!
679400 has been completed!
679600 has been completed!
679800 has been completed!
680000 has been completed!
680200 has been completed!
680400 has been completed!
680600 has been completed!
680800 has been completed!
681000 has been completed!
681200 has been completed!
681400 has been completed!
681600 has been completed!
681800 has been completed!
682000 has been completed!
6

In [16]:
df.to_pickle('./Data/Output/Combined_Social_Data.pkl')