## Import Statements

In [5]:
import glob
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from collections import Counter
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

## Reading Data in

In [12]:
def time_of_day_creation(x):
    '''
    Takes in an integer and returns one of four time slots:
    ['12am-6am', '6am-12pm', '12pm-6pm', '6pm-12am']
    '''
    
    slots_out = ['created_12am-6am', 'created_6am-12pm', 'created_12pm-6pm', 'created_6pm-12am']
    slot0 = np.arange(0,6)
    slot1 = np.arange(6,12)
    slot2 = np.arange(12,18)
    slot3 = np.arange(18,24)

    if x in slot0:
        return slots_out[0]
    elif x in slot1:
        return slots_out[1]
    elif x in slot2:
        return slots_out[2]
    elif x in slot3:
        return slots_out[3]
    else:
        return 'NaN'

In [17]:
def calculateRollupPosts(social_media_df,social_media_type, social_media_handle_field, artists_list_dir):
    time_range = pd.date_range(start='1/1/2016', end='6/30/2019', freq='d').to_frame(index=False)
    time_range.rename(columns={0: 'Time Period'}, inplace=True)
    master_artists_list = pd.read_csv(artists_list_dir, usecols=[social_media_type])
    time_range['join_key'] = 0
    master_artists_list['join_key'] = 0
    artists_time_range = master_artists_list.merge(time_range, how='left', on = 'join_key')
    artists_time_range.drop('join_key',1, inplace=True)
    artists_time_range.rename(columns={0: 'Time Period'}, inplace=True)
    social_media_df['Created_Day'] = social_media_df.created.dt.floor('d')
    df_aggregate = social_media_df.groupby([social_media_handle_field, 'Created_Day'], as_index=False).agg({"created": "count"})
    df_aggregate.rename(columns={'created': 'Posts In Day'}, inplace=True)
    df_merged = artists_time_range.merge(df_aggregate, how='left', left_on=[social_media_type,'Time Period'], right_on=[social_media_handle_field,'Created_Day'])
    df_merged['Posts In Day'] = df_merged['Posts In Day'].fillna(0)
    df_merged['last_7_days'] = df_merged[['Posts In Day']].groupby(df_merged[social_media_type]).apply(lambda g: g.rolling(7, min_periods=1).sum())
    df_merged['last_30_days'] = df_merged[['Posts In Day']].groupby(df_merged[social_media_type]).apply(lambda g: g.rolling(30, min_periods=1).sum())
    df_merged.drop(columns=[social_media_handle_field, 'Created_Day','Posts In Day'], inplace=True)
    social_media_df = social_media_df.merge(df_merged, how='left', right_on=[social_media_type,'Time Period'], left_on=[social_media_handle_field,'Created_Day'])
    social_media_df.drop(columns=[social_media_type, 'Time Period','Created_Day'], inplace=True)
    
    return social_media_df

In [18]:
def read_and_clean_instagram(instagram_directory, artist_list_directory):
    '''
    Reads in all Instagram files from a directory of your choice (instagram_directory) into a pandas dataframe.
    Reads the master artist list in from the (artist_list_directory) of your choice.
    
    Filters the dataframe to necessary columns. 
    Converts columns into appropriate data types.
    Creates new custom columns based on current columns.
    Creates dummy features on categorical columns.
    
    Returns a dataframe that can then be exported, used for Exploratory Data Analysis, modeling, or 
    combination with other social media platform data that shares the same format (Twitter, Facebook, Youtube). 
    '''
    
    #Loads the Instagram files into a list of files and concatenates into a dataframe
    files = glob.glob(instagram_directory)
    df_list = [pd.read_csv(file) for file in files]
    df = pd.concat(df_list)
    
    #Field descriptions are available here: https://github.com/CrowdTangle/API/wiki/Post
    
    #Reduce the number of features in the dataframe
    selected_columns = ['User Name', 'Followers at Posting', 'Created', 'Type', 
                        'Likes', 'Comments', 'Views', 'Description','Score']
    df = df[selected_columns]
    
    #Convert the date of posting to datetime
    df['Created'] = pd.to_datetime(df['Created'])
    
    #Reset the index
    df.reset_index(inplace=True)
    
    #Convert Description to a string
    df['Description'] = df['Description'].astype(str)
    
    #Create custom feature that counts the number of characters in a post
    df['description_length'] = df['Description'].apply(len)
    
    #Create custom feature that counts the number of hashtags in a post
    df['hashtag_count'] = df['Description'].apply(lambda x : x.count('#'))
    
    #Creates a custom feature that calculates the Social Engagement Score for Instagram
    df['social_engagement_score'] = (df['Likes'] + df['Comments']) / df['Followers at Posting']
    
    #These are the three types of Instagram posts
    types = ['Photo', 'Video', 'Album']
    #Combine Photo and Album into one type
    df.loc[df.Type == 'Album', 'Type'] = "Photo"
    
    #Creates dummy features for the Type column
    df = pd.concat([df,pd.get_dummies(df['Type'],prefix='type')],axis=1)
    
    #Creating time of day buckets (e.g. 0-6am, 6-12pm, 12-6, 6-12am)
    df['time_of_day'] = df['Created'].apply(lambda x : time_of_day_creation(x.hour))
        
    #Creates dummy features for the time of day column
    df = pd.concat([df,pd.get_dummies(df['time_of_day'])],axis=1)
      
    #Creates new feature to calculate the time between this post and the previous post, by user
    df.sort_values(by=['User Name','Created'], inplace=True)
    df['time_since_last_post'] = df.groupby('User Name')['Created'].diff()
    
    #Dropping columns no longer required
    df.drop(['Likes','Comments','Views','time_of_day','index', 'Type'],axis=1,inplace=True)    
    
    #Rename the columns to align with format used in master dataframe (Youtube+Twitter+Instagram+Facebook)
    df.rename(columns={'Description':'description', 'Score': 'crowdtangle_score',
                           'Followers at Posting': 'count_of_followers', 'User Name':'artist_name',
                           'Created': 'created', 'type_Video': 'type_video','type_Photo':'type_photo'
                           },inplace=True)
    
    #Placeholder columns
    df['type_text'] = 0
    df['type_link'] = 0
    df['within_week_release'] = 0 
    df['within_month_release'] = 0 
    
    #Creates features to count the number of posts in last 7,30 days 
    df = calculateRollupPosts(df,'instagram','artist_name',artist_list_directory)
    
    #Drops artists that are not used in this version of music analysis
    artists_to_remove = ['carlosvives', 'mirandalambert', 'bradpaisley', 'thetimmcgraw',
                     'chrisyoungmusic', 'lukecombs', 'willienelsonofficial']
    df = df[~df['artist_name'].isin(artists_to_remove)]
    
    
    return df

In [20]:
df_out = read_and_clean_instagram('../InstagramData/*.csv', '../July8Data/master_artists_list.csv')

### @Gaurij, you only need to copy code from above for integration into the master file

In [26]:
df_out.head().T

Unnamed: 0,0,1,2,3,4
artist_name,21savage,21savage,21savage,21savage,21savage
count_of_followers,,,,,
created,2012-07-06 03:24:54,2012-08-04 05:05:59,2012-08-11 23:56:19,2012-08-11 23:58:53,2012-08-14 16:10:59
description,,,,Me n lil chuck,Free my right hand
crowdtangle_score,-254.65,-274.88,-405.73,-535.8,-572.5
description_length,3,3,3,14,18
hashtag_count,0,0,0,0,0
social_engagement_score,,,,,
type_photo,1,1,1,1,1
type_video,0,0,0,0,0


In [39]:
#Returns the file output as a pickle file.
df_out.to_pickle('instagram_7yrs_cleaned.pkl')

### Test Import of OLD pickle

In [29]:
df_in = pd.read_pickle('../July8Data/instagram_7yrs_cleaned.pkl')

In [30]:
df_in.head().T

Unnamed: 0,0,1,2,3,4
artist_name,21savage,21savage,21savage,21savage,21savage
count_of_followers,,,,,
created,2012-07-06 03:24:54,2012-08-04 05:05:59,2012-08-11 23:56:19,2012-08-11 23:58:53,2012-08-14 16:10:59
description,,,,Me n lil chuck,Free my right hand
crowdtangle_score,-254.65,-274.88,-405.73,-535.8,-572.5
description_length,3,3,3,14,18
hashtag_count,0,0,0,0,0
social_engagement_score,,,,,
type_photo,1,1,1,1,1
type_video,0,0,0,0,0
