In [1]:
#Import statements
import json
import csv
import pandas as pd
import numpy as np
import re
from pathlib import Path
import gc

In [2]:
#Locate the folder that holds the input data. We assume the JSON file we need sits in a
#separate folder labelled "data", one level above the folder the notebook is run from.
rel_path = '../data/'
cwd = Path.cwd()
mod_path = Path('Full_Dataset_Normalizer.ipynb').parent #A bare file name has '.' as its parent
src_path_1 = mod_path.joinpath(rel_path).resolve() #Absolute path of the data folder

In [3]:
#Universal Variables
#Four independent, initially empty sets that collect the distinct names seen in the data:
#  imagetags    - all image tags found by the API
#  hashtags     - all hashtags used by all the photos
#  imageobjects - all the objects found by the API
#  imagecolors  - all the top 3 colors found by the API
imagetags, hashtags, imageobjects, imagecolors = (set() for _ in range(4))
#Key columns that every per-feature table starts with
fieldnames = [
    'PageName', 'PostID', 'PostTime', 'Height',
    'Width', 'Followers', 'Comments', 'Likes',
]

In [4]:
def Column_Headers(imagetags, hashtags, imageobjects, imagecolors, json_path=None):
    """Load the photo JSON file and collect every distinct feature name it contains.

    The four sets passed in are filled IN PLACE; they later become the column
    headers of the per-feature tables.

    Parameters
    ----------
    imagetags : set
        Receives the 'value' of every entry under metadata['imageTags'].
    hashtags : set
        Receives every entry under metadata['hashtags'].
    imageobjects : set
        Receives the 'name' of every entry under metadata['imageObjects'].
    imagecolors : set
        Receives the 'value' of every entry under metadata['imageColors'].
    json_path : str or pathlib.Path, optional
        File to read. When omitted, 'cortex-travel-industry-metadata-updated.json'
        inside the module-level data folder ``src_path_1`` is used.

    Returns
    -------
    The parsed JSON document (iterated here as a sequence of photo dicts).

    Raises
    ------
    KeyError
        If a photo has no 'metadata' entry, or a tag/object/color entry lacks
        the key listed above.
    """
    if json_path is None:
        #Path "/" joins with the right separator on every OS (the old '\\' only worked on Windows)
        json_path = src_path_1 / 'cortex-travel-industry-metadata-updated.json'
    with open(json_path, mode='r', encoding='utf-8') as f:
        #json.loads(..., encoding=...) raises TypeError on Python 3.9+; the file handle already decodes UTF-8
        data = json.load(f)
    #Get all the column header names from the data
    for photo in data: #Loop through every photo in the data set
        md = photo['metadata'] #The photo's metadata
        #"or ()" treats a missing key and an explicit null the same way: nothing to add
        imagetags.update(tag['value'] for tag in md.get('imageTags') or ())
        hashtags.update(md.get('hashtags') or ())
        imageobjects.update(obj['name'] for obj in md.get('imageObjects') or ())
        imagecolors.update(color['value'] for color in md.get('imageColors') or ())
    return data

In [None]:
def Image_Tag_Data(fieldnames, data, imagetags):
    """Build one row per photo with a 0/1 indicator column for each image tag.

    Parameters
    ----------
    fieldnames : list of str
        Names of the 8 key columns (not modified).
    data : iterable of dict
        Photo records; each needs 'pageName', 'extPostId', 'extCreatedAt',
        'nFollowers', 'nComments', 'nLikes' and a 'metadata' dict holding
        'imgHeight' and 'imgWidth'. metadata['imageTags'] is optional.
    imagetags : set of str
        Every tag name that should get a '<tag>_tag' column.

    Returns
    -------
    pandas.DataFrame
        The 8 key columns followed by the int indicator columns of the tags
        that occur in at least 10 photos. We use binary data instead of the
        tag's confidence score.
    """
    tag_names = list(imagetags) #Freeze one iteration order for both the headers and the row values
    fn = fieldnames.copy() #Copy the common field names
    #Add each tag into the fieldnames, make sure it is labelled as an image tag
    fn.extend(tag + '_tag' for tag in tag_names)
    rows = []
    for photo in data: #Get the data out with the associated column name
        md = photo['metadata'] #The photo's metadata
        present = {tag['value'] for tag in md.get('imageTags') or ()} #Tags on this photo
        photodata = [
            photo['pageName'], photo['extPostId'], photo['extCreatedAt'],
            md['imgHeight'], md['imgWidth'],
            photo['nFollowers'], photo['nComments'], photo['nLikes'],
        ]
        photodata.extend(1 if tag in present else '0' for tag in tag_names)
        #np.array coerces the mixed row to strings, exactly as the row-by-row version did;
        #the key columns therefore keep the same (string) type in every feature table,
        #which the later merges on those columns rely on
        rows.append(np.array(photodata).tolist())
    #Build the frame once: DataFrame.append was removed in pandas 2.0 and was quadratic anyway
    DF = pd.DataFrame(rows, columns=fn)
    #Setting the type on the variables in the data frame
    DF1 = DF.iloc[:, 0:8]
    DF2 = DF.iloc[:, 8:].astype(float).round(decimals=0).astype(int)
    #Keep only the tags that show up at least 10 times (boolean mask instead of
    #positional look-ups into a label-indexed Series)
    keep = (DF2.sum(axis=0) >= 10).to_numpy()
    DF2 = DF2.loc[:, keep]
    return pd.concat([DF1, DF2], axis=1)

In [None]:
def Hashtag_Data(fieldnames, data, hashtags):
    """Build one row per photo with a 0/1 indicator column for each hashtag.

    Same layout as the image-tag table, but driven by metadata['hashtags'].

    Parameters
    ----------
    fieldnames : list of str
        Names of the 8 key columns (not modified).
    data : iterable of dict
        Photo records; each needs 'pageName', 'extPostId', 'extCreatedAt',
        'nFollowers', 'nComments', 'nLikes' and a 'metadata' dict holding
        'imgHeight' and 'imgWidth'. metadata['hashtags'] is optional.
    hashtags : set of str
        Every hashtag that should get a '<hashtag>_ht' column.

    Returns
    -------
    pandas.DataFrame
        The 8 key columns followed by the int indicator columns of the
        hashtags that occur in at least 10 photos.
    """
    hashtag_names = list(hashtags) #Freeze one iteration order for both the headers and the row values
    fn = fieldnames.copy()
    fn.extend(tag + '_ht' for tag in hashtag_names)
    rows = []
    for photo in data: #Get the data out with the associated column name
        md = photo['metadata'] #The photo's metadata
        #.get keeps photos without hashtags from raising KeyError (Column_Headers reads them the same way)
        present = set(md.get('hashtags') or ())
        photodata = [
            photo['pageName'], photo['extPostId'], photo['extCreatedAt'],
            md['imgHeight'], md['imgWidth'],
            photo['nFollowers'], photo['nComments'], photo['nLikes'],
        ]
        photodata.extend(1 if tag in present else '0' for tag in hashtag_names)
        #np.array coerces the mixed row to strings, exactly as the row-by-row version did;
        #the key columns therefore keep the same (string) type in every feature table,
        #which the later merges on those columns rely on
        rows.append(np.array(photodata).tolist())
    #Build the frame once: DataFrame.append was removed in pandas 2.0 and was quadratic anyway
    DF = pd.DataFrame(rows, columns=fn)
    DF1 = DF.iloc[:, 0:8]
    DF2 = DF.iloc[:, 8:].astype(int)
    #Keep only the hashtags that show up at least 10 times
    keep = (DF2.sum(axis=0) >= 10).to_numpy()
    DF2 = DF2.loc[:, keep]
    return pd.concat([DF1, DF2], axis=1)

In [None]:
def Image_Object_Data(fieldnames, data, imageobjects):
    """Build one row per photo with a 0/1 indicator column for each detected object.

    Same layout as the image-tag table, but driven by metadata['imageObjects'].

    Parameters
    ----------
    fieldnames : list of str
        Names of the 8 key columns (not modified).
    data : iterable of dict
        Photo records; each needs 'pageName', 'extPostId', 'extCreatedAt',
        'nFollowers', 'nComments', 'nLikes' and a 'metadata' dict holding
        'imgHeight' and 'imgWidth'. metadata['imageObjects'] is optional.
    imageobjects : set of str
        Every object name that should get a '<name>_obj' column.

    Returns
    -------
    pandas.DataFrame
        The 8 key columns followed by the int indicator columns of the
        objects that occur in at least 5 photos.
    """
    object_names = list(imageobjects) #Freeze one iteration order for both the headers and the row values
    fn = fieldnames.copy()
    fn.extend(name + '_obj' for name in object_names)
    rows = []
    for photo in data: #Get the data out with the associated column name
        md = photo['metadata'] #The photo's metadata
        #.get keeps photos without objects from raising KeyError (Column_Headers reads them the same way)
        present = {obj['name'] for obj in md.get('imageObjects') or ()}
        photodata = [
            photo['pageName'], photo['extPostId'], photo['extCreatedAt'],
            md['imgHeight'], md['imgWidth'],
            photo['nFollowers'], photo['nComments'], photo['nLikes'],
        ]
        #Presence only (1), not the detection confidence
        photodata.extend(1 if name in present else '0' for name in object_names)
        #np.array coerces the mixed row to strings, exactly as the row-by-row version did;
        #the key columns therefore keep the same (string) type in every feature table,
        #which the later merges on those columns rely on
        rows.append(np.array(photodata).tolist())
    #Build the frame once: DataFrame.append was removed in pandas 2.0 and was quadratic anyway
    DF = pd.DataFrame(rows, columns=fn)
    str_obj = DF.iloc[:, 0:8] #the key columns, kept as they are
    int_obj = DF.iloc[:, 8:].astype(int) #the indicator columns, converted to int
    #Keep only the objects that occur at least 5 times
    keep = (int_obj.sum(axis=0) >= 5).to_numpy()
    int_obj = int_obj.loc[:, keep]
    return pd.concat([str_obj, int_obj], axis=1) #combines key & indicator columns

In [None]:
def Image_Color_Data(fieldnames, data, imagecolors, color_group_path=None):
    """Build one row per photo with the summed confidence of each general color group.

    Every color name found by the API is mapped to a more general color via
    the 'Color Group' CSV (first column: upper-cased color name, second column:
    group), so instead of having both Maroon and Scarlet in the data set they
    are both counted as Red. Confidences of colors that share a group are added.

    Parameters
    ----------
    fieldnames : list of str
        Names of the 8 key columns (not modified).
    data : iterable of dict
        Photo records; each needs 'pageName', 'extPostId', 'extCreatedAt',
        'nFollowers', 'nComments', 'nLikes' and a 'metadata' dict holding
        'imgHeight' and 'imgWidth'. metadata['imageColors'] is optional; its
        entries need 'value' and 'confidence'.
    imagecolors : set of str
        Every color name that appears in the data.
    color_group_path : str or pathlib.Path, optional
        Mapping CSV to read. When omitted, 'Color Group.csv' inside the
        module-level data folder ``src_path_1`` is used.

    Returns
    -------
    pandas.DataFrame
        The 8 key columns followed by one float column per color group, in the
        fixed order Pink ... Black. A group with no matching color is 0.0;
        colors the CSV does not map to one of the 12 groups are left out.
    """
    color_groups = ['Pink','Purple','Red','Orange','Yellow','Green','Cyan','Blue','Brown','White','Grey','Black']
    color_names = list(imagecolors) #Freeze one iteration order for both the headers and the row values
    fn = fieldnames.copy()
    fn.extend(color_names)
    rows = []
    for photo in data: #Get the data out with the associated column name
        md = photo['metadata'] #The photo's metadata
        confidence = {}
        for entry in md.get('imageColors') or ():
            #setdefault keeps the first entry for a color, like the old "break on first match"
            confidence.setdefault(entry['value'], entry['confidence'])
        photodata = [
            photo['pageName'], photo['extPostId'], photo['extCreatedAt'],
            md['imgHeight'], md['imgWidth'],
            photo['nFollowers'], photo['nComments'], photo['nLikes'],
        ]
        photodata.extend(confidence.get(name, '0') for name in color_names)
        #np.array coerces the mixed row to strings, exactly as the row-by-row version did;
        #the key columns therefore keep the same (string) type in every feature table,
        #which the later merges on those columns rely on
        rows.append(np.array(photodata).tolist())
    #Build the frame once: DataFrame.append was removed in pandas 2.0 and was quadratic anyway
    DF = pd.DataFrame(rows, columns=fn)
    keys = DF.iloc[:, 0:8]
    conf = DF.iloc[:, 8:].astype(float)
    #Only the color columns are upper-cased and regrouped. The key columns keep their names,
    #so the result no longer depends on the CSV also mapping e.g. PAGENAME back to PageName.
    #NOTE(review): confirm nothing relied on the old string row labels / object dtype of this table.
    conf.columns = [name.upper() for name in conf.columns]
    if color_group_path is None:
        #Path "/" joins with the right separator on every OS (the old '\\' only worked on Windows)
        color_group_path = src_path_1 / 'Color Group.csv'
    with open(color_group_path, mode='r', newline='') as infile:
        #Rows with fewer than two cells (e.g. blank lines) carry no mapping and are skipped
        mydict = {cells[0]: cells[1] for cells in csv.reader(infile) if len(cells) >= 2}
    conf = conf.rename(columns=mydict) #Now several columns can share one group label such as Green
    #Transpose so the group labels form the index, add up rows with the same label, transpose back
    grouped = conf.T.groupby(level=0).sum().T
    grouped = grouped.reindex(columns=color_groups, fill_value=0.0)
    return pd.concat([keys, grouped], axis=1)

In [5]:
def sylco(word):  #Syllable counter given only the spelling of a word
    #Adapted from https://eayd.in/?p=232
    #Spelling alone cannot give exact syllable counts for English, so this stacks 14 common
    #counting rules to get the best guess:
    #    result = (number of vowel letters) - (discounted vowels) + (extra syllables)
    #"y" is not treated as a vowel letter; rules 7 and 8 add it back where it sounds like one.
    vowels = "aeoui"

    #Words that need one extra / one fewer syllable than the rules give (rule 14)
    needs_extra = ('serious', 'crucial')
    needs_fewer = ('fortunately', 'unfortunately', 'facebook')

    #"co" + vowel prefixes pronounced as one syllable / as two syllables (rule 11)
    co_single = ('cool', 'coach', 'coat', 'coal', 'count', 'coin', 'coarse', 'coup', 'coif',
                 'cook', 'coign', 'coiffe', 'coof', 'court')
    co_double = ('coapt', 'coed', 'coinci')

    #"pre" + vowel prefixes pronounced as one syllable (rule 12)
    pre_single = ('preach',)

    #Words ending in "le" whose final "e" is silent (rule 3)
    silent_le = ('whole', 'mobile', 'pole', 'male', 'female', 'hale', 'pale', 'tale', 'sale',
                 'aisle', 'whale', 'while')

    #Contractions whose "n't" is its own syllable (rule 13)
    syllabic_nt = ("doesn't", "isn't", "shouldn't", "couldn't", "wouldn't")

    word = word.lower()

    #1) words of three letters or fewer always count as one syllable
    if len(word) <= 3:
        return 1

    added = 0    #extra syllables
    dropped = 0  #vowels that do not make a syllable of their own
    vowel_pairs = len(re.findall(r'[eaoui][eaoui]', word))
    vowel_triples = len(re.findall(r'[eaoui][eaoui][eaoui]', word))

    #2) a trailing "es"/"ed" is silent unless the word ends in "ted", "tes", "ses", "ied" or "ies";
    #   words with a single vowel group ("speed", "fled") are left alone
    if word.endswith(("es", "ed")):
        vowel_then_consonant = len(re.findall(r'[eaoui][^eaoui]', word))
        has_other_vowel_group = vowel_pairs > 1 or vowel_then_consonant > 1
        if has_other_vowel_group and not word.endswith(("ted", "tes", "ses", "ied", "ies")):
            dropped += 1

    #3) a trailing "e" is silent, except in an "le" ending that is not listed as silent
    if word.endswith("e"):
        sounded_le = word.endswith("le") and word not in silent_le
        if not sounded_le:
            dropped += 1

    #4) consecutive vowels count as one: every pair costs one, every triplet one more
    dropped += vowel_pairs + vowel_triples

    #5) count the vowel letters in the word
    vowel_count = len(re.findall(r'[eaoui]', word))

    #6) add one if the word starts with "mc"
    if word.startswith("mc"):
        added += 1

    #7) add one if the word ends with "y" that does not follow a vowel
    if word.endswith("y") and word[-2] not in vowels:
        added += 1

    #8) add one for every "y" inside the word that has a non-vowel on both sides
    for position in range(1, len(word) - 1):
        if word[position] != "y":
            continue
        if word[position - 1] not in vowels and word[position + 1] not in vowels:
            added += 1

    #9) add one if the word starts with "tri" or "bi" followed by a vowel
    if word.startswith("tri") and word[3] in vowels:
        added += 1
    if word.startswith("bi") and word[2] in vowels:
        added += 1

    #10) "-ian" counts as two syllables, except for "-cian" and "-tian"
    if word.endswith("ian") and not word.endswith(("cian", "tian")):
        added += 1

    #11) "co" followed by a vowel adds one, unless the prefix is only listed as single-syllable
    if word.startswith("co") and word[2] in vowels:
        prefixes = {word[:4], word[:5], word[:6]}
        listed_double = not prefixes.isdisjoint(co_double)
        listed_single = not prefixes.isdisjoint(co_single)
        if listed_double or not listed_single:
            added += 1

    #12) "pre" followed by a vowel adds one, unless the prefix is listed as single-syllable
    if word.startswith("pre") and word[3] in vowels and word[:6] not in pre_single:
        added += 1

    #13) "-n't" adds one only for the listed contractions
    if word.endswith("n't") and word in syllabic_nt:
        added += 1

    #14) handle the exceptional words
    if word in needs_fewer:
        dropped += 1
    if word in needs_extra:
        added += 1

    # calculate the output
    return vowel_count - dropped + added

In [6]:
def Readability_Data(fieldnames, data):
    """Build one row per photo with a Flesch reading-ease score for its caption.

    The caption is lower-cased, stripped of everything that is not a letter,
    whitespace, '#' or '@', and words starting with '#' or '@' are dropped
    (it is really hard to count syllables on those). The score is

        206.835 - 1.015 * n_words - 84.6 * (n_syllables / n_words)

    i.e. the caption's word count is used for the sentence-length term, with
    syllables counted by ``sylco``.

    Parameters
    ----------
    fieldnames : list of str
        Names of the 8 key columns (not modified).
    data : iterable of dict
        Photo records; each needs 'rawText', 'pageName', 'extPostId',
        'extCreatedAt', 'nFollowers', 'nComments', 'nLikes' and a 'metadata'
        dict holding 'imgHeight' and 'imgWidth'.

    Returns
    -------
    pandas.DataFrame
        The 8 key columns plus a float 'Flesch RE' column, which is NaN for
        captions without any countable word.
    """
    #Similar start to before by making a fieldname list
    fn = fieldnames.copy()
    fn.append('Flesch RE')
    rows = []
    for photo in data: #Get the data out with the associated column name
        text = photo['rawText'].replace('\u2063','') #Remove the hidden U+2063 character found in the raw captions
        text = text.lower() #Make all the words lower case, so that we can better cleanse it
        text = re.sub(r'[^a-z\s#@]','',text) #Remove everything that isn't a letter or a # or @
        #Make a list of all the words, leaving out # and @ words
        words = [word for word in text.split() if not word.startswith(('#', '@'))]
        md = photo['metadata'] #The photo's metadata
        if words:
            #The score is a float but the counts are ints, so make the syllable total a float
            syllb = float(sum(sylco(word) for word in words))
            score = 206.835-1.015*len(words)-84.6*(syllb/len(words))
        else:
            score = np.nan #If there is no caption, place a NULL there instead
        photodata = [
            photo['pageName'], photo['extPostId'], photo['extCreatedAt'],
            md['imgHeight'], md['imgWidth'],
            photo['nFollowers'], photo['nComments'], photo['nLikes'],
            score,
        ]
        #np.array coerces the mixed row to strings, exactly as the row-by-row version did;
        #the key columns therefore keep the same (string) type in every feature table,
        #which the later merges on those columns rely on
        rows.append(np.array(photodata).tolist())
    #Build the frame once: DataFrame.append was removed in pandas 2.0 and was quadratic anyway
    DF = pd.DataFrame(rows, columns=fn)
    DF['Flesch RE'] = DF['Flesch RE'].astype(float)
    return DF

In [None]:
def PHash_Data(fieldnames, data):
    """Build one row per photo with the value stored under metadata['imagePHash'].

    Parameters
    ----------
    fieldnames : list of str
        Names of the 8 key columns (not modified).
    data : iterable of dict
        Photo records; each needs 'pageName', 'extPostId', 'extCreatedAt',
        'nFollowers', 'nComments', 'nLikes' and a 'metadata' dict holding
        'imgHeight', 'imgWidth' and 'imagePHash'.

    Returns
    -------
    pandas.DataFrame
        The 8 key columns plus a 'PHash' column.

    Raises
    ------
    KeyError
        If a photo lacks one of the keys listed above.
    """
    fn = fieldnames.copy()
    fn.append('PHash')
    rows = []
    for photo in data: #Get the data out with the associated column name
        md = photo['metadata'] #The photo's metadata
        photodata = [
            photo['pageName'], photo['extPostId'], photo['extCreatedAt'],
            md['imgHeight'], md['imgWidth'],
            photo['nFollowers'], photo['nComments'], photo['nLikes'],
            md['imagePHash'],
        ]
        #np.array coerces the mixed row to strings, exactly as the row-by-row version did;
        #the key columns therefore keep the same (string) type in every feature table,
        #which the later merges on those columns rely on
        rows.append(np.array(photodata).tolist())
    #Build the frame once: DataFrame.append was removed in pandas 2.0 and was quadratic anyway
    return pd.DataFrame(rows, columns=fn)

In [None]:
def DateTime_Parts(data): #Creates a bunch of different date/time parts and 0/1 flags. Only read in Final data with a "PostTime" variable
    """Add date/time columns derived from 'PostTime' to ``data``, in place.

    Added columns:
      * 'date_series' (parsed timestamp), 'yearofpost', 'monthofpost',
        'hourofpost', 'minuteofpost', 'quarterofpost';
      * one 0/1 flag per month, 'JanuaryPost' ... 'DecemberPost'
        (the February flag keeps its historical spelling 'FebuaryPost');
      * one 0/1 flag per time-of-day bucket, based on the hour:
        '11pm to 5am' (hour <= 5 or > 22), '6am to 12pm' (6-12),
        '1pm to 5pm' (13-17), '6pm to 10pm' (18-22).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'PostTime' column that pandas.to_datetime can parse.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in (callers that ignore the
        return value still see all the new columns).
    """
    data['date_series'] = pd.to_datetime(data['PostTime'])
    stamp = data['date_series'].dt
    data['yearofpost'] = stamp.year
    data['monthofpost'] = stamp.month
    data['hourofpost'] = stamp.hour
    data['minuteofpost'] = stamp.minute
    data['quarterofpost'] = stamp.quarter
    #'FebuaryPost' is misspelled on purpose: renaming it would break anything reading the saved data
    month_columns = ['JanuaryPost', 'FebuaryPost', 'MarchPost', 'AprilPost', 'MayPost', 'JunePost',
                     'JulyPost', 'AugustPost', 'SeptemberPost', 'OctoberPost', 'NovemberPost', 'DecemberPost']
    for month_number, column in enumerate(month_columns, start=1):
        data[column] = np.where(data['monthofpost'] == month_number, 1, 0)
    hour = data['hourofpost']
    conditions = [
        (hour <= 5) | (hour > 22),
        (hour > 5) & (hour <= 12),
        (hour > 12) & (hour <= 17),
        (hour > 17) & (hour <= 22)]
    choices = ['11pm to 5am', '6am to 12pm', '1pm to 5pm', '6pm to 10pm']
    #Helper column with the bucket label; rows with an unparseable time get 'N/A' and all flags 0
    data['TimeofDay'] = np.select(conditions, choices, default='N/A')
    #Compare against the labels themselves: the old hand-typed '1pm to 5pmm' never matched,
    #which left the '1pm to 5pm' flag at 0 for every row
    for label in choices:
        data[label] = np.where(data['TimeofDay'] == label, 1, 0)
    #Drop the helper in place; "data = data.drop(...)" only rebound the local name, so the
    #caller's frame kept the column
    data.drop(columns=['TimeofDay'], inplace=True)
    return data

In [7]:
def SchoolPart(df):
    """Label each row's 'Flesch RE' score with a school year and a school group.

    Adds two string columns to ``df`` in place and returns nothing:
      * 'SchoolYearRead'  - nine bands, from '4th or Below' (score >= 100)
        down to 'Post Graduate Studies' (score < 0);
      * 'SchoolGroupRead' - six coarser bands, from 'Elementary and Below'
        (score >= 90) down to 'Post Grad' (score < 0).
    Rows whose score matches no band (a missing score) get 'NULL' in both.
    """
    score = df['Flesch RE']

    def band(low, high):
        #True where low <= score < high
        return (score < high) & (score >= low)

    year_conditions = [
        score >= 100,
        band(90, 100),
        band(80, 90),
        band(70, 80),
        band(60, 70),
        band(50, 60),
        band(30, 50),
        band(0, 30),
        score < 0,
    ]
    year_labels = ['4th or Below', '5th', '6th', '7th', '8th or 9th', '10th - 12th', 'College', 'College Graduate', 'Post Graduate Studies']
    df['SchoolYearRead'] = np.select(year_conditions, year_labels, default='NULL')

    group_conditions = [
        score >= 90,
        band(60, 90),
        band(50, 60),
        band(30, 50),
        band(0, 30),
        score < 0,
    ]
    group_labels = ['Elementary and Below', 'Middle', 'High', 'College', 'College Grad', 'Post Grad']
    df['SchoolGroupRead'] = np.select(group_conditions, group_labels, default='NULL')

In [None]:
#Build every feature table from the raw photos and left-join them one at a time on the
#shared key columns. Each intermediate frame is deleted (and garbage-collected) as soon
#as it has been merged, to keep memory use down.
merge_keys = ['PageName', 'PostID', 'PostTime', 'Height', 'Width', 'Followers', 'Comments', 'Likes']
photos = Column_Headers(imagetags, hashtags, imageobjects, imagecolors)

#Image tags + hashtags
get_tag = Image_Tag_Data(fieldnames, photos, imagetags)
get_hash = Hashtag_Data(fieldnames, photos, hashtags)
DFmerge1 = get_tag.merge(get_hash, how='left', on=merge_keys)
del get_tag, get_hash
gc.collect()

#+ image objects
get_obj = Image_Object_Data(fieldnames, photos, imageobjects)
DFmerge2 = DFmerge1.merge(get_obj, how='left', on=merge_keys)
del get_obj, DFmerge1
gc.collect()

#+ color groups
get_color = Image_Color_Data(fieldnames, photos, imagecolors)
DFmerge3 = DFmerge2.merge(get_color, how='left', on=merge_keys)
del get_color, DFmerge2
gc.collect()

#+ caption readability
get_readability = Readability_Data(fieldnames, photos)
DFmerge4 = DFmerge3.merge(get_readability, how='left', on=merge_keys)
del get_readability, DFmerge3
gc.collect()

#+ perceptual hash -> the full data set
get_phash = PHash_Data(fieldnames, photos)
DFfull = DFmerge4.merge(get_phash, how='left', on=merge_keys)
del get_phash, DFmerge4, merge_keys
gc.collect()

In [None]:
#The key columns come out of the feature tables as text, so make the three counts numeric
#before doing arithmetic on them
for count_column in ('Followers', 'Likes', 'Comments'):
    DFfull[count_column] = pd.to_numeric(DFfull[count_column])
del count_column
#Engagement rate = (comments + likes) as a percentage of followers
DFfull['Engagement_Rate'] = DFfull['Comments'].add(DFfull['Likes']).div(DFfull['Followers']).mul(100)
#Both helpers add their columns to DFfull in place
DateTime_Parts(DFfull)
SchoolPart(DFfull)
#Save the finished data set next to the notebook
DFfull.to_pickle('Full_Cortex_Dataset.pkl')