# IG360 QA - Basic Analysis
*Purpose:* Create a report for an Instagram user from previously extracted JSON data:
<ul>
    <li>Top Likers</li>
    <li>Non-Reciprocating Accounts with few or no likes</li>
    <li>Hidden Gems to Follow</li>
    <li>Deadweight Followers with little or no engagement</li>
    <li>Prominent Hashtags</li>
</ul>
<br>
<br>


In [1]:
# import IG360 data
from IG360.ig360_scrape import *

# import other key libraries
import pandas as pd
import numpy as np
from datetime import datetime
import json
import re
from pathlib import Path

# --- Folders ---------------------------------------------------------------
# Input and output currently point at the same directory; the raw string and
# the corresponding Path object are both kept for each of them.
txt_dir_input, txt_dir_output = '_output/', '_output/'
dir_input, dir_output = Path(txt_dir_input), Path(txt_dir_output)

# --- Configuration ---------------------------------------------------------
# init_config is presumably provided by the star import from
# IG360.ig360_scrape -- TODO confirm.
config_file_path = 'config.ini'
config = init_config(config_file_path)

# --- Reporting parameters --------------------------------------------------
# Upper bound on the number of rows shown in each report table below.
max_recs = 50

# --- Input files (opened relative to dir_input) ----------------------------
# NOTE(review): 'xx' looks like a placeholder -- set the real file names
# before running the notebook.
src_fp_file = 'xx'  # FP file: read below for 'followers' / 'user_follows'
src_ps_file = 'xx'  # PS file: read below for 'posts'


### Data Processing

Import JSON data

In [2]:
# initialize IG360Scrape object to use parsing features
analyzer = IG360Scrape(config)

# load full profile (FP) file
with open(dir_input / src_fp_file) as fp:
    raw_fp = json.load(fp)

# load post statistics (PS) file
with open(dir_input / src_ps_file) as ps:
    raw_ps = json.load(ps)

# Collect plain rows first and build each DataFrame exactly once.
# DataFrame.append was removed in pandas 2.0, and growing a frame one row at a
# time copies the whole frame on every iteration (quadratic in the row count).
post_columns = ['picture_id', 'post_date', 'poster', 'location', 'likes', 'post']
like_columns = ['picture_id', 'post_date', 'account']
post_rows = []
like_rows = []

for postdat in raw_ps['posts']:
    # Parse the timestamp into a local instead of writing it back into raw_ps:
    # the raw data stays untouched, so this cell can be re-run without
    # strptime being handed an already-converted datetime.
    post_date = datetime.strptime(postdat[1], '%Y-%m-%dT%H:%M:%S.%fZ')

    # basic post information (first six fields, with the parsed date)
    post_rows.append([postdat[0], post_date] + list(postdat[2:6]))

    # like matrix: one row per (post, liking account); empty entries are skipped
    for lk in postdat[6]:
        if len(lk) > 0:
            like_rows.append([postdat[0], post_date, lk])

# Passing explicit columns keeps the schema stable even when there are no
# posts or no likes, so the reporting cells can still group and merge.
df_posts = pd.DataFrame(post_rows, columns=post_columns)
df_post_likes = pd.DataFrame(like_rows, columns=like_columns)

Generate details for post word counts, hashtags and comments

In [3]:

# Accumulate rows in plain lists and build each DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0 and made this loop quadratic.
detail_columns = ['picture_id', 'post_num_words', 'post_num_nonhash', 'post_num_dist_nonhash', 'likes']
detail_rows = []
hashtag_rows = []
comment_rows = []

for tpost in raw_ps['posts']:
    # get post analytics for the post text (field 5 of the post record)
    # NOTE(review): the meaning of each parse_post field is inferred from the
    # column names used below -- confirm against IG360Scrape.parse_post.
    tpost_det = analyzer.parse_post(tpost[5])

    # post details: picture id, parse_post fields 0-2, then field 4
    detail_rows.append([tpost[0]] + list(tpost_det[:3]) + [tpost_det[4]])

    # post hashtags: one row per entry of parse_post field 5
    hashtag_rows.extend([tpost[0], hashtag] for hashtag in tpost_det[5])

    # comment list: one row per entry of parse_post field 6
    comment_rows.extend([tpost[0], commenter] for commenter in tpost_det[6])

# Explicit columns keep the schema intact when a list is empty (for example
# no post has any hashtag), so the later merge on picture_id still works.
df_post_details = pd.DataFrame(detail_rows, columns=detail_columns)
df_post_hashtags = pd.DataFrame(hashtag_rows, columns=['picture_id', 'hashtag'])
df_post_comments = pd.DataFrame(comment_rows, columns=['picture_id', 'account'])



### Reporting

Top Likers

In [4]:
# Per-account like tally: number of like rows and the date of the most recent
# post each account liked, ordered from most to fewest likes.
rept_top_likers = (
    df_post_likes
    .groupby('account')
    .agg({'picture_id': 'count', 'post_date': 'max'})
    .rename(columns={'picture_id': 'num_likes', 'post_date': 'last_post'})
    .sort_values('num_likes', ascending=False)
)

# Whole days between the newest liked post in the data set and the newest
# post liked by each account.
max_post = df_post_likes['post_date'].max()
rept_top_likers['days_snc_lst'] = (
    (max_post - rept_top_likers['last_post']) / np.timedelta64(1, 'D')
).astype("int")

# display top likers
rept_top_likers.head(max_recs)[['num_likes', 'days_snc_lst']]

Unnamed: 0_level_0,num_likes,days_snc_lst
account,Unnamed: 1_level_1,Unnamed: 2_level_1
juliekkchugh,3,0
ariellewind,3,0
derek.holman_,3,0
pookadoook,3,0
joshrusso5,3,0
northman.edc,3,0
tthorne22,3,0
lizziedianejohnson,3,0
kalarusso,3,0
air_morgan,3,0


Worst Non-Reciprocators (accounts that you follow but that don't follow you back)

In [5]:
# accounts the user follows that do not follow back
df_non_recip = pd.DataFrame(
    list(set(raw_fp['user_follows']) - set(raw_fp['followers'])),
    columns=['account'],
)

# aggregate like stats, restricted to likes coming from those accounts
is_followed = df_post_likes['account'].isin(raw_fp['user_follows'])
is_follower = df_post_likes['account'].isin(raw_fp['followers'])
df_non_recip_tally = (df_post_likes[is_followed & ~is_follower]
                      .groupby(['account'])
                      .agg({'picture_id': 'count', 'post_date': 'max'})
                     )
df_non_recip_tally.columns = ['num_likes', 'last_post']

# Measure recency against the newest liked post in the WHOLE data set, the
# same reference date the Top Likers report uses. Taking the maximum of the
# filtered tally instead would shift the reference to whichever post a
# non-reciprocating account happened to like last, making days_snc_lst
# incomparable between reports and overwriting max_post with a subset value.
max_post = df_post_likes['post_date'].max()
df_non_recip_tally['days_snc_lst'] = (
    (max_post - df_non_recip_tally['last_post']) / np.timedelta64(1, 'D')
).astype("int")

# combine and format: accounts without any like get num_likes = 0 and keep a
# missing days_snc_lst; fewest likes come first
rept_worst_nc = (df_non_recip.merge(df_non_recip_tally, on='account', how='left')
                 .fillna({'num_likes': 0})
                 .sort_values(['num_likes'], ascending=[True])
                 .head(max_recs)
                 .set_index('account')
                )[['num_likes', 'days_snc_lst']]
rept_worst_nc['num_likes'] = rept_worst_nc['num_likes'].astype("int")

# display worst non-reciprocators
rept_worst_nc

Unnamed: 0_level_0,num_likes,days_snc_lst
account,Unnamed: 1_level_1,Unnamed: 2_level_1
arze,0,


Hidden Gems (accounts that you are not following but that give you a lot of likes)

In [6]:
# accounts that liked at least one post but are not in the user's follow list
liker_accounts = set(df_post_likes['account'])
followed_accounts = set(raw_fp['user_follows'])
df_nf = pd.DataFrame(list(liker_accounts - followed_accounts), columns=['account'])

# like stats for every liking account: like count and newest liked post
df_nf_tally = (
    df_post_likes
    .groupby('account')
    .agg({'picture_id': 'count', 'post_date': 'max'})
    .rename(columns={'picture_id': 'num_likes', 'post_date': 'last_post'})
)

# whole days between the newest liked post overall and each account's newest
max_post = df_nf_tally['last_post'].max()
df_nf_tally['days_snc_lst'] = (
    (max_post - df_nf_tally['last_post']) / np.timedelta64(1, 'D')
).astype("int")

# attach the stats to the not-followed accounts, most likes first
rept_hidden_gems = (
    df_nf
    .merge(df_nf_tally, on='account', how='left')
    .fillna({'num_likes': 0})
    .sort_values('num_likes', ascending=False)
    .head(max_recs)
    .set_index('account')
    [['num_likes', 'days_snc_lst']]
    .assign(num_likes=lambda frame: frame['num_likes'].astype("int"))
)

# display hidden gems
rept_hidden_gems

Unnamed: 0_level_0,num_likes,days_snc_lst
account,Unnamed: 1_level_1,Unnamed: 2_level_1
kalarusso,3,0
ariellewind,3,0
joshrusso5,3,0
juliekkchugh,3,0
northman.edc,3,0
derek.holman_,3,0
tthorne22,3,0
air_morgan,3,0
pookadoook,3,0
lizziedianejohnson,3,0


Deadweight (accounts that follow you but show little or no engagement)

In [7]:
# every account listed as a follower in the full-profile data
df_deadweight = pd.DataFrame(list(raw_fp['followers']), columns=['account'])

# like stats for every liking account: like count and newest liked post
df_deadweight_tally = (
    df_post_likes
    .groupby('account')
    .agg({'picture_id': 'count', 'post_date': 'max'})
    .rename(columns={'picture_id': 'num_likes', 'post_date': 'last_post'})
)

# whole days between the newest liked post overall and each account's newest
max_post = df_deadweight_tally['last_post'].max()
df_deadweight_tally['days_snc_lst'] = (
    (max_post - df_deadweight_tally['last_post']) / np.timedelta64(1, 'D')
).astype("int")

# attach the stats to the followers; followers without any like get
# num_likes = 0 and keep a missing days_snc_lst; fewest likes come first
rept_deadweight = (
    df_deadweight
    .merge(df_deadweight_tally, on='account', how='left')
    .fillna({'num_likes': 0})
    .sort_values('num_likes', ascending=True)
    .head(max_recs)
    .set_index('account')
    [['num_likes', 'days_snc_lst']]
    .assign(num_likes=lambda frame: frame['num_likes'].astype("int"))
)

# display deadweight followers
rept_deadweight

Unnamed: 0_level_0,num_likes,days_snc_lst
account,Unnamed: 1_level_1,Unnamed: 2_level_1
markbrignone,0,
national_park_time,0,
456rewqr,0,
msmcd3889,0,
morganvalleylamb,0,
mateorusso,0,
xonoxue,0,
ojisaputra5173,0,
sara27livorno,0,
devvela,0,


Prominent Hashtags

In [8]:

# Join every (post, hashtag) row to its post-level details, then summarise per
# hashtag: number of rows, mean non-hashtag word count and mean likes.
# Only the max_recs most frequent hashtags are kept.
hashtag_with_details = df_post_hashtags.merge(df_post_details, on="picture_id", how="left")
df_hashtag_tally = (
    hashtag_with_details
    .groupby('hashtag')
    .agg({'picture_id': 'count', 'post_num_nonhash': 'mean', 'likes': 'mean'})
    .rename(columns={
        'picture_id': 'num_posts',
        'post_num_nonhash': 'avg_post_nonhash_len',
        'likes': 'avg_likes',
    })
    .sort_values('num_posts', ascending=False)
    .head(max_recs)
)
df_hashtag_tally

Unnamed: 0_level_0,num_posts,avg_post_nonhash_len,avg_likes
hashtag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
#benches,3,10.666667,24
#benchlife,3,10.666667,24
#trees,3,10.666667,24
#throwbackthursday,3,10.666667,24
#tbt,3,10.666667,24
#summernights,3,10.666667,24
#squat,3,10.666667,24
#sky,3,10.666667,24
#picoftheday,3,10.666667,24
#photooftheday,3,10.666667,24
