In [6]:
import instaloader

import os
import json
import shutil
import lzma

import time
import numpy as np
import pandas as pd

# Collects (post_code, exception) pairs for posts whose download failed.
failed_posts = list()

# Root folder for the raw downloads; created on first run and reused afterwards.
download_dir = os.path.join("data", "raw")
os.makedirs(download_dir, exist_ok=True)

# Single Instaloader client, created with the library defaults.
loader = instaloader.Instaloader()

# Download posts

In [4]:
xl_path = os.path.join('data','resources','database.xlsx')

# Entries of download_dir are named after the shortcode of the post they hold,
# so the directory listing doubles as the "already downloaded" set.
already_downloaded = os.listdir(download_dir)

with pd.ExcelFile(xl_path) as xl:
    links = (
        xl.parse()
          .drop(columns=['id'])
          .dropna(subset=['link'])
    )

# Shortcode extraction.
# Taking the second-to-last '/'-separated piece only works when the link ends
# in '/' or '/?query'; for 'https://www.instagram.com/p/<code>' it yields 'p',
# and drop_duplicates would then merge every such row into one.
# Prefer the path segment that follows /p/, /reel(s)/ or /tv/ and fall back to
# the positional rule only when that pattern is not found.
code_by_pattern = links['link'].str.extract(r'/(?:p|reels?|tv)/([A-Za-z0-9_-]+)', expand=False)
code_by_position = links['link'].str.split('/').str[-2]

df = (
    links.assign(post_code=code_by_pattern.fillna(code_by_position))
         .drop_duplicates(subset=['post_code'])
         .rename(columns={'Objectify (yes or no)':'objectifies'})
)

# Posts listed in the spreadsheet that have no folder in download_dir yet.
pending = df.query("~post_code.isin(@already_downloaded)")
pending

# TODO: Check not to drop the row which contains objectify
# TODO: Check for missing values on objectify column

# df.query("post_code.isin(@already_downloaded)").post_code.to_clipboard(index=False, header=False)

Unnamed: 0,link,objectifies,post_code
88,https://www.instagram.com/p/C0UvM2UNU3w/?img_i...,0.0,C0UvM2UNU3w
89,https://www.instagram.com/p/C0H-DQlNFD8/?img_i...,1.0,C0H-DQlNFD8
90,https://www.instagram.com/p/C0H9dc8N_kZ/?img_i...,1.0,C0H9dc8N_kZ


In [5]:
# Pause between two consecutive posts, in seconds.
# NOTE(review): presumably here to avoid hammering Instagram - confirm the value.
PAUSE_SECONDS = 5

# Materialise the work list once instead of rebuilding it on every iteration.
post_codes = pending.post_code.tolist()
total = len(post_codes)

for i,post_code in enumerate(post_codes):

    print(f"{i+1} / {total}")
    try:
        print(f"\tSTART: {pd.Timestamp.now():%H : %M : %S}")
        print('\t\t'+post_code)

        post = instaloader.Post.from_shortcode(loader.context, post_code)
        # Instaloader writes into a folder named after `target`, relative to the
        # current working directory; it is moved under download_dir below.
        loader.download_post(post, target=post_code)

    except instaloader.exceptions.InstaloaderException as e:
        print(f"Error: {e}")
        failed_posts.append((post_code,e))

    else:
        print(f"\t\tEND: {pd.Timestamp.now():%H : %M : %S}")
        print(f"\n\nPost {post_code} downloaded successfully" + "\n" + "-"*100 + '\n'*2)
        try:
            shutil.move(post_code, download_dir)
        except OSError as e:
            # shutil.Error is an OSError subclass (raised e.g. when the
            # destination already exists). Record it and keep going, like any
            # other per-post failure, instead of aborting the whole batch.
            print(f"Error: could not move {post_code} into {download_dir}: {e}")
            failed_posts.append((post_code,e))

    # No need to wait once the last post has been processed.
    if i + 1 < total:
        time.sleep(PAUSE_SECONDS)

1 / 3
	START: 06 : 39 : 11
		C0UvM2UNU3w
Error: Fetching Post metadata failed.
2 / 3
	START: 06 : 39 : 23
		C0H-DQlNFD8
Error: Fetching Post metadata failed.
3 / 3
	START: 06 : 39 : 29
		C0H9dc8N_kZ
Error: Fetching Post metadata failed.


In [None]:
# # Simple code to retrieve comments

# post_code = "CW6AeAVq3tJ"
# post = instaloader.Post.from_shortcode(loader.context, post_code)
# for comment in post.get_comments():
#     print(comment)

# Reading raw data

In [8]:
# Build one (post_code, caption) row per downloaded post folder.
caption_rows = []

# os.listdir returns entries in arbitrary order; sort for a reproducible frame.
for post_code in sorted(os.listdir(download_dir)):

    post_data_path = os.path.join(download_dir,post_code)

    # Ignore stray files sitting next to the post folders.
    if not os.path.isdir(post_data_path):
        continue

    txt_files = sorted(name for name in os.listdir(post_data_path) if name.endswith('.txt'))

    if txt_files:
        caption_path = os.path.join(post_data_path,txt_files[0])
        with open(caption_path,'r',encoding='utf-8') as f:
            caption = f.read().strip()
    else:
        # A folder without any .txt file used to raise IndexError here.
        # NOTE(review): presumably a post without caption text - confirm.
        caption = None

    caption_rows.append((post_code, caption))

captions = pd.DataFrame(caption_rows, columns=['post_code','caption'])

captions.head(5)

Unnamed: 0,post_code,caption
0,-PHbiexlr_,Stunning Mariana Rodriguez in #amencouture. Se...
1,3YO1o0Rlkt,@emastokholma in Amen embroidered #dress from ...
2,4yqgDyxloK,Stunning DJ @emastokholma wears all over paill...
3,B-y2Fo8qcN_,Crystal Pumps ✨ #AmenStyle #AmenPreSpring20 #A...
4,B3__d73i46x,Discover our FW19 Collection on amenstyle.com ...


# Reading metadata

In [9]:
# Build one (post_code, user, likes) row per downloaded post folder.
metadata_rows = []

# os.listdir returns entries in arbitrary order; sort for a reproducible frame.
for post_code in sorted(os.listdir(download_dir)):

    post_path = os.path.join(download_dir,post_code)

    # Ignore stray files sitting next to the post folders.
    if not os.path.isdir(post_path):
        continue

    metadata_files = sorted(name for name in os.listdir(post_path) if name.endswith('.json.xz'))

    if not metadata_files:
        # Keep the post visible with missing values instead of failing with
        # IndexError on a folder that has no metadata file.
        metadata_rows.append((post_code,None,None))
        continue

    metadata_path = os.path.join(post_path,metadata_files[0])

    # Explicit encoding so the result does not depend on the platform locale.
    with lzma.open(metadata_path, 'rt', encoding='utf-8') as file:
        data_dict = json.load(file)

    node = data_dict['node']
    user = node['owner']['username']
    likes = node['edge_media_preview_like']['count']

    metadata_rows.append((post_code,user,likes))

additional_data = pd.DataFrame(metadata_rows, columns=['post_code','user','likes'])

additional_data.head(3)

Unnamed: 0,post_code,user,likes
0,-PHbiexlr_,amenstyle,106
1,3YO1o0Rlkt,amenstyle,684
2,4yqgDyxloK,amenstyle,368
