In [1]:
import glob
import json
import os

# Find the profile file of the Instagram export and read the account's own
# username from it; `update_content` later uses it to recognise story links.
profile_paths = glob.glob("_data/raw/instagram*/personal_information/personal_information.json")
if not profile_paths:
    # Fail with an actionable message instead of a bare IndexError on `[0]`.
    raise FileNotFoundError(
        "No personal_information.json found under "
        "_data/raw/instagram*/personal_information/ - is the export unpacked there?"
    )

# If several exports match the pattern, the first one returned by glob is used.
with open(profile_paths[0], "r", encoding="utf-8") as f:
    username = json.load(f)['profile_user'][0]['string_map_data']['Username']['value']

In [2]:
# One entry per message_*.json file found in any conversation folder of the
# export's inbox; each of these paths is processed independently later on.
inbox_pattern = '_data/raw/instagram*/messages/inbox/*/message_*.json'
file_paths = glob.glob(inbox_pattern)

In [3]:
import numpy as np

def update_content(row):
    """Return the text to display for one message row.

    Parameters
    ----------
    row : mapping-like (a DataFrame row or a dict)
        Must provide ``'content'``; may provide ``'share'``, which is either
        a dict describing shared content or a missing-value marker.

    Returns
    -------
    The original ``row['content']`` when nothing was shared, otherwise a
    short description of the share:

    * ``'Reacted to your story: <content>'`` when the shared link contains
      the module-level ``username``;
    * ``'Shared: <share_text>'`` when the share carries a ``share_text``;
    * ``'Shared a link: <link>'`` when it only carries a ``link``;
    * ``'Shared some content.'`` otherwise.
    """
    share = row['share'] if 'share' in row else None

    # A real share is always a dict. Testing the type (rather than identity
    # with np.nan) also treats None and any other NaN object as "no share"
    # instead of failing on `'link' in <float>`.
    if not isinstance(share, dict):
        return row['content']

    link = share.get('link')
    # `username` is only looked up when there is a link to compare against.
    # NOTE(review): this is a plain substring match, so a username that is a
    # prefix of another account's name would also match - confirm acceptable.
    if isinstance(link, str) and username in link:
        return f"Reacted to your story: {row['content']}"

    if 'share_text' in share:
        return f"Shared: {share['share_text']}"
    if 'link' in share:
        return f"Shared a link: {share['link']}"
    return 'Shared some content.'

In [4]:
import pandas as pd

def _fix_mojibake(text):
    """Repair a string whose UTF-8 bytes were decoded as Latin-1.

    Returns ``text`` unchanged when the round trip is impossible (characters
    outside Latin-1, or bytes that are not valid UTF-8), e.g. for text that
    was never mis-decoded in the first place.
    """
    try:
        return text.encode('latin1').decode('utf8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        return text


def process_file(file_path):
    """Convert one exported conversation JSON file into a CSV transcript.

    The JSON document must contain ``'participants'`` (a list of dicts with a
    ``'name'``) and ``'messages'`` (a list of message dicts with at least
    ``'sender_name'`` and ``'timestamp_ms'``).

    Processing steps:

    * the account owner's sender name is replaced by the literal ``'user'``;
    * message text is normalised through ``update_content``; messages with
      audio attachments become ``'Sent an audio.'``;
    * every message that received reactions yields one extra row holding the
      comma-joined reactions, dated 3 seconds after the message and attributed
      to the other side of the conversation;
    * names and text are repaired from the Latin-1/UTF-8 mix-up;
    * rows are sorted by time and written with the columns
      ``sender_name, content, date, time``.

    Parameters
    ----------
    file_path : str
        Path of the conversation JSON file to read.

    Returns
    -------
    None. The transcript is written to
    ``_data/parsed/facebook/conversations/<partner>.csv`` (the folder is
    created if needed). Nothing is written when the file has no messages.

    Notes
    -----
    NOTE(review): the output folder is named ``facebook`` although the
    notebook globs an Instagram export - confirm this is intended.
    NOTE(review): the file name depends only on the partner's name, so two
    input files with the same partner overwrite each other's CSV.
    """
    # Only parsing needs the handle; release it before the heavy lifting.
    with open(file_path, encoding='utf-8') as f:
        data = json.load(f)

    # assumes a two-person chat listed as [partner, account owner] - TODO confirm
    user = data['participants'][1]['name']
    partner = data['participants'][0]['name']

    messages = pd.DataFrame(data['messages'])
    if messages.empty:
        # No rows means no 'sender_name'/'timestamp_ms' columns either.
        return

    messages['sender_name'] = messages['sender_name'].apply(
        lambda name: 'user' if name == user else name
    )
    messages['timestamp_ms'] = pd.to_datetime(messages['timestamp_ms'], unit='ms')

    # Collapse each list of reaction dicts into "r1,r2,..."; None when absent.
    if 'reactions' in messages:
        messages['reactions'] = messages['reactions'].apply(
            lambda items: ','.join(item['reaction'] for item in items)
            if isinstance(items, list) else None
        )
    else:
        messages['reactions'] = None

    if 'content' in messages:
        messages['content'] = messages.apply(update_content, axis=1)
    else:
        messages['content'] = None

    def _describe(row):
        # Attachments arrive as a list; anything else is a missing value.
        if 'audio_files' in row and isinstance(row['audio_files'], list):
            return 'Sent an audio.'
        # str() keeps every cell a string so the encoding repair below works.
        return str(row['content'])

    messages['content'] = messages.apply(_describe, axis=1)

    # Turn reactions into messages of their own, sent by the opposite side
    # shortly after the message they react to.
    reacted = messages[messages['reactions'].notnull()]
    if not reacted.empty:
        reaction_rows = pd.DataFrame({
            'sender_name': reacted['sender_name'].apply(
                lambda name: 'user' if name == partner else partner
            ),
            'timestamp_ms': reacted['timestamp_ms'] + pd.Timedelta(3, unit='s'),
            'content': reacted['reactions'],
        })
        messages = pd.concat([messages, reaction_rows])

    # .copy() so the assignments below modify an independent frame.
    messages = messages[['sender_name', 'timestamp_ms', 'content']].copy()
    messages['sender_name'] = messages['sender_name'].map(_fix_mojibake)
    messages['content'] = messages['content'].map(_fix_mojibake)

    messages = messages.sort_values('timestamp_ms').reset_index(drop=True)

    # Split the timestamp into a date and a whole-second time of day.
    messages['date'] = messages['timestamp_ms'].dt.date
    messages['time'] = messages['timestamp_ms'].dt.time.apply(
        lambda moment: moment.replace(microsecond=0)
    )
    messages = messages.drop('timestamp_ms', axis=1)

    # exist_ok makes this safe when several workers get here at once.
    os.makedirs('_data/parsed/facebook/conversations', exist_ok=True)
    messages.to_csv(
        '_data/parsed/facebook/conversations/{}.csv'.format(_fix_mojibake(partner)),
        index=False,
    )


In [5]:
from multiprocessing import Pool

# Spread the conversation files over 8 worker processes. process_file writes
# its result to disk, so the list returned by map() is deliberately ignored.
# NOTE(review): with the "spawn" start method the workers must be able to
# import process_file from __main__, which the multiprocessing docs warn may
# not work from an interactive session - confirm on the target platform.
with Pool(processes=8) as worker_pool:
    worker_pool.map(process_file, file_paths)
    