In [10]:
import os
import json
import pandas as pd
import emoji
from concurrent.futures import ThreadPoolExecutor, as_completed

In [11]:
# Directory containing the exported inbox; each sub-folder holding
# message_*.json files is processed as one conversation.
# Set the INSTAGRAM_INBOX_DIR environment variable to point somewhere else;
# the original hard-coded location is kept as the fallback.
json_directory = os.environ.get('INSTAGRAM_INBOX_DIR', 'C:\\Users\\djsma\\Downloads\\inbox1')

# Styled display name as it appears in the export -> plain name written to the output.
_SENDER_ALIASES = {'𝙳𝚒𝚟𝚓𝚘𝚝 𝙼𝚊𝚗𝚌𝚑𝚊𝚗𝚍𝚊': 'Divjot'}


def _load_messages(json_files):
    """Read every export file and return all of their 'messages' entries as one list."""
    # Files are visited in the numeric order of the suffix after the last '_'
    # (message_2.json before message_10.json).
    ordered_paths = sorted(json_files, key=lambda path: int(path.split('_')[-1].split('.')[0]))
    messages = []
    for file_path in ordered_paths:
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        messages.extend(data['messages'])
    return messages


def _normalise_message(record):
    """Reduce one raw message record to the five fields the combiner works with."""
    content = record.get('content')
    reactions = record.get('reactions')
    sender = record.get('sender_name')
    has_reaction = isinstance(reactions, list) and len(reactions) > 0
    return {
        'sender_name': _SENDER_ALIASES.get(sender, sender),
        'timestamp_ms': record.get('timestamp_ms'),
        # Anything that is not a string (missing key, NaN filler) counts as "no text".
        'message': emoji.demojize(content) if isinstance(content, str) else "",
        'share': 'yes' if isinstance(record.get('share'), dict) else 'no',
        # Only the first reaction of a message is kept.
        'reactions': emoji.demojize(reactions[0]['reaction']) if has_reaction else pd.NA,
    }


def _flush_burst(burst, combined):
    """Append the finished burst to `combined` unless it carries no text at all."""
    if burst is None or not burst['parts']:
        return
    text = "\n\n".join(burst['parts']).strip()
    if burst['share'] == 'yes':
        text = f"replied to story/sent a reel: {text}".strip()
    combined.append(
        (burst['sender_name'], burst['timestamp_ms'], text, burst['share'], burst['reactions'])
    )


def _combine_bursts(rows, combine_window_ms):
    """Merge consecutive messages of one sender into bursts.

    A message joins the current burst when it has the same sender and lies at
    most `combine_window_ms` after the burst's FIRST message. The burst keeps
    the timestamp, share flag and reaction of that first message.
    """
    combined = []
    burst = None
    for row in rows:
        joins_burst = (
            burst is not None
            and burst['sender_name'] == row['sender_name']
            and (row['timestamp_ms'] - burst['timestamp_ms']) <= combine_window_ms
        )
        if joins_burst:
            # Text-less messages (e.g. media only) must not add blank paragraphs.
            if row['message']:
                burst['parts'].append(row['message'])
            continue

        _flush_burst(burst, combined)
        burst = {
            'sender_name': row['sender_name'],
            'timestamp_ms': row['timestamp_ms'],
            'share': row['share'],
            'reactions': row['reactions'],
            'parts': [row['message']] if row['message'] else [],
        }
    _flush_burst(burst, combined)
    return combined


def process_json_files(json_files, folder_name, combine_window_ms=300000):
    """Build the combined-message table for one conversation folder.

    Parameters
    ----------
    json_files : iterable of str
        Paths of the conversation's export files. Each path must end in
        ``_<number>.<ext>`` because the number is parsed to order the files.
    folder_name : str
        Value stored in the ``folder_name`` column of every output row.
    combine_window_ms : int, default 300000
        Maximum distance, in milliseconds, between a burst's first message and
        a later message of the same sender for the two to be merged
        (300000 ms = 5 minutes).

    Returns
    -------
    pandas.DataFrame or None
        ``None`` unless the files contain exactly two distinct ``sender_name``
        values. Otherwise one row per burst with the columns ``sender_name``,
        ``timestamp_ms``, ``message``, ``share``, ``reactions``,
        ``message_id`` (1-based, in chronological order) and ``folder_name``.

    Notes
    -----
    * Emoji in message text and reactions are converted with ``emoji.demojize``.
    * Messages without text add nothing to a burst, and a burst with no text
      at all is dropped, whether it holds one such message or several.
    * When the first message of a burst is a share, the merged text is
      prefixed with ``replied to story/sent a reel: ``.
    """
    messages = _load_messages(json_files)

    # Only conversations between exactly two senders are kept.
    senders = {msg.get('sender_name') for msg in messages if 'sender_name' in msg}
    if len(senders) != 2:
        return None

    # A stable sort keeps the order of equal-timestamp messages deterministic.
    ordered = pd.DataFrame(messages).sort_values(by='timestamp_ms', kind='mergesort')
    rows = [_normalise_message(record) for record in ordered.to_dict(orient='records')]

    combined_messages = _combine_bursts(rows, combine_window_ms)

    combined_df = pd.DataFrame(
        combined_messages,
        columns=['sender_name', 'timestamp_ms', 'message', 'share', 'reactions'],
    )
    combined_df['message_id'] = range(1, len(combined_df) + 1)
    combined_df['folder_name'] = folder_name

    return combined_df

In [12]:
def process_folder(folder_name):
    """Process one conversation folder located under `json_directory`.

    Collects the entries of the folder whose names start with 'message_' and
    end with '.json' and passes their full paths, together with `folder_name`,
    to `process_json_files`.

    Returns whatever `process_json_files` returns, or None when the folder is
    not a directory or holds no matching file.
    """
    folder_path = os.path.join(json_directory, folder_name)
    if not os.path.isdir(folder_path):
        return None

    message_files = []
    for entry in os.listdir(folder_path):
        if entry.startswith('message_') and entry.endswith('.json'):
            message_files.append(os.path.join(folder_path, entry))

    if not message_files:
        return None
    return process_json_files(message_files, folder_name)

In [15]:
# Every sub-directory directly under the inbox directory is one folder to process
folders = []
for entry in os.listdir(json_directory):
    if os.path.isdir(os.path.join(json_directory, entry)):
        folders.append(entry)

# Per-folder DataFrames, appended in the order the workers finish
df_list = []

# Hand each folder to a thread pool sized to the CPU count
with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
    # Map each submitted future back to its folder so failures can be reported by name
    pending = {}
    for folder in folders:
        pending[executor.submit(process_folder, folder)] = folder

    for finished in as_completed(pending):
        folder = pending[finished]
        try:
            folder_df = finished.result()
        except Exception as exc:
            # A failing folder is reported and skipped; the remaining folders still run
            print(f"Error processing folder {folder}: {exc}")
        else:
            # process_folder returns None for folders that yield no table
            if folder_df is not None:
                df_list.append(folder_df)

In [None]:
# Columns written to the CSV, in output order
output_columns = ['message_id', 'folder_name', 'sender_name', 'timestamp_ms', 'message', 'share', 'reactions']

# Concatenate all per-folder DataFrames into a single DataFrame
if df_list:
    final_df = pd.concat(df_list, ignore_index=True)
else:
    # pd.concat raises ValueError on an empty list; fall back to a header-only frame
    print("No folders produced any messages; writing a header-only CSV.")
    final_df = pd.DataFrame(columns=output_columns)

# Select and rename the output columns. The frames arrive in thread-completion
# order, so sort by folder and message id to make the CSV reproducible.
final_df = (
    final_df[output_columns]
    .rename(columns={'timestamp_ms': 'timestamp'})
    .sort_values(by=['folder_name', 'message_id'])
    .reset_index(drop=True)
)

# Save the DataFrame to a CSV file ('utf-8-sig' writes a BOM at the start of the file)
output_file = 'instagram_messages.csv'
final_df.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"DataFrame saved to {output_file}")


DataFrame saved to instagram_messages.csv
