In [7]:
import os
import json
import pandas as pd
import re
from unidecode import unidecode

# Specify the directory containing the JSON files
json_directory = 'Put_Inbox_Folder_Here\inbox1'

def normalize_text(text):
    # Remove non-ASCII characters and convert fancy characters to closest ASCII equivalents
    return unidecode(re.sub(r'[^\x00-\x7F]+', '', text))

def decode_unicode(obj):
    if isinstance(obj, str):
        return normalize_text(obj)
    return obj

def process_json_file(file_path, sender_name):
    # Read the JSON file
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file, object_hook=decode_unicode)

    # Extract the messages from the data
    messages = data['messages']

    # Create a DataFrame from the messages
    df = pd.DataFrame(messages)

    # Add a unique identifier column based on the index
    df.insert(0, 'message_id', range(1, len(df) + 1))

    # Add the sender name as a new column
    df['sender_name'] = sender_name

    # Ensure 'content' field exists in each message
    df['content'] = df.get('content', '')

    return df

# Initialize an empty list to store the DataFrames
df_list = []

# Iterate over all subdirectories in the main directory
for sender_name in os.listdir(json_directory):
    account_dir = os.path.join(json_directory, sender_name)
    if os.path.isdir(account_dir):
        json_file = os.path.join(account_dir, 'message_1.json')
        if os.path.isfile(json_file):
            df = process_json_file(json_file, sender_name)
            df_list.append(df)

# Concatenate all DataFrames into a single DataFrame
final_df = pd.concat(df_list, ignore_index=True)

# Print the final DataFrame
print(final_df)


       message_id                    sender_name   timestamp_ms  \
0               1                737328824562114  1710997972815   
1               2                737328824562114  1710471602379   
2               3                737328824562114  1710471600116   
3               4                737328824562114  1710443199474   
4               5                737328824562114  1710443189883   
...           ...                            ...            ...   
35880          96  __swastikaaa__843675987123295  1691677251071   
35881          97  __swastikaaa__843675987123295  1691677241284   
35882          98  __swastikaaa__843675987123295  1691658147398   
35883          99  __swastikaaa__843675987123295  1691640924201   
35884         100  __swastikaaa__843675987123295  1691640739388   

                                         content  is_geoblocked_for_viewer  \
0       तुमच्या मॅसेज  ला 💗 अशी प्रतिक्रिया दिली                     False   
1                                    Lm

In [1]:
import os
import json
import pandas as pd
import emoji

# Specify the directory containing the JSON files
json_directory = 'Put_Inbox_Folder_Here\inbox'

def process_json_files(json_files, folder_name):
    messages = []
    
    for file_path in sorted(json_files, key=lambda x: int(x.split('_')[-1].split('.')[0])):
        # Read the JSON file
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
            messages.extend(data['messages'])
    
    # Check the number of unique senders
    senders = {msg.get('sender_name') for msg in messages if 'sender_name' in msg}
    if len(senders) != 2:
        return None
    
    # Create a DataFrame from the messages
    df = pd.DataFrame(messages)
    
    # Sort messages by timestamp in ascending order
    df.sort_values(by='timestamp_ms', inplace=True)
    
    # Function to decode messages
    def decode_message(content):
        if pd.isna(content):
            return ""
        try:
            content = content.encode('latin1').decode('unicode-escape')
        except UnicodeDecodeError:
            content = content.encode('utf-8').decode('utf-8')
        return emoji.demojize(content)
    
    # Apply the decode_message function to the 'content' field
    df['message'] = df.apply(
        lambda x: decode_message(x['content']) if 'content' in x and isinstance(x['content'], str) else "", axis=1
    )
    
    # Add a column for share (yes/no)
    df['share'] = df.apply(
        lambda x: 'yes' if 'share' in x and isinstance(x['share'], dict) else 'no', axis=1
    )
    
    # Add a column for reactions
    df['reactions'] = df.apply(
        lambda x: emoji.demojize(x['reactions'][0]['reaction']) if 'reactions' in x and isinstance(x['reactions'], list) and len(x['reactions']) > 0 else pd.NA, axis=1
    )
    
    # Combine messages from the same sender within 2 minutes
    combined_messages = []
    last_sender = None
    last_timestamp = None
    combined_message = ""
    share = "no"
    reactions = pd.NA
    
    for index, row in df.iterrows():
        sender = row['sender_name']
        timestamp = row['timestamp_ms']
        
        if last_sender == sender and (timestamp - last_timestamp) <= 120000:
            combined_message += "\n\n" + row['message']
        else:
            if combined_message:
                combined_messages.append((last_sender, last_timestamp, combined_message.strip(), share, reactions))
            combined_message = row['message']
            last_sender = sender
            last_timestamp = timestamp
            share = row['share']
            reactions = row['reactions']
    
    if combined_message:
        combined_messages.append((last_sender, last_timestamp, combined_message.strip(), share, reactions))
    
    # Create a new DataFrame with combined messages
    combined_df = pd.DataFrame(combined_messages, columns=['sender_name', 'timestamp_ms', 'message', 'share', 'reactions'])
    combined_df['message_id'] = range(1, len(combined_df) + 1)
    combined_df['folder_name'] = folder_name
    
    return combined_df

# Initialize an empty list to store the DataFrames
df_list = []

# Iterate over all subdirectories in the main directory
for folder_name in os.listdir(json_directory):
    account_dir = os.path.join(json_directory, folder_name)
    if os.path.isdir(account_dir):
        json_files = [os.path.join(account_dir, file) for file in os.listdir(account_dir) if file.startswith('message_') and file.endswith('.json')]
        if json_files:
            df = process_json_files(json_files, folder_name)
            if df is not None:
                df_list.append(df)

# Concatenate all DataFrames into a single DataFrame
final_df = pd.concat(df_list, ignore_index=True)

# Select specific columns and rename them
final_df = final_df[['message_id', 'folder_name', 'sender_name', 'timestamp_ms', 'message', 'share', 'reactions']]
final_df.rename(columns={'timestamp_ms': 'timestamp'}, inplace=True)

# Save the DataFrame to a CSV file
output_file = 'instagram_messages1.csv'
final_df.to_csv(output_file, index=False, encoding='utf-8-sig')

print(f"DataFrame saved to {output_file}")


DataFrame saved to instagram_messages1.csv
