In [22]:
import os
import json
import datetime
from elasticsearch import Elasticsearch
from collections import defaultdict

def create_index_with_mapping(es, index_name):
    settings = {
        "settings": {
            "index": {
                "mapping": {
                    "nested_objects": {
                        "limit": 100000
                    }
                }
            }
        },
        "mappings": {
            "dynamic": "false",
            "properties": {
                "conversation_start": {
                    "type": "date"
                },
                "messages": {
                    "type": "nested",
                    "properties": {
                        "content": {
                            "type": "text",
                            "fields": {
                                "keyword": {
                                    "type": "keyword",
                                    "ignore_above": 256
                                }
                            }
                        },
                        "timestamp_ms": {
                            "type": "long"
                        },
                        "sender_name": {
                            "type": "text",
                            "fields": {
                                "keyword": {
                                    "type": "keyword",
                                    "ignore_above": 256
                                }
                            }
                        }
                    }
                },
                "participants": {
                    "properties": {
                        "name": {
                            "type": "text",
                            "fields": {
                                "keyword": {
                                    "type": "keyword",
                                    "ignore_above": 256
                                }
                            }
                        }
                    }
                }
            }
        }
    }

    es.indices.create(index=index_name, body=settings)

def index_conversations(base_url, index_name, username, password, conversations):
    es = Elasticsearch([base_url], basic_auth=(username, password), verify_certs=False, ssl_show_warn=False)
    
    # Create index with mapping
    if not es.indices.exists(index=index_name):
        create_index_with_mapping(es, index_name)

    for index, (conversation_id, conversation_data) in enumerate(conversations.items()):
        
        # Initialize an empty dict to hold messages for each year
        yearwise_messages = defaultdict(list)

        for message in conversation_data["messages"]:
            timestamp_ms = message["timestamp_ms"]
            timestamp_dt = datetime.datetime.fromtimestamp(timestamp_ms / 1000.0)
            message_year = timestamp_dt.year
            yearwise_messages[message_year].append(message)

        for message_year, messages in yearwise_messages.items():
            doc_id = f"{index + 1}-{message_year}"
            
            conversation_start = min(message["timestamp_ms"] for message in messages)
            conversation_start_dt = datetime.datetime.fromtimestamp(conversation_start / 1000.0)
            conversation = {
                "participants": conversation_data["participants"],
                "messages": messages,
                "conversation_start": conversation_start_dt.isoformat()
            }

            index_data = {
                "index": index_name,
                "id": doc_id,
                "body": conversation
            }

            response = es.index(**index_data)

            if response["result"] == "created":
                print(f"Document indexed successfully: Conversation {doc_id}")
            else:
                print(f"Failed to index document: Conversation {doc_id}")
                print(response)


def main():
    index_name = "fb-msgs"
    base_url = "https://localhost:9200"
    username = "elastic"
    password = "changeme"
    json_directory = "../fb/fb-jsons"

    json_files = [os.path.join(json_directory, file) for file in os.listdir(json_directory) if file.endswith(".json")]

    conversations = {}

    for json_file in json_files:
        with open(json_file, "r") as file:
            data = json.load(file)

        if "participants" not in data:
            continue

        participants = data["participants"]
        messages = data["messages"]

        participant_names = [participant["name"] for participant in participants]
        conversation_id = ",".join(sorted(participant_names))

        if conversation_id in conversations:
            conversations[conversation_id]["messages"].extend(messages)
        else:
            conversations[conversation_id] = {
                "participants": participants,
                "messages": messages
            }

    index_conversations(base_url, index_name, username, password, conversations)

if __name__ == "__main__":
    main()


  es.indices.create(index=index_name, body=settings)
  response = es.index(**index_data)


Document indexed successfully: Conversation 1-2023
Document indexed successfully: Conversation 2-2021
Document indexed successfully: Conversation 3-2023
Document indexed successfully: Conversation 3-2022
Document indexed successfully: Conversation 3-2021
Document indexed successfully: Conversation 4-2017
Document indexed successfully: Conversation 4-2016
Document indexed successfully: Conversation 5-2021
Document indexed successfully: Conversation 6-2016
Document indexed successfully: Conversation 6-2014
Document indexed successfully: Conversation 7-2018
Document indexed successfully: Conversation 8-2023
Document indexed successfully: Conversation 9-2015
Document indexed successfully: Conversation 10-2023
Document indexed successfully: Conversation 11-2019
Document indexed successfully: Conversation 11-2017
Document indexed successfully: Conversation 11-2016
Document indexed successfully: Conversation 11-2015
Document indexed successfully: Conversation 11-2014
Document indexed successf