In [3]:
# !pip install spacy

In [4]:
import spacy
import datetime
import random
import string
from dateutil import parser
import json

In [57]:
from tqdm import tqdm

In [7]:
# Read the chat export and keep only the list stored under its top-level
# 'messages' key. The encoding is named explicitly so that loading does not
# depend on the platform's default locale encoding.
with open('gpt4all-chat.json', 'r', encoding='utf-8') as f:
  data = json.load(f)
data = data['messages']

In [9]:
 # One-time setup: download the en_core_web_sm pipeline that spacy.load() reads in the next cell.
 !python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.5.0/en_core_web_sm-3.5.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.5.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [10]:
# Step 3: Load the spaCy model
# `nlp` is the module-level pipeline that extract_named_entities() calls on each text.
nlp = spacy.load("en_core_web_sm")

In [None]:
    # PERSON - People, including fictional.
    # NORP - Nationalities or religious or political groups.
    # FAC - Buildings, airports, highways, bridges, etc.
    # ORG - Companies, agencies, institutions, etc.
    # GPE - Countries, cities, states.

In [63]:
# Step 5: Define a function to extract named entities from a given text
def extract_named_entities(text):
    """Run the module-level ``nlp`` pipeline on ``text`` and list its entities.

    Args:
        text: The string to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per item in the
        processed document's ``ents``, in the order the pipeline yields them.
    """
    return [(span.text, span.label_) for span in nlp(text).ents]

# Step 7: Define a function to generate entries with the desired format using spaCy named entities and word frequency
def generate_entries(message_dataset):
    """Group the named entities found in the messages into one record per week.

    The week windows come from ``get_weeks_range(message_dataset)``. For each
    window, every message whose ``message['timestamp'].date()`` falls inside
    it (both ends inclusive) is passed to ``extract_named_entities`` and each
    entity occurrence becomes one entry.

    Args:
        message_dataset: Iterable-and-reiterable collection (e.g. a list) of
            dicts with a ``'timestamp'`` (an object exposing ``.date()``) and
            a ``'text'`` string.

    Returns:
        A list with one dict per week, shaped as
        ``{"date": <week start, ISO format>, "words": {<topic>: [entry, ...]}}``.
        Each entry is ``{"text", "frequency", "topic", "id"}``, where
        ``frequency`` is the running count of that lower-cased word within
        the week (shared across topics) at the moment the entry was created,
        and ``id`` is ``"<word>_<topic>_<frequency>"``.
    """
    # Topics that always appear in the output, even when empty, so consumers
    # can rely on these keys being present.
    known_topics = [
        label.lower()
        for label in (
            'ORG', 'CARDINAL', 'DATE', 'GPE', 'PERSON', 'MONEY', 'PRODUCT',
            'TIME', 'PERCENT', 'WORK_OF_ART', 'QUANTITY', 'NORP', 'LOC',
            'EVENT', 'ORDINAL', 'FAC', 'LAW', 'LANGUAGE',
        )
    ]

    entries = []

    for week_start, week_end in tqdm(get_weeks_range(message_dataset)):
        words_by_topic = {topic: [] for topic in known_topics}
        word_counts = {}

        for message in message_dataset:
            message_date = message['timestamp'].date()
            if not (week_start <= message_date <= week_end):
                continue

            for entity, label in extract_named_entities(message['text']):
                topic = label.lower()
                word = entity.lower()

                word_counts[word] = word_counts.get(word, 0) + 1
                frequency = word_counts[word]

                # setdefault keeps a label outside ``known_topics`` (for
                # example from a different model) from raising KeyError.
                words_by_topic.setdefault(topic, []).append({
                    "text": word,
                    "frequency": frequency,
                    "topic": topic,
                    "id": f"{word}_{topic}_{frequency}",
                })

        entries.append({
            "date": week_start.isoformat(),
            "words": words_by_topic,
        })

    return entries

# Step 8: Define a function to get the range of weeks in the message dataset
def get_weeks_range(message_dataset):
    """Return consecutive 7-day windows that together cover every message date.

    The first window starts on the earliest message date (not necessarily a
    Monday); each following window starts seven days after the previous one.
    Windows are produced until the latest message date is covered.

    The number of windows is derived from the actual day span rather than
    from ISO week numbers: week numbers restart every ISO year, so
    subtracting them gives an empty or wrong range whenever the data crosses
    a year boundary (e.g. 2023-01-01 is ISO week 52, 2023-03-01 is week 9).

    Args:
        message_dataset: Iterable of dicts whose ``'timestamp'`` is a
            ``datetime.datetime`` or ``datetime.date``.

    Returns:
        A list of ``(week_start, week_end)`` tuples of ``datetime.date``,
        both ends inclusive, ``week_end`` being six days after ``week_start``.
        An empty dataset yields an empty list.
    """
    def _as_date(timestamp):
        # datetime is a subclass of date, so it has to be tested for first.
        if isinstance(timestamp, datetime.datetime):
            return timestamp.date()
        return timestamp

    message_dates = [_as_date(message['timestamp']) for message in message_dataset]
    if not message_dates:
        return []

    min_date = min(message_dates)
    max_date = max(message_dates)

    # Whole weeks elapsed between first and last message, plus the first window.
    week_count = (max_date - min_date).days // 7 + 1

    weeks_range = []
    for offset in range(week_count):
        week_start = min_date + datetime.timedelta(weeks=offset)
        week_end = week_start + datetime.timedelta(days=6)
        weeks_range.append((week_start, week_end))

    return weeks_range



In [64]:
# Step 9: Build the message dataset as a list of dictionaries, one per message,
# each holding the parsed timestamp and the text content.
# This must run before generate_entries(message_dataset); leaving it commented
# out makes the notebook fail with NameError on a fresh kernel.
message_dataset = [
    {
        'timestamp': parser.parse(d['timestamp']),
        'text': d['content'],
    }
    for d in data
]

# random.choice(message_dataset)

In [65]:
# Build the per-week entity entries for the whole dataset, then display how
# many weekly records were produced.
entries = generate_entries(message_dataset)

len(entries)

100%|█████████████████████████████████████████████| 8/8 [05:14<00:00, 39.31s/it]


8

In [67]:
# Persist the weekly entries as one JSON document on disk.
with open('new_data.json', 'w') as f:
    f.write(json.dumps(entries))