In [3]:
# !pip install spacy

In [80]:
import spacy
import datetime
import random
import string
from dateutil import parser
import json

In [81]:
from tqdm import tqdm

In [82]:
# Load the exported chat log. JSON exchanged between systems is UTF-8, so the
# encoding is stated explicitly instead of relying on the platform default,
# which is not UTF-8 everywhere and can fail on (or garble) non-ASCII text.
with open('atlas-chat.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
# Keep only the list stored under the export's 'messages' key.
data = data['messages']

In [83]:
 # !python -m spacy download en_core_web_sm

In [84]:
# Sanity check: how many messages were loaded from the export.
len(data)

131

In [10]:
# Step 3: Load the spaCy model
# The `en_core_web_sm` model must already be installed in the kernel's
# environment (e.g. `python -m spacy download en_core_web_sm`). The resulting
# `nlp` pipeline is what `extract_named_entities` calls on each message.
nlp = spacy.load("en_core_web_sm")

In [None]:
    # PERSON - People, including fictional.
    # NORP - Nationalities or religious or political groups.
    # FAC - Buildings, airports, highways, bridges, etc.
    # ORG - Companies, agencies, institutions, etc.
    # GPE - Countries, cities, states.

In [72]:
# One "LABEL: description" line per spaCy named-entity label.
nerLookupStr = '''PERSON:      People, including fictional.
NORP:        Nationalities or religious or political groups.
FAC:         Buildings, airports, highways, bridges, etc.
ORG:         Companies, agencies, institutions, etc.
GPE:         Countries, cities, states.
LOC:         Non-GPE locations, mountain ranges, bodies of water.
PRODUCT:     Objects, vehicles, foods, etc. (Not services.)
EVENT:       Named hurricanes, battles, wars, sports events, etc.
WORK_OF_ART: Titles of books, songs, etc.
LAW:         Named documents made into laws.
LANGUAGE:    Any named language.
DATE:        Absolute or relative dates or periods.
TIME:        Times smaller than a day.
PERCENT:     Percentage, including ”%“.
MONEY:       Monetary values, including unit.
QUANTITY:    Measurements, as of weight or distance.
ORDINAL:     “first”, “second”, etc.
CARDINAL:    Numerals that do not fall under another type.'''

# Map each lower-cased label to its whitespace-trimmed description.
nerLookup = {
    label.strip().lower(): description.strip()
    for label, description in (
        entry.split(':', 1) for entry in nerLookupStr.split('\n')
    )
}

In [73]:
# Display the label -> description mapping to confirm it parsed as expected.
nerLookup

{'person': 'People, including fictional.',
 'norp': 'Nationalities or religious or political groups.',
 'fac': 'Buildings, airports, highways, bridges, etc.',
 'org': 'Companies, agencies, institutions, etc.',
 'gpe': 'Countries, cities, states.',
 'loc': 'Non-GPE locations, mountain ranges, bodies of water.',
 'product': 'Objects, vehicles, foods, etc. (Not services.)',
 'event': 'Named hurricanes, battles, wars, sports events, etc.',
 'work_of_art': 'Titles of books, songs, etc.',
 'law': 'Named documents made into laws.',
 'language': 'Any named language.',
 'date': 'Absolute or relative dates or periods.',
 'time': 'Times smaller than a day.',
 'percent': 'Percentage, including ”%“.',
 'money': 'Monetary values, including unit.',
 'quantity': 'Measurements, as of weight or distance.',
 'ordinal': '“first”, “second”, etc.',
 'cardinal': 'Numerals that do not fall under another type.'}

In [88]:
# Step 5: Define a function to extract named entities from a given text
def extract_named_entities(text):
    """Run the spaCy pipeline on ``text`` and collect its named entities.

    Args:
        text: The string to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per item in the
        parsed document's ``ents``, in the order they are reported.
    """
    return [(span.text, span.label_) for span in nlp(text).ents]

# Step 7: Define a function to generate entries with the desired format using spaCy named entities and word frequency
def generate_entries(message_dataset):
    """Build one entry per week listing the named entities seen in that week.

    Args:
        message_dataset: List of dicts, each with a ``'timestamp'`` (an object
            exposing ``.date()``) and a ``'text'`` string.

    Returns:
        A list with one dict per ``(week_start, week_end)`` pair produced by
        ``get_weeks_range``, shaped ``{"date": <ISO start date>, "words": {...}}``.
        ``"words"`` maps every kept topic description (taken from ``nerLookup``)
        to a list of records. A record is appended for *each occurrence* of an
        entity and carries that word's running count within the week as
        ``"frequency"``; the count is keyed on the word alone, so it is shared
        across topics.
    """
    # Numeric / temporal labels are excluded from the output entirely.
    ignored_labels = ('time', 'percent', 'quantity', 'date', 'cardinal', 'ordinal', 'money')
    known_labels = (
        'ORG', 'CARDINAL', 'DATE', 'GPE', 'PERSON', 'MONEY', 'PRODUCT', 'TIME',
        'PERCENT', 'WORK_OF_ART', 'QUANTITY', 'NORP', 'LOC', 'EVENT', 'ORDINAL',
        'FAC', 'LAW', 'LANGUAGE',
    )

    weekly = []

    for week_start, week_end in tqdm(get_weeks_range(message_dataset)):
        # Pre-create an empty bucket per kept topic so every week exposes the
        # same set of keys, even when nothing was found for a topic.
        buckets = {}
        for label in known_labels:
            label_key = label.lower()
            if label_key not in ignored_labels:
                buckets.setdefault(nerLookup[label_key], [])

        seen = {}  # word -> number of occurrences so far in this week

        for message in message_dataset:
            if not (week_start <= message['timestamp'].date() <= week_end):
                continue

            for entity, label in extract_named_entities(message['text']):
                label_key = label.lower()
                if label_key in ignored_labels:
                    continue
                topic = nerLookup[label_key]

                word = entity.lower()
                # Skip bare numbers and anything containing '#'.
                if word.isnumeric() or '#' in word:
                    continue

                count = seen.get(word, 0) + 1
                seen[word] = count

                buckets[topic].append({
                    "text": word,
                    "frequency": count,
                    "topic": topic,
                    "id": f"{word}_{topic}_{count}",
                })

        weekly.append({"date": week_start.isoformat(), "words": buckets})

    return weekly

# Step 8: Define a function to get the range of weeks in the message dataset
def get_weeks_range(message_dataset):
    """Return consecutive 7-day windows covering every message date.

    Windows start on the earliest message date (not necessarily a Monday), and
    one window is produced per ISO calendar week touched by the data. The last
    window may therefore extend past, or lie entirely after, the latest message.

    The window count is derived from the Monday-aligned distance between the
    first and last date rather than from raw ISO week numbers. Week numbers
    restart every year, so subtracting them gives no windows at all (or too
    few) once the messages cross a year boundary.

    Args:
        message_dataset: List of dicts whose ``'timestamp'`` is a
            ``datetime.datetime``.

    Returns:
        A chronologically ordered list of ``(week_start, week_end)`` tuples of
        ``datetime.date``; an empty list when ``message_dataset`` is empty.
    """
    if not message_dataset:
        return []

    message_dates = [message['timestamp'].date() for message in message_dataset]
    min_date = min(message_dates)
    max_date = max(message_dates)

    # Snap both ends to the Monday of their ISO week so the difference is an
    # exact multiple of 7 days regardless of year boundaries.
    first_monday = min_date - datetime.timedelta(days=min_date.weekday())
    last_monday = max_date - datetime.timedelta(days=max_date.weekday())
    week_count = (last_monday - first_monday).days // 7 + 1

    weeks_range = []
    for offset in range(week_count):
        week_start = min_date + datetime.timedelta(weeks=offset)
        week_end = week_start + datetime.timedelta(days=6)
        weeks_range.append((week_start, week_end))

    return weeks_range



In [89]:
# Step 9: Build the message dataset as a list of dictionaries, one per message, each holding the parsed timestamp and the text content.

# Built from the messages loaded from atlas-chat.json (not a hand-written sample)
# Keep only the two fields the analysis needs, running each raw timestamp
# through dateutil's parser along the way.
message_dataset = [
    {'timestamp': parser.parse(record['timestamp']), 'text': record['content']}
    for record in data
]

# Show one message picked at random to eyeball the resulting structure.
random.choice(message_dataset)

{'timestamp': datetime.datetime(2023, 4, 23, 16, 30, 54, 524000, tzinfo=tzutc()),
 'text': "you can do\n```\nimport nomic\nnomic.login('yourtoken')\n```"}

In [90]:
# Run the weekly named-entity extraction over every message.
entries = generate_entries(message_dataset)

# One entry is produced per week window returned by get_weeks_range.
len(entries)

100%|█████████████████████████████████████████████| 8/8 [00:03<00:00,  2.61it/s]


8

In [91]:
# Write to JSON: persist the weekly entries to atlas_data.json. Opening in 'w'
# mode replaces any existing file of that name.
with open('atlas_data.json', 'w') as f:
    json.dump(entries, f)