In [1]:
import json
import os

In [6]:
# Read the allow-list of GUIDs from disk. It is stored as a set because the
# filtering step below only ever asks "is this guid in the list?".
with open('guids.json', 'r') as guid_file:
    guids_data = json.load(guid_file)
valid_guids = {*guids_data['guids']}

# Accumulates one single-key dict {guid: {attribute subset}} per matching event.
filtered_events = []

# range(2015, 2022) visits the yearly dumps for 2015 through 2021 inclusive.
for year in range(2015, 2022):
    filename = f'{year}_full.json'

    # A missing year is reported and skipped rather than treated as fatal.
    if not os.path.exists(filename):
        print(f"File not found: {filename}")
        continue

    with open(filename, 'r') as year_file:
        yearly_dump = json.load(year_file)

    # Keep only events whose guid is on the allow-list, and only the
    # attributes needed downstream (copied in this fixed order).
    for record in yearly_dump['events']:
        attributes = record['attribute']
        guid = attributes['guid']
        if guid not in valid_guids:
            continue
        filtered_events.append({
            guid: {
                field: attributes[field]
                for field in (
                    'sentiment_polarity',
                    'sentiment_subjectivity',
                    'avg_sentence_length',
                    'named_entities',
                    'topic',
                )
            }
        })

# Persist every filtered record in one combined file, as a list stored under
# the 'articles' key.
output_data = dict(articles=filtered_events)

with open('filtered_events.json', 'w') as combined_file:
    json.dump(output_data, combined_file)

# Split the data into many json files to avoid memory issues on the Lambda function
step = 3000

# open(..., 'w') does not create missing parent directories, so make sure the
# output directory exists before writing the first chunk.
os.makedirs('results', exist_ok=True)

for chunk_index, start in enumerate(range(0, len(filtered_events), step)):
    # Making guid the key: merge this slice's single-key {guid: attributes}
    # records into one mapping. If a guid repeats within the slice, the later
    # record overwrites the earlier one.
    data_items = {}
    for item in filtered_events[start:start + step]:
        data_items.update(item)

    output_data = {'articles': data_items}
    with open(f'results/filtered_events_{chunk_index}.json', 'w') as file:
        json.dump(output_data, file)
