In [1]:
# this outputs the dataset as folders of text files, each folder corresponds to a category

In [1]:
import json
import gzip
from random import shuffle

def toFolders(source_field, start, end, minConf, min_text_len, dir_in, suffix, dir_out, instances_per_class):
    
    idx = 0;
    
    listEvents = []
    for year in range(start,end+1):
        filename='wiki-events-' + str(year) + suffix + '.json.gz'
        print("loading file " + filename + " ...")
        with gzip.open(os.path.join(dir_in, filename), "rb") as f:
            events = json.loads(f.read().decode("utf8"))
        print("found " + str(len(events['results'])) + " events...")
        listEvents = listEvents + events['results']
        print("total: " + str(len(listEvents)) + " events...")

    classes = {'armed conflicts and attacks':'armed conflicts and attacks', 
               'politics and elections':'politics and elections', 
               'law and crime':'law and crime', 'disasters and accidents':'disasters and accidents', 
               #'international relations':'international relations', 
               'sport':'sport', 'business and economy':'business and economy', 
               'arts and culture':'arts and culture', 'science and technology':'science and technology',
               'science': 'science and technology', 
               #'health and medicine': 'health and medicine',
               #'health and environmnet': 'health and medicine',
              }
    classInstances = {}

    shuffle(listEvents)
    
    for event in listEvents:
        #check is the keys are present
        if 'event-type' in event and 'full-text' in event:
            #keep only events with non empty full-text and event-type
            if event['full-text'] and event['event-type'] and len(event['full-text'])>min_text_len:
                label = event['event-type']
                if (label in classes):
                    entities = event['entities']
                    if source_field=='entities':
                        text = ''
                        for entity in entities:
                            avgconf = sum(float(i) for i in entity['confidence'])/len(entity['confidence'])
                            if avgconf > minConf:
                                text = text + ' ' + entity['label']
                    elif source_field=='short-text':
                        text = event['event']
                    elif source_field=='full-text':
                        text = event['full-text']
                    elif source_field=='entity-categories':
                        text = ''
                        for entity in entities:
                            avgconf = sum(float(i) for i in entity['confidence'])/len(entity['confidence'])
                            if avgconf > minConf:
                                text = text + ' ' + " ".join(entity['categories']).replace("_"," ")
                                    
                    if (not classes[label] in classInstances):
                        classInstances[classes[label]] = []
                    
                    if (len(classInstances[classes[label]]) >= instances_per_class):
                        continue
                        
                    classInstances[classes[label]].append(text)
                    

    print("Selected total events: " + str(idx))
    train_n = 0;
    test_n = 0;
    for cls in classInstances:
        insts = classInstances[cls]
        test_index = int(len(insts)*0.8)
        train = insts[:test_index]
        test = insts[test_index:]
        print(cls, len(insts))
    
        idx = 0;
        for text_tr in train:
            out_path = os.path.join(dir_out + "/train", cls)
            if not os.path.exists(out_path):
                os.makedirs(out_path)
            with open(os.path.join(out_path, str(idx)), 'a') as the_file:
                the_file.write(text_tr)   
            idx += 1
            train_n += 1
        print("train: " + str(idx))
        idx = 0
        for text_tr in test:
            out_path = os.path.join(dir_out + "/test", cls)
            if not os.path.exists(out_path):
                os.makedirs(out_path)
            with open(os.path.join(out_path, str(idx)), 'a') as the_file:
                the_file.write(text_tr)
            idx += 1
            test_n += 1
        print("test: " + str(idx))
    print("total train: " + str(train_n))
    print("total test: " + str(test_n))
        

In [2]:
import os
years_range = [2010,2017]
source_field = "full-text" #'entity-categories'
min_text_len = 50
instances_per_class = 10000 #1207
data_dir='../events-allignement/data/wikipedia-events-portal/clean/json'
#dir_out= 'Text_Classifier/.data/WE_clean_balanced_' + str(instances_per_class) + '/' + str(years_range[0]) + '-' + str(years_range[1]) + '-' + source_field
dir_out= 'Text_Classifier/.data/WE_clean_10/' + str(years_range[0]) + '-' + str(years_range[1]) + '-' + source_field
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
%time toFolders(source_field,years_range[0],years_range[1],0.6, 50, data_dir, "_multilink_data_clean", dir_out, instances_per_class)

loading file wiki-events-2010_multilink_data_clean.json.gz ...
found 12888 events...
total: 12888 events...
loading file wiki-events-2011_multilink_data_clean.json.gz ...
found 8997 events...
total: 21885 events...
loading file wiki-events-2012_multilink_data_clean.json.gz ...
found 6513 events...
total: 28398 events...
loading file wiki-events-2013_multilink_data_clean.json.gz ...
found 5858 events...
total: 34256 events...
loading file wiki-events-2014_multilink_data_clean.json.gz ...
found 3698 events...
total: 37954 events...
loading file wiki-events-2015_multilink_data_clean.json.gz ...
found 5921 events...
total: 43875 events...
loading file wiki-events-2016_multilink_data_clean.json.gz ...
found 6186 events...
total: 50061 events...
loading file wiki-events-2017_multilink_data_clean.json.gz ...
found 3212 events...
total: 53273 events...
Selected total events: 0
sport 2349
train: 1879
test: 470
politics and elections 5727
train: 4581
test: 1146
law and crime 5068
train: 4054
tes