In [10]:
# This notebook writes the dataset as folders of text files; each folder corresponds to a category.

In [50]:
import gzip
import json
import os
import random
from random import shuffle

import data_utils.WE_2 as WE_2

def k_fold_split(listEvents, classes, num_folders, path, test_train_ratio, writeIndexes, writeJSON, GZIP):
    """Write ``num_folders`` stratified random train/test splits of ``listEvents`` to disk.

    Despite the name, the folds are not a partition of the data: every fold
    reshuffles each class and takes the first ``test_train_ratio`` fraction as
    the test set, so test sets of different folds may overlap.

    Layout produced under ``path``::

        classes.txt                      one "<class> : <count>" line per class
        <fold>/train/<class>/idx.txt     event ids, one per line (writeIndexes)
        <fold>/test/<class>/idx.txt
        <fold>/train.json[.gzip]         {"results": [events...]} (writeJSON)
        <fold>/test.json[.gzip]

    Args:
        listEvents: list of event dicts; each must have an ``"event-type"`` key,
            and an ``"id"`` key when ``writeIndexes`` is true.
        classes: dict mapping a raw event type to its target class name.
            Events whose type is not a key are ignored.
        num_folders: number of splits to generate; folders are named 1..num_folders.
        path: output directory (created if missing; existing files are overwritten).
        test_train_ratio: fraction of each class that goes to the test set.
        writeIndexes: if true, write the per-class ``idx.txt`` files.
        writeJSON: if true, write the full train/test events of each fold as JSON.
        GZIP: if true, the JSON files are gzip-compressed (``*.json.gzip``).

    Raises:
        KeyError: if an event lacks ``"event-type"`` (or ``"id"`` when needed).
    """
    # Group events by target class. List *positions* are stored rather than
    # event ids so that events can be looked up again for the JSON export even
    # when ids are not list indexes. random.shuffle picks its permutation from
    # the list length and RNG state only, so the resulting id order in idx.txt
    # is the same as when ids themselves were shuffled.
    classInstances = {}
    for position, event in enumerate(listEvents):
        cls = event["event-type"]
        if cls in classes:
            classInstances.setdefault(classes[cls], []).append(position)

    os.makedirs(path, exist_ok=True)

    print("Total events:" + str(len(listEvents)))

    # Write classes
    with open(os.path.join(path, "classes.txt"), 'w') as fout:
        for cla in classInstances:
            fout.write(cla + " : " + str(len(classInstances[cla])) + "\n")

    print("Writing dataset to " + path)
    for fi in range(1, num_folders + 1):

        fold_path = os.path.join(path, str(fi))
        train_path = os.path.join(fold_path, "train")
        test_path = os.path.join(fold_path, "test")

        # exist_ok keeps the function re-runnable on an existing output tree.
        for cla in classInstances:
            os.makedirs(os.path.join(train_path, cla), exist_ok=True)
            os.makedirs(os.path.join(test_path, cla), exist_ok=True)

        train_events = []
        test_events = []
        # Seeding with the fold number makes a full run reproducible. The class
        # lists are shuffled in place, so fold k starts from fold k-1's order.
        random.seed(fi)
        for cl in classInstances:
            shuffle(classInstances[cl])
            idx = int(len(classInstances[cl]) * test_train_ratio)
            print("Class: " + cl + " instances: " + str(len(classInstances[cl])) + " idx: " + str(idx))

            test_events_cl = classInstances[cl][:idx]
            train_events_cl = classInstances[cl][idx:]

            # Write datasets indexes wrt to the original files
            if writeIndexes:
                with open(os.path.join(test_path, cl, "idx.txt"), 'w') as fout:
                    fout.write("\n".join(str(listEvents[pos]["id"]) for pos in test_events_cl))
                with open(os.path.join(train_path, cl, "idx.txt"), 'w') as fout:
                    fout.write("\n".join(str(listEvents[pos]["id"]) for pos in train_events_cl))

            train_events.extend(train_events_cl)
            test_events.extend(test_events_cl)

        # Write JSON files to disk, once per fold, after every class has been
        # added to the fold-level train/test lists.
        if writeJSON:
            test_data = {'results': [listEvents[pos] for pos in test_events]}
            train_data = {'results': [listEvents[pos] for pos in train_events]}
            if GZIP:
                with gzip.open(os.path.join(fold_path, "test.json.gzip"), 'w') as outfile:
                    outfile.write(json.dumps(test_data, indent=4).encode('utf-8'))
                with gzip.open(os.path.join(fold_path, "train.json.gzip"), 'w') as outfile:
                    outfile.write(json.dumps(train_data, indent=4).encode('utf-8'))
            else:
                with open(os.path.join(fold_path, "test.json"), 'w') as outfile:
                    json.dump(test_data, outfile, indent=4)
                with open(os.path.join(fold_path, "train.json"), 'w') as outfile:
                    json.dump(train_data, outfile, indent=4)

        print("train: " + str(len(train_events)))
        print("test: " + str(len(test_events)))
               

In [52]:
import os

# NOTE(review): a bare `%time` line times an empty statement (the recorded
# output below reports only microseconds); the load itself is timed by the
# `%time` prefix on the last line of this cell.
%time

# First and last year to load; the recorded log shows one file per year from
# 2010 through 2018, i.e. both ends are included.
years_range = [2010,2018]
# Threshold handed to WE_2.filterEvents — presumably the minimum text length
# an event must have to be kept; confirm against WE_2.
min_text_len = 50
# Directory holding the yearly event dumps.
dir_in='../events-allignement/data/wikipedia-events-portal/clean/json'
# File names are built as <prefix><year><suffix>.json.gz (see the log below).
suffix = "_multilink_data_id_clean"
prefix = "wiki-events-"

# Load every yearly file, then filter; the result is reused by later cells.
%time listEvents = WE_2.filterEvents(WE_2.getEvents(years_range[0],years_range[1], dir_in, prefix, suffix), min_text_len)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 6.2 µs
loading file wiki-events-2010_multilink_data_id_clean.json.gz ...
found 12888 events...
total: 12888 events...
loading file wiki-events-2011_multilink_data_id_clean.json.gz ...
found 8874 events...
total: 21762 events...
loading file wiki-events-2012_multilink_data_id_clean.json.gz ...
found 6513 events...
total: 28275 events...
loading file wiki-events-2013_multilink_data_id_clean.json.gz ...
found 5675 events...
total: 33950 events...
loading file wiki-events-2014_multilink_data_id_clean.json.gz ...
found 3698 events...
total: 37648 events...
loading file wiki-events-2015_multilink_data_id_clean.json.gz ...
found 5921 events...
total: 43569 events...
loading file wiki-events-2016_multilink_data_id_clean.json.gz ...
found 6186 events...
total: 49755 events...
loading file wiki-events-2017_multilink_data_id_clean.json.gz ...
found 3212 events...
total: 52967 events...
loading file wiki-events-2018_multilink_data_id_clean.js

In [54]:

# Maps a raw event type to the class it is exported under. Types that are not
# keys of this dict are skipped by k_fold_split. 'science' is merged into
# 'science and technology', so the ten keys yield nine distinct classes.
classes = {'armed conflicts and attacks':'armed conflicts and attacks', 
           'politics and elections':'politics and elections', 
           'law and crime':'law and crime', 'disasters and accidents':'disasters and accidents', 
           'international relations':'international relations', 
           'sport':'sport', 'business and economy':'business and economy', 
           'arts and culture':'arts and culture', 'science and technology':'science and technology',
           'science': 'science and technology', 
           #'health and medicine': 'health',
           #'health and environment': 'health',
          }
          
# The string literal below is a disabled alternative mapping (identical except
# that 'international relations' is commented out); it is never assigned.
'''
classes = {'armed conflicts and attacks':'armed conflicts and attacks', 
           'politics and elections':'politics and elections', 
           'law and crime':'law and crime', 'disasters and accidents':'disasters and accidents', 
           #'international relations':'international relations', 
           'sport':'sport', 'business and economy':'business and economy', 
           'arts and culture':'arts and culture', 'science and technology':'science and technology',
           'science': 'science and technology', 
           #'health and medicine': 'health',
           #'health and environment': 'health',
          }
'''

# Number of distinct target classes (values, not keys, of the mapping).
num_classes = len(set(classes.values()))

# Number of train/test splits to generate.
num_folders = 30
# Only relevant when WriteJSON is True: compress the JSON dumps.
GZIP = True
# Write per-class idx.txt files with the selected event ids.
WriteIndexes = True
# Do not dump the full events of each split as JSON.
WriteJSON = False
# e.g. data/30-fold-9-classes-2010-2018 for the settings above.
dir_out='data/' + str(num_folders) + '-fold-' + str(num_classes) + '-classes-' + str(years_range[0]) + '-' + str(years_range[1])

# 0.2 is the fraction of each class that goes to the test set.
# Relies on listEvents and years_range from the loading cell.
%time k_fold_split(listEvents, classes, num_folders, dir_out, 0.2, WriteIndexes, WriteJSON, GZIP)

Total events:41560
Writing dataset to data/30-fold-9-classes-2010-2018
Class: armed conflicts and attacks instances: 9783 idx: 1956
Class: arts and culture instances: 2008 idx: 401
Class: business and economy instances: 2638 idx: 527
Class: disasters and accidents instances: 5575 idx: 1115
Class: international relations instances: 4187 idx: 837
Class: law and crime instances: 5982 idx: 1196
Class: politics and elections instances: 6640 idx: 1328
Class: sport instances: 2544 idx: 508
Class: science and technology instances: 1482 idx: 296
train: 32675
test: 8164
Class: armed conflicts and attacks instances: 9783 idx: 1956
Class: arts and culture instances: 2008 idx: 401
Class: business and economy instances: 2638 idx: 527
Class: disasters and accidents instances: 5575 idx: 1115
Class: international relations instances: 4187 idx: 837
Class: law and crime instances: 5982 idx: 1196
Class: politics and elections instances: 6640 idx: 1328
Class: sport instances: 2544 idx: 508
Class: science a

Class: business and economy instances: 2638 idx: 527
Class: disasters and accidents instances: 5575 idx: 1115
Class: international relations instances: 4187 idx: 837
Class: law and crime instances: 5982 idx: 1196
Class: politics and elections instances: 6640 idx: 1328
Class: sport instances: 2544 idx: 508
Class: science and technology instances: 1482 idx: 296
train: 32675
test: 8164
Class: armed conflicts and attacks instances: 9783 idx: 1956
Class: arts and culture instances: 2008 idx: 401
Class: business and economy instances: 2638 idx: 527
Class: disasters and accidents instances: 5575 idx: 1115
Class: international relations instances: 4187 idx: 837
Class: law and crime instances: 5982 idx: 1196
Class: politics and elections instances: 6640 idx: 1328
Class: sport instances: 2544 idx: 508
Class: science and technology instances: 1482 idx: 296
train: 32675
test: 8164
Class: armed conflicts and attacks instances: 9783 idx: 1956
Class: arts and culture instances: 2008 idx: 401
Class: b

In [None]:
def toFolders(listEvents, source_field, minConf, min_text_len, dir_out, instances_per_class, train_ratio=0.8):
    """Export a per-class sample of events as folders of text files.

    Events are kept when they have a non-empty ``'event-type'`` that is a key
    of the notebook-level ``classes`` mapping and a ``'full-text'`` longer than
    ``min_text_len``. At most ``instances_per_class`` events are collected per
    target class, in input order. Each class is then split, in that order,
    into a leading ``train_ratio`` fraction (train) and the remainder (test),
    and every item is written to ``<dir_out>/<train|test>/<class>/<n>``, where
    ``n`` counts from 0 within each class and split. For ``source_field ==
    'combo'`` two files are written per item, under the ``full-text`` and
    ``categories`` subfolders.

    Args:
        listEvents: iterable of event dicts.
        source_field: what to write for each event:
            ``'full-text'`` (the event's full text), ``'short-text'`` (its
            ``'event'`` field), ``'entities'`` (the ``'spot'`` of every
            confident entity), ``'entity-categories'`` (the categories of
            every confident entity, underscores replaced by spaces) or
            ``'combo'`` (full text plus entity categories).
        minConf: an entity is "confident" when the mean of its
            ``'confidence'`` values is strictly greater than this.
        min_text_len: minimum (exclusive) length of ``'full-text'``.
        dir_out: root output directory; created as needed.
        instances_per_class: maximum number of items kept per class.
        train_ratio: fraction of each class written to ``train`` (default 0.8).

    Raises:
        ValueError: if ``source_field`` is not one of the supported values and
            at least one event qualifies for export.

    Notes:
        Reads the global ``classes`` dict (raw event type -> target class).
        Output files are overwritten, so re-running does not duplicate text.
    """
    classInstances = {}

    for event in listEvents:
        # We select only events with associated event type and full text ...
        if 'event-type' not in event or 'full-text' not in event:
            continue
        # ... that are non-empty and long enough.
        if not (event['full-text'] and event['event-type'] and len(event['full-text']) > min_text_len):
            continue
        label = event['event-type']
        if label not in classes:
            continue

        # Register the class even if the cap is 0 so that it is still reported.
        instances = classInstances.setdefault(classes[label], [])
        # Check the cap before building the item to avoid wasted work.
        if len(instances) >= instances_per_class:
            continue
        instances.append(_build_item(event, source_field, minConf))

    train_n = 0
    test_n = 0
    for cls in classInstances:
        insts = classInstances[cls]
        test_index = int(len(insts) * train_ratio)
        print(cls, len(insts))

        written = _write_items(insts[:test_index], os.path.join(dir_out + "/train", cls), source_field)
        train_n += written
        print("train: " + str(written))

        written = _write_items(insts[test_index:], os.path.join(dir_out + "/test", cls), source_field)
        test_n += written
        print("test: " + str(written))
    print("total train: " + str(train_n))
    print("total test: " + str(test_n))


def _confident_entities(entities, minConf):
    """Yield the entities whose mean confidence is strictly greater than minConf."""
    for entity in entities:
        scores = [float(score) for score in entity['confidence']]
        # An entity without confidence values cannot be averaged; skip it.
        if scores and sum(scores) / len(scores) > minConf:
            yield entity


def _categories_text(entities, minConf):
    """Concatenate the categories of all confident entities, space separated."""
    text_cats = ''
    for entity in _confident_entities(entities, minConf):
        text_cats = text_cats + ' ' + " ".join(entity['categories']).replace("_", " ")
    return text_cats


def _build_item(event, source_field, minConf):
    """Return the text (or, for 'combo', the dict of texts) to export for one event."""
    if source_field == 'short-text':
        return event['event']
    if source_field == 'full-text':
        return event['full-text']

    # Entities are only needed by the remaining modes; tolerate their absence.
    entities = event.get('entities') or []
    if source_field == 'entities':
        return ''.join(' ' + entity['spot'] for entity in _confident_entities(entities, minConf))
    if source_field == 'entity-categories':
        return _categories_text(entities, minConf)
    if source_field == 'combo':
        return {"full-text": event['full-text'], "categories": _categories_text(entities, minConf)}
    raise ValueError("unsupported source_field: " + repr(source_field))


def _write_items(items, out_path, source_field):
    """Write each item to a file named by its position under out_path; return the count."""
    if not items:
        # Nothing to write: do not create empty class folders.
        return 0

    if source_field == 'combo':
        # One subfolder per part of the combined item.
        targets = {"full-text": out_path + "/full-text", "categories": out_path + "/categories"}
        for target_dir in targets.values():
            os.makedirs(target_dir, exist_ok=True)
        for idx, item in enumerate(items):
            for key, target_dir in targets.items():
                with open(os.path.join(target_dir, str(idx)), 'w', encoding='utf-8') as the_file:
                    the_file.write(item[key])
    else:
        os.makedirs(out_path, exist_ok=True)
        for idx, item in enumerate(items):
            with open(os.path.join(out_path, str(idx)), 'w', encoding='utf-8') as the_file:
                the_file.write(item)
    return len(items)

In [39]:
import os
years_range = [2010,2010]
source_field = "combo" #'entity-categories'
min_text_len = 50
instances_per_class = 50 #1207
suffix = "_multilink_data_clean"
data_dir='../events-allignement/data/wikipedia-events-portal/clean/json'
#dir_out= 'Text_Classifier/.data/WE_clean_balanced_' + str(instances_per_class) + '/' + str(years_range[0]) + '-' + str(years_range[1]) + '-' + source_field
dir_out= 'Text_Classifier/.data/WE_clean_combo_' + str(instances_per_class) + '/' + str(years_range[0]) + '-' + str(years_range[1]) + '-' + source_field
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
%time toFolders(source_field,years_range[0],years_range[1],0.6, 50, data_dir, suffix, dir_out, instances_per_class)

loading file wiki-events-2010_multilink_data_clean.json.gz ...
found 12888 events...
total: 12888 events...
Selected total events: 0
disasters and accidents 50
train: 40
test: 10
arts and culture 50
train: 40
test: 10
armed conflicts and attacks 50
train: 40
test: 10
politics and elections 50
train: 40
test: 10
law and crime 50
train: 40
test: 10
business and economy 50
train: 40
test: 10
science and technology 50
train: 40
test: 10
sport 50
train: 40
test: 10
total train: 320
total test: 80
CPU times: user 21 s, sys: 5.78 s, total: 26.8 s
Wall time: 27.4 s
