In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the raw events table from disk
events_csv = '../data/events.csv'
df_ = pd.read_csv(events_csv)

In [4]:
# Peek at the first five rows to see which columns are available
df_.head(n=5)

Unnamed: 0,Event ID,Event Date,Source Name,Source Sectors,Source Country,Event Text,CAMEO Code,Intensity,Target Name,Target Sectors,Target Country,Story ID,Sentence Number,Publisher,City,District,Province,Country,Latitude,Longitude
0,20718170,2014-01-01,Police (Australia),"Police,Government",Australia,"Arrest, detain, or charge with legal action",173,-5.0,Men (Australia),"Social,General Population / Civilian / Social",Australia,32493690,2,Daily Telegraph,Surfers Paradise,Gold Coast,State of Queensland,Australia,-28.0027,153.43
1,20718171,2014-01-01,Police (Australia),"Police,Government",Australia,"Arrest, detain, or charge with legal action",173,-5.0,Children (Australia),"Social,General Population / Civilian / Social",Australia,32493693,1,Daily Telegraph,Maroubra,Randwick,State of New South Wales,Australia,-33.95,151.233
2,20718172,2014-01-01,Government Official (Democratic Republic of Co...,Government,Democratic Republic of Congo,Make statement,10,0.0,Attacker (Democratic Republic of Congo),"Criminals / Gangs,Dissident",Democratic Republic of Congo,32495112,3,The Australian,Kinshasa,,Kinshasa City,Democratic Republic of Congo,-4.32142,15.3081
3,20718174,2014-01-01,Military (South Sudan),"Military,Government",South Sudan,Use conventional military force,190,-10.0,Armed Rebel (South Sudan),"Rebel,Dissident",South Sudan,32495113,1,The Australian,Juba,,Central Equatoria State,South Sudan,4.85165,31.5825
4,20718173,2014-01-01,Armed Rebel (South Sudan),"Rebel,Dissident",South Sudan,Use unconventional violence,180,-9.0,Military (South Sudan),"Military,Government",South Sudan,32495113,1,The Australian,Juba,,Central Equatoria State,South Sudan,4.85165,31.5825


In [5]:
# Number of events (rows) in the raw table
df_.shape[0]

120000

In [6]:
# Source and Target Sectors are kind of redundant with Source and Target Names
cols = ['Source Name', 'Source Country', 'Event Text', 'Target Name', 'Target Country']
# Work on an explicit copy so the cleaning below never writes into (a view of) df_
df = df_.loc[:, cols].copy()

# the category label space is massive, this reduces it by dropping the trailing
# parenthesised qualifier, e.g. 'Police (Australia)' -> 'Police'.
# The result is assigned back instead of using inplace=True on a selected column,
# which is chained assignment and does not reliably modify df.
for name_col in ('Source Name', 'Target Name'):
    df[name_col] = df[name_col].replace(to_replace=r'\s\(.+', value='', regex=True)

# NaN values are when source/target don't have a country
# df[df.isnull().any(axis=1)]
df = df.fillna('NONE')

df.head()

Unnamed: 0,Source Name,Source Country,Event Text,Target Name,Target Country
0,Police,Australia,"Arrest, detain, or charge with legal action",Men,Australia
1,Police,Australia,"Arrest, detain, or charge with legal action",Children,Australia
2,Government Official,Democratic Republic of Congo,Make statement,Attacker,Democratic Republic of Congo
3,Military,South Sudan,Use conventional military force,Armed Rebel,South Sudan
4,Armed Rebel,South Sudan,Use unconventional violence,Military,South Sudan


In [7]:
# Encode each column independently as integer category codes.
# cat_map[col] keeps that column's labels, so cat_map[col][code] turns a code back into text.
cat_map = {}
for col in cols:
    cat = df[col].astype('category')
    cat_map[col] = cat.cat.categories
    # Plain column assignment replaces the column together with its dtype;
    # `df.loc[:, col] = ...` writes into the existing object column on recent
    # pandas, which would leave the codes stored as Python objects.
    df[col] = cat.cat.codes

In [8]:
# Sanity check: decoding the first row's source-country code gives back its label
first_country_code = df['Source Country'][0]
cat_map['Source Country'][first_country_code]

'Australia'

In [9]:
n_samples = len(df)
# DataFrame.as_matrix() has been removed from pandas; .values yields the same
# 2-D array of category codes and is available in old and new versions alike.
data = df.values
data

array([[3589,    9,   39, 2278,    8],
       [3589,    9,   39,  688,    8],
       [1634,   49,  160,  428,   49],
       ..., 
       [1389,  204,  156, 3705,  203],
       [3646,  201,  170, 2879,  203],
       [3589,  201,  225, 2930,  203]], dtype=int16)

In [10]:
# Targets are the events shifted by one row: Y[i] is data[i + 1]
Y = data[1:, :]
Y.shape

(119999, 5)

In [11]:
# Drop the last event from the inputs (it has no successor in Y).
# Trimming to len(Y) instead of [:-1] keeps the cell idempotent: running it
# again does not remove yet another row and misalign inputs and targets.
data = data[:len(Y)]
data.shape

(119999, 5)

In [12]:
# Size the shared vocabulary from inputs AND targets: data has lost its last
# row, so the largest code could occur only in Y. Cast to a Python int so later
# arithmetic is not carried out on an int16 scalar.
vocab_size = int(max(data.max(), Y.max())) + 1
vocab_size

4887

In [None]:
# One-hot encode the targets so that y_onehot[i, j, Y[i, j]] == 1.
onehot_dtype = np.int16
n_rows, n_cols = Y.shape
# Class count over inputs and targets (the model's output width is derived from
# the input codes), as a Python int so the size arithmetic below cannot overflow
# or fail on an int16 scalar.
n_classes = int(max(Y.max(), data.max())) + 1

# check how many GB of memory will be needed
# (bits * 1.25e-10 == GB; the width comes from the dtype actually allocated)
print(n_rows * n_cols * n_classes * np.dtype(onehot_dtype).itemsize * 8 * 1.25e-10)

# np.zeros defaults to float64, which would need four times the estimate above
y_onehot = np.zeros((n_rows, n_cols, n_classes), dtype=onehot_dtype)
# (n_rows, 1) row indices broadcast against (n_rows, n_cols) column indices and Y
layer_idx = np.arange(n_rows).reshape(n_rows, 1)
component_idx = np.tile(np.arange(n_cols), (n_rows, 1))
y_onehot[layer_idx, component_idx, Y] = 1

In [None]:
# Import the layers from the public `keras.layers` namespace: the per-submodule
# paths (keras.layers.embeddings, .core, .recurrent, .wrappers) are not
# available in every Keras release.
from keras.layers import Dense, Dropout, Embedding, LSTM, TimeDistributed
from keras.models import Sequential

embed_size = 100   # embedding width, also used as the LSTM state size
batch_size = 128
epochs = 100

# Each sample is one event: a sequence of data.shape[1] category codes.
# Both LSTMs return the full sequence so the final layer can emit one softmax
# over the shared vocabulary per position.
model = Sequential()
model.add(Embedding(vocab_size, embed_size, input_length=data.shape[1]))
model.add(Dropout(0.3))
model.add(LSTM(embed_size, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(embed_size, return_sequences=True))
model.add(Dropout(0.3))
model.add(TimeDistributed(Dense(vocab_size, activation='softmax')))

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])



In [None]:
# Train on (event, next event) pairs, holding out 5% of the samples for
# validation, then write the learned weights to disk.
model.fit(
    data,
    y_onehot,
    batch_size=batch_size,
    nb_epoch=epochs,
    validation_split=0.05,
)
model.save_weights('../data/icews_weights.h5', overwrite=False)

In [None]:
# Restore the weights from the file the training cell saves to, so the cells
# below can run without retraining.
model.load_weights('../data/icews_weights.h5')

In [15]:
def render(arr):
    """Translate one event's category codes into their text labels.

    `arr` is indexed positionally in the order of `cols`; each entry is cast to
    int and looked up in that column's `cat_map` categories. Returns the labels
    as a list, one per column.
    """
    return [cat_map[name][int(arr[pos])] for pos, name in enumerate(cols)]

In [39]:
def ramble(seed, n):
    """Generate a chain of events by repeatedly sampling the model's prediction.

    seed -- category codes of the starting event, one per entry of `cols`
            (any array shape that flattens to that length, e.g. (5, 1))
    n    -- number of events to generate after the seed

    Returns a list of n + 1 events, the seed first, each rendered to text
    labels by `render`.
    """
    current = np.asarray(seed).reshape(-1)
    # Render the seed as well, so every entry of the result has the same form
    rambling = [render(current)]
    for _ in range(n):
        # TODO only the previous event is fed in, not the history generated so far
        # A batch of one event; [0] strips the batch axis, leaving one row of
        # class probabilities per column
        probs = model.predict(current.reshape(1, -1))[0]
        sampled = []
        for col, row in zip(cols, probs):
            # Each column has its own code range while the softmax spans the
            # shared vocabulary: draw only among codes render can look up
            cs = np.cumsum(row[:len(cat_map[col])])
            # random draw based on probs
            idx = cs.searchsorted(np.random.random() * cs[-1], 'right')
            # stay in range even if the draw rounds up to cs[-1]
            sampled.append(min(int(idx), len(cs) - 1))
        rambling.append(render(sampled))
        current = np.array(sampled)
    return rambling

In [44]:
# Start from a random event (codes below 20 in every column) and print the
# generated chain, one event per line
line_template = '{} ({}) -> {} -> {} ({})'
seed = np.random.randint(0, 20, (5, 1))
for generated in ramble(seed, 10):
    source_name, source_country, event_text, target_name, target_country = generated
    print(line_template.format(source_name, source_country, event_text, target_name, target_country))

Citizen (Turkey) -> Protest violently, riot -> Argentina (Italy)
Communist Party of India (Turkey) -> Refuse to release persons or property -> Activist (Indonesia)
High Ranking Military Personnel (NONE) -> Make statement -> Militant (Switzerland)
Domestic Affairs (South Korea) -> Accuse -> Indonesia (Turkey)
Media (South Korea) -> Appeal for judicial cooperation -> Japan (India)
Protester (United Kingdom) -> Consult -> Men (United States)
Citizen (Australia) -> Consult -> Protester (Thailand)
Barack Obama (Thailand) -> Consult -> Men (South Korea)
Barack Obama (United States) -> Consult -> Unspecified Actor (United States)
Barack Obama (France) -> Cooperate economically -> Lakhdar Brahimi (Saudi Arabia)
