In [1]:
import json
import random
import itertools
import pandas as pd
import networkx as nx
from collections import Counter, OrderedDict

import matplotlib.pyplot as plt
from pyvis.network import Network


In [2]:
# Read the training split as raw text lines; each line is parsed as its own
# JSON document further down.
# The encoding is pinned to UTF-8 so decoding does not depend on the
# platform's locale default.
with open("./data/subtask3-coreference/en-train.json", encoding="utf-8") as fp:
    entries = fp.readlines()

In [3]:
# Decode every line of the file into its own Python object (one JSON document per line).
data = [json.loads(line) for line in entries]

In [4]:
# Show a few records to get a feel for the schema.
# A dedicated, seeded generator keeps the displayed sample identical across
# re-runs without touching the global `random` state.
print("Examples in the dataset:")
random.Random(0).sample(data, 3)

Examples in the dataset:


[{'event_clusters': [[21], [1, 2, 13, 15, 16, 19]],
  'sentence_no': [1, 2, 13, 15, 16, 19, 21],
  'sentences': ['The strike at Anglo American Platinum ( Amplats ) in Rustenburg will continue , the Rustenburg strike co-ordinating committee said on Monday .',
   '`` The strike is on .',
   'Amplats said it had offered to reinstate all workers on the same terms and conditions of employment as applied before their illegal strike .',
   'The company had further offered a once-off hardship allowance of R2000 to help workers deal with financial difficulties arising from the no-work , no-pay principle in place while they were striking .',
   'Workers who did not participate in the strike would receive a once-off loyalty allowance of R2000 .',
   'Amplats workers went on strike on September 12 , demanding a monthly salary of R16,000 .',
   "`` Prolonged strikes may lead to the company closing down , '' he said after a Congress of SA Trade Unions rally in Rustenburg ."],
  'id': 55314},
 {'even

In [5]:
# Dataset size: `data` holds one parsed object per line of the input file.
print("Number of entries")
len(data)

Number of entries


596

In [6]:
# Flatten the per-entry fields into two corpus-wide lists:
#   sentence_no_list    - every sentence number, across all entries
#   event_clusters_list - every cluster, across all entries
sentence_no_list = list(
    itertools.chain.from_iterable(doc['sentence_no'] for doc in data)
)
event_clusters_list = list(
    itertools.chain.from_iterable(doc['event_clusters'] for doc in data)
)

In [7]:
# Largest sentence number found anywhere in the data.
print("The maximum number of sentence_no: %d" % max(sentence_no_list))

The maximum number of sentence_no: 90


In [8]:
def get_event_clusters_comb(clusters_list, comb_elements_num):
    """Count how often each group of cluster members occurs.

    Every cluster holding at least ``comb_elements_num`` members is expanded
    into all of its ``comb_elements_num``-sized combinations (members keep the
    order they have inside the cluster).  A cluster with fewer members cannot
    be expanded and is counted once, unchanged, as a shorter tuple.

    Args:
        clusters_list: iterable of clusters, each a sized sequence of
            hashable items.
        comb_elements_num: size of the combinations drawn from each cluster.

    Returns:
        A list of ``(members_tuple, count)`` pairs ordered by descending
        count, as produced by ``Counter.most_common()``.
    """
    def _expand(cluster):
        # Too small to draw a full-size combination from: keep it whole.
        if len(cluster) < comb_elements_num:
            return [tuple(cluster)]
        return itertools.combinations(cluster, comb_elements_num)

    tally = Counter(
        group for cluster in clusters_list for group in _expand(cluster)
    )
    return tally.most_common()

In [9]:
# Size-1 combinations: how often each single sentence number appears across clusters (top 10).
get_event_clusters_comb(event_clusters_list, 1)[:10]

[((1,), 396),
 ((2,), 296),
 ((3,), 247),
 ((4,), 210),
 ((6,), 183),
 ((5,), 170),
 ((7,), 139),
 ((8,), 113),
 ((9,), 107),
 ((10,), 87)]

In [10]:
# Pairs of sentence numbers found together in a cluster, most frequent first (top 10).
get_event_clusters_comb(event_clusters_list, 2)[:10]

[((1, 2), 218),
 ((1, 3), 155),
 ((1, 4), 128),
 ((2, 3), 123),
 ((3, 4), 98),
 ((1, 5), 96),
 ((1, 6), 91),
 ((2, 4), 88),
 ((1, 7), 72),
 ((2, 5), 69)]

In [11]:
# Triples of sentence numbers found together in a cluster, most frequent first (top 10).
get_event_clusters_comb(event_clusters_list, 3)[:10]

[((1, 2, 3), 97),
 ((1, 3, 4), 68),
 ((1, 2, 4), 66),
 ((1, 2, 5), 56),
 ((2, 3, 4), 53),
 ((1, 2, 6), 52),
 ((1, 3, 5), 51),
 ((1, 4, 5), 44),
 ((1, 3, 6), 40),
 ((2, 3, 5), 39)]

In [12]:
# 4-element combinations (top 10). Clusters with fewer than four members are
# counted unchanged, which is why shorter tuples such as (6,) appear in the ranking.
get_event_clusters_comb(event_clusters_list, 4)[:10]

[((1, 2, 3, 4), 40),
 ((6,), 38),
 ((1, 2), 36),
 ((1, 2, 3, 5), 32),
 ((1, 3, 4, 5), 29),
 ((1, 2, 3, 6), 27),
 ((3,), 26),
 ((1, 2, 4, 5), 25),
 ((2, 3, 4, 5), 23),
 ((2,), 22)]

In [13]:
# 5-element combinations (top 10). Clusters with fewer than five members are
# counted unchanged, so the ranking is dominated by those shorter tuples.
get_event_clusters_comb(event_clusters_list, 5)[:10]

[((6,), 38),
 ((1, 2), 36),
 ((3,), 26),
 ((2,), 22),
 ((7,), 22),
 ((9,), 22),
 ((1, 2, 3), 19),
 ((5,), 18),
 ((1, 2, 3, 4, 5), 17),
 ((8,), 17)]

In [14]:
def get_cooccurrences(event_clusters_list, sentence_no_range=range(1,91)):
    """Count how often two sentence numbers appear in the same cluster.

    Args:
        event_clusters_list: iterable of clusters, each an iterable of
            sentence numbers.
        sentence_no_range: labels used for both the rows and the columns of
            the result. Defaults to 1..90.

    Returns:
        A square ``pandas.DataFrame`` of ``int64`` counts. Cell ``(a, b)`` is
        incremented once for every pair of distinct positions in a cluster
        holding ``a`` and ``b``, so both ``(a, b)`` and ``(b, a)`` are updated
        and the matrix is symmetric.

    Raises:
        KeyError: if a cluster contains a sentence number that is not part of
            ``sentence_no_range``.
    """
    # Start from an integer zero matrix directly. Building an empty
    # (object-dtype) frame and calling fillna(0) relies on pandas silently
    # downcasting object columns, which is deprecated.
    results = pd.DataFrame(
        0, index=sentence_no_range, columns=sentence_no_range, dtype="int64"
    )

    for cluster in event_clusters_list:
        # permutations(..., 2) yields every ordered pair of distinct positions,
        # i.e. each member paired with all the other members of its cluster.
        for row_label, col_label in itertools.permutations(cluster, 2):
            results.at[row_label, col_label] += 1
    return results

In [15]:
# Co-occurrence matrix over the function's default sentence range (1..90).
cooccurrence_df = get_cooccurrences(event_clusters_list)
cooccurrence_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,81,82,83,84,85,86,87,88,89,90
1,0,218,155,128,96,91,72,57,48,40,...,0,0,0,0,0,0,0,0,0,0
2,218,0,123,88,69,65,42,41,32,32,...,1,0,0,0,0,0,0,0,0,1
3,155,123,0,98,68,52,35,27,26,23,...,0,0,0,0,0,0,0,0,0,0
4,128,88,98,0,65,51,29,31,31,23,...,1,0,0,0,0,0,0,0,0,1
5,96,69,68,65,0,43,21,24,20,16,...,0,0,0,0,0,0,0,0,0,0


In [64]:
# Sentence numbers whose entire row is zero, i.e. they never share a cluster
# with any other sentence number.
occur_alone = cooccurrence_df.index[
    (cooccurrence_df == 0).all(axis=1).to_numpy()
].to_list()


In [17]:
# Pair counts as a dict keyed by tuples of sentence numbers; clusters with a
# single member are keyed by 1-tuples such as (6,).
bi_comb = get_event_clusters_comb(event_clusters_list, 2)
bi_comb_dict = dict(bi_comb)
# Drop the sentence numbers that never co-occur with anything. The dict keys
# are tuples, so a bare int can never match one; a sentence that is always
# alone can only be present under its singleton key (k,).
for k in occur_alone:
    bi_comb_dict.pop((k,), None)

In [36]:
# Build the co-occurrence graph: sentence numbers are nodes, and every pair
# with a positive count becomes an edge.
G = nx.Graph()

# Singleton keys have no second endpoint, so only keys with two or more
# members produce an edge; the count is scaled by 20 to serve as the weight.
G.add_weighted_edges_from(
    (pair[0], pair[1], count * 20)
    for pair, count in bi_comb_dict.items()
    if len(pair) >= 2 and count > 0
)


In [37]:
# Convert the networkx graph into a pyvis network and write it to an HTML file.
g = Network(height=800, width=800, notebook=True)
# g.toggle_hide_edges_on_drag(True)
g.barnes_hut()
g.from_nx(G)
g.save_graph("subtask3_event_clusters.html")


In [38]:
# Embed the saved HTML graph inline in the notebook.
# Does not work in PyCharm, but should work in a native browser
from IPython.display import IFrame

IFrame("subtask3_event_clusters.html", width=900, height=900)

In [67]:
# Groups of sentence numbers added by hand to the "alone" list, on top of the
# ones detected from the all-zero rows of the co-occurrence matrix.
# NOTE(review): presumably these are small groups detached from the main
# component of the graph - confirm.
supplement_alone = [[85,86,87], [57,64]]
# Merge everything in a single step; the set removes duplicates and sorting
# gives the list a deterministic order on every run.
occur_alone = sorted(set(occur_alone).union(*supplement_alone))

# TF-IDF analysis

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline

In [80]:
# Toy corpus of three sentences used to try out the TF-IDF vectorizer.
corpus = [
    "Rs 2,200 cr Transactions Hit as Staff Strike Shuts Banks Across the State 19th December 2013 11:20 AM Cheque transactions worth more than ` 2,200 crore were hit as more than 1,400 bank branches remained closed as bank employees went on a day-long strike to protest against banking sector reforms and non-implementation of wage revision for more than a year .",
    "Two die in land invasion scuffles in Rus-ter-vaal , south of Joburg ANA Reporter JOHANNESBURG , 10 April ( ANA ) - Emfuleni Local Municipality in Gauteng has said that it regrets the loss of life following incidents of violence in Rus-ter-vaal earlier on Wednesday morning in which two people died in alleged clashes between security agency the Red Ants and land invaders in the area .","People invaded the land in Rus-ter-vaal in the previous weeks and the Emfuleni Municipality had obtained a court order to evict them ."
]
# Uni-, bi- and tri-gram features with English stop words removed.
vectorizer = TfidfVectorizer(stop_words='english',ngram_range=(1, 3))
X = vectorizer.fit_transform(corpus)
# Show the sparse matrix summary (documents x n-gram features).
X

<3x262 sparse matrix of type '<class 'numpy.float64'>'
	with 272 stored elements in Compressed Sparse Row format>

In [82]:
# NOTE(review): the toy matrix X has only 3 rows, so 100 components is far more
# than it can support - presumably sized for the full dataset; confirm.
truncat = TruncatedSVD(n_components = 100)

In [84]:
# Fit the SVD on the TF-IDF matrix and keep the value returned by fit().
TR = truncat.fit(X)

In [85]:
# Display the fitted estimator (a dangling "TR." is a SyntaxError).
TR

TruncatedSVD(n_components=100)

In [73]:

# Text encoder: TF-IDF over 1- to 3-grams followed by a 100-component
# truncated SVD, chained into a single pipeline that logs each step.
text_encoder = Pipeline(
    steps=[
        ('Text-TF-IDF', TfidfVectorizer(ngram_range=(1, 3))),
        ('Text-SVD', TruncatedSVD(n_components=100)),
    ],
    verbose=True,
)

# Span analysis

In [39]:
# Load the pretrained SpanBERT (base, cased) tokenizer and model by checkpoint name.
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("SpanBERT/spanbert-base-cased")

# The load warning printed below reports that the pooler weights are not in the
# checkpoint and are newly initialized.
model = AutoModel.from_pretrained("SpanBERT/spanbert-base-cased")

Downloading:   0%|          | 0.00/413 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/215M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at SpanBERT/spanbert-base-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [49]:
# Four sample sentences (apparently from one news article) used to try out the tokenizer.
tl = ["MC officials beat hasty retreat as mob attacks team - Indian Express Express News Service , Express News Service : Kharar , Wed Mar 18 2009 , 03:33 hrs Officials of the Kharar Municipal Council , on an anti-encroachment drive at Santemajra village , had to duck for cover following a fierce attack by residents on Tuesday evening .","A team of around a dozen enforcement staffers , led by Assistant Municipal Engineer Baldev Raj Verma , had gone to free prime MC land ( measuring 4 acres and worth Rs 100 crore ) when a mob of around 150 encroachers pounced on them , forcing them to flee in a bid to save their lives .","A couple of months ago , MC staffers had to retreat after a mob attacked them in Fatehullapur village , Verma told Newsline .","When they failed to comply , a team of MC officials was dispatched to serve a final notice which came under attack by the mob ."]
# Join the sentences into a single newline-separated string.
t = "\n".join(tl)

In [54]:
# Tokenize the joined text as a single input.
t_tokenized = tokenizer(t)

In [68]:
# All sentence numbers present in the data except those listed in occur_alone.
# Computed as a set difference, so duplicates are removed and the order of the
# resulting list is not meaningful.
sentence_no_new = list(set(sentence_no_list) - set(occur_alone))