In [1]:
import json
import nltk
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import gensim

In [2]:
#Loading training dataset as 'data' from .jsonl files (NOTE: FILE PATHS MAY NEED TO BE CHANGED ACCORDINGLY)
fileNames = ['C:/Users/Gayle/Dropbox/DSA/Dissertation/Appendix/Wikievents dataset/gen-arg-main/data/wikievents/dev.jsonl', 
             'C:/Users/Gayle/Dropbox/DSA/Dissertation/Appendix/Wikievents dataset/gen-arg-main/data/wikievents/test.jsonl', 
             'C:/Users/Gayle/Dropbox/DSA/Dissertation/Appendix/Wikievents dataset/gen-arg-main/data/wikievents/train.jsonl']
data = []

for file in fileNames:
    d = [json.loads(event) for event in open(file, 'r')]
    data.extend(d)

In [3]:
#Verifying data has been loaded correctly
print("Count of data: " + str(len(data)) + "; Type of Data: " + str(type(data))) #Checking all three files were loaded as a list
print(data[:2]) #checking head of the data
print(data[-2:]) #checking tail of data

Count of data: 246; Type of Data: <class 'list'>
[{'doc_id': 'scenario_en_kairos_13', 'tokens': ['primary', 'subject', 'Accompanying', 'Timothy', 'McVeigh', 'in', 'the', 'Ryder', 'truck', 'used', 'to', 'deliver', 'the', 'bomb', 'to', 'the', 'Murrah', 'Federal', 'Building', 'on', '19', 'April', '1995', ';', 'Stepping', 'out', 'of', 'the', 'Ryder', 'truck', 'at', 'ground', 'zero', 'minutes', 'before', 'the', 'blast', 'Speeding', 'away', 'from', 'downtown', 'Oklahoma', 'City', 'immediately', 'after', 'the', 'detonation', 'of', 'the', 'truck', 'bomb', ';', 'Being', 'seen', 'in', 'the', 'company', 'of', 'Timothy', 'McVeigh', 'a', 'various', 'times', 'and', 'locations', 'prior', 'to', '19', 'April', '1995', '.', 'Boston', 'Logan', 'International', 'Airport', '.', 'At', 'that', 'time', ',', 'he', 'resided', 'with', 'two', 'Iraqi', 'men', '(', 'brothers', ')', 'who', 'provided', 'food', 'catering', 'services', 'for', 'the', 'commercial', 'airlines', 'at', 'Boston', 'Logan', 'during', 'the', 't

In [4]:
#Data Preprocessing: White Space Removal and Tokenization of the complete text of the WikiEvent reports

i=0
tokens = []
tokenizer = nltk.tokenize.WordPunctTokenizer()

while i < len(data):
    text = data[i]['text']
    tokens.append(tokenizer.tokenize(text.strip()))
    i+=1
    
#checking tokenization
print(tokens)



In [5]:
#Data Preprocessing: Punctuation Removal, Numbers Removal and Stop Words Removal

punctuation = list(string.punctuation)
stopWords = set(stopwords.words('english'))
cleanTokens = []
i=0

while i < len(tokens):
    ct = []
    for t in tokens[i]:
        t= t.replace("’","")
        t= t.replace("”","")
        t= t.replace("“","")
        t= t.translate(str.maketrans('','',string.punctuation))
        if t.lower() not in stopWords and t not in punctuation and t != '' and not t.isdigit():
            ct.append(t) 
    cleanTokens.append(ct) 
    i+=1
        
#checking removal
print(cleanTokens)




In [6]:
#Data Preprocessing: Lemmatizing using WordNetLemmatizer

newTokens = []
i=0

while i < len(cleanTokens):
    stemmer = nltk.stem.WordNetLemmatizer()
    newTokens.append(" ".join(stemmer.lemmatize(token) for token in cleanTokens[i]))      
    i+=1

print(newTokens)



In [7]:
#TF-IDF Modeling on Training Data

tfidf = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.5, ngram_range=(1,5)) #creating the TFIDF Vectorizer object
keywords = tfidf.fit_transform(newTokens) #creating the TFIDF model
pd.DataFrame(keywords.todense(), columns=tfidf.get_feature_names_out()) #saving the model scores and words in a dataframe
tfidf.vocabulary_ #checking the TFIDF model vocabulary

{'roadside': 2429,
 'ied': 1332,
 'kill': 1520,
 'russian': 2460,
 'major': 1687,
 'general': 1175,
 'south': 2660,
 'east': 910,
 'regime': 2328,
 'syria': 2789,
 'took': 2886,
 'heavy': 1264,
 'toll': 2885,
 'week': 3094,
 'nation': 1846,
 'defense': 778,
 'ministry': 1793,
 'confirmed': 626,
 'improvised': 1349,
 'explosive': 1002,
 'device': 830,
 'al': 83,
 'monitor': 1813,
 'online': 1960,
 'reported': 2365,
 'died': 837,
 'detonated': 818,
 'convoy': 667,
 'soldier': 2646,
 'syrian': 2790,
 'pro': 2205,
 'near': 1861,
 'city': 530,
 'military': 1784,
 'personnel': 2061,
 'wounded': 3138,
 'russia': 2459,
 'state': 2700,
 'run': 2455,
 'news': 1883,
 'agency': 63,
 'local': 1639,
 'commander': 587,
 'national': 1847,
 'forces': 1127,
 'reportedly': 2366,
 'changed': 492,
 'tactic': 2792,
 'source': 2657,
 'declined': 768,
 'named': 1845,
 'security': 2539,
 'reason': 2299,
 'lay': 1580,
 'video': 3035,
 'explosion': 998,
 'social': 2639,
 'medium': 1751,
 'death': 753,
 'lieutena

In [8]:
#saving TFIDF vocabulary for testing on Abstracts dataset
pickle.dump(tfidf.vocabulary_,open("tfidf.pkl","wb"))

In [9]:
#Word2Vec Modeling on Training data

w2vModel = gensim.models.Word2Vec() #creating the model object
w2vModel.build_vocab(cleanTokens, update=False) #building the model vocabulary from the cleaned tokens
w2vModel.train(cleanTokens, total_examples=100000, epochs=5) #training the model from the cleaned tokens
w2vModel.wv["government"] #testing the model to extract the vectorized array for the word 'government'

array([-0.44792804,  0.4769985 ,  0.4484432 ,  0.3024793 ,  0.11960114,
       -1.1179136 ,  0.42489412,  1.2347416 , -0.3523508 , -0.42052725,
       -0.31127828, -1.1527116 , -0.07055487,  0.44368166,  0.06531638,
       -0.41332155,  0.14153568, -0.99907315, -0.02813099, -1.2575532 ,
        0.44745818,  0.3171835 ,  0.24983947, -0.32838416, -0.03751803,
        0.05939516, -0.64047265, -0.05455982, -0.72410727, -0.08067574,
        0.7662469 ,  0.2041041 , -0.16028205, -0.25225076, -0.36149296,
        0.49555555, -0.08465967, -0.6781457 , -0.48746002, -1.0957774 ,
       -0.14770831, -0.5749371 , -0.24497034, -0.00794168,  0.464719  ,
       -0.24842529, -0.30759013,  0.11465313, -0.00507582,  0.39042175,
        0.20278925, -0.5937583 , -0.5761344 , -0.05235121, -0.6300931 ,
       -0.03898916,  0.15576077,  0.08719575, -0.63940084,  0.33160588,
       -0.03319926,  0.26160374, -0.45498437,  0.00816465, -0.9045635 ,
        0.52396196,  0.26116127,  0.59713703, -0.84513015,  0.66

In [10]:
w2vModel.wv.most_similar("government") #evaluating the Word2Vec model's effectiveness at prdicting similar words

[('country', 0.99970942735672),
 ('state', 0.9996606111526489),
 ('military', 0.9996576905250549),
 ('drones', 0.9996492266654968),
 ('opposition', 0.9996463060379028),
 ('president', 0.9996421337127686),
 ('US', 0.9996330738067627),
 ('Venezuela', 0.9996219873428345),
 ('Pakistan', 0.9996203780174255),
 ('even', 0.9996199607849121)]

In [11]:
#saving the Word2Vec model for testing on the Abstracts Dataset
w2vModel.save("word2vec.model")