In [None]:
"""

ANNOTATION SCRIPT/FORMAT/PROCEDURE FROM:

https://github.com/hollyjackson/casualty_mentions_nyt/tree/main

"""

In [None]:
import sys

import os
import pandas as pd
from lxml import etree
from bs4 import BeautifulSoup
import random
import matplotlib as mpl
import matplotlib.pyplot as plt
import json
import locale

locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
import datetime as dt

# NLTK imports
import nltk

nltk.data.path.append('../nltk_data/')
import string
from nltk import collocations
from nltk.text import Text
from nltk.tokenize import WhitespaceTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import sentiwordnet as swn
from nltk import RegexpParser
from nltk.tree import *

# spaCy imports
import spacy
from spacy.symbols import nsubj, VERB

from time import sleep

In [None]:
# Demonyms / country names used as affiliation keywords for each side.
PALESTINE_IDENTIFIERS = ["Palestine", "Palestinian", "Palestinians"]
ISRAEL_IDENTIFIERS = ["Israel", "Israeli", "Israelis"]

# Cities in the West Bank and Gaza
# NOTE(review): several entries are repeated within this list (e.g. "Ghazzah",
# "Nazareth", "Rafah", "Khan Yunis"); harmless for `in` membership tests but
# worth de-duplicating if the list is ever counted or iterated.
# NOTE(review): some names (e.g. "Jerusalem", "Nazareth", "Lod") also appear in
# ISRAELI_CITIES below, so a descriptor equal to one of them matches both
# affiliation lists.
PALESTINIAN_CITIES = ["Gaza", "Gaza Strip", "Jerusalem", "Abasan al-Kabira", "Abu Dis", "Bani Na\'im", "Bani Suheila",
                      "Beit Hanoun", "Beit Jala", "Beit Lahia", "Beit Sahour", "Beit Ummar", "Beitunia", "Bethlehem",
                      "Beit Lahm", "al-Bireh", "Deir al-Balah", "ad-Dhahiriya", "Dura", "Gaza City", "Ghazzah",
                      "Halhul",
                      "Hebron", "al-Khalil", "Idhna", "Jabalia", "Jenin", "Jericho", "Ariha", "Khan Yunis", "Nablus",
                      "Qabatiya", "Qalqilya", "Rafah", "Ramallah", "Sa\'ir", "as-Samu", "Surif", "Tubas", "Tulkarm",
                      "Ya\'bad", "al-Yamun", "Yatta", "az-Zawayda", "Nazareth", "Jaljulia", "Kafr Bara", "Kafr Qasim",
                      "Qalansawe", "Tayibe", "Tira", "Zemer", "Ar\'ara", "Baqa al-Gharbiyye", "al-Arian", "Basma",
                      "Jatt",
                      "Kafr Qara", "Ma\'ale Iron", "Meiser", "Umm al-Fahm", "Umm al-Qutuf", "Lod", "Ramla",
                      "Wadi Nisnas",
                      "Halisa", "Kababir", "Abbas", "Daliyat al-Karmel", "Ein Hawd", "Fureidis", "Ibtin", "Isfiya",
                      "Jisr az-Zarqa", "Khawaled", "Abu Ghosh", "Beit Jimal", "Ein Naqquba", "Ein Rafa", "Beit Hanina",
                      "Beit Safafa", "Jabel Mukaber", "Old City", "Ras al-Amud", "Sheikh Jarrah", "Shuafat", "Silwan",
                      "Sur Baher", "At-Tur", "Umm Tuba", "Wadi al-Joz", "al-Walaja", "Abu Qrenat", "Abu Talul",
                      "Ar\'arat an-Naqab", "Ateer", "al-Atrash", "Bir Hadaj", "Dhahiyah", "Drijat", "Ghazzah", "Hura",
                      "Kukhleh", "Kuseife", "Lakiya", "Makhul", "Mitnan", "Mulada", "Qasr al-Sir", "Rahat", "al-Sayyid",
                      "Shaqib al-Salam", "Tirabin al-Sana", "Tel as-Sabi", "Umm Batin", "Abu Sinan", "Arab al-Aramshe",
                      "Arab al-Subeih", "Arab al-Na\'im", "Arraba", "Basmat Tab\'un", "Beit Jann", "Bi\'ina",
                      "Bir al-Maksur", "Bu\'eine Nujeidat", "Buqei\'a", "Daburiyya", "Ed Dahi", "Deir al-Asad",
                      "Deir Hanna", "Dmeide", "Eilabun", "Ein al-Asad", "Ein Mahil", "Fassuta", "Hamaam", "Hamdon",
                      "Hurfeish", "Hussniyya", "I\'billin", "Iksal", "Ilut", "Jadeidi-Makr", "Jish", "Julis",
                      "Ka\'abiyye-Tabbash-Hajajre", "Kabul", "Kafr Kanna", "Kafr Manda", "Kafr Misr", "Kafr Yasif",
                      "Kamanneh", "Kaukab Abu al-Hija", "Kfar Kama", "Kisra-Sumei", "Maghar", "Majd al-Krum",
                      "Manshiya Zabda", "Mashhad", "Mazra\'a", "Mi\'ilya", "Muqeible", "Nahf", "Na\'ura", "Nazareth",
                      "Nein", "Rameh", "Ras al-Ein", "Rehaniya", "Reineh", "Rumana", "Rumat al-Heib", "Sajur",
                      "Sakhnin",
                      "Sallama", "Sandala", "Sha\'ab", "Shefa-\'Amr", "Sheikh Danun", "Shibli–Umm al-Ghanam", "Sulam",
                      "Suweid Hamira", "Tarshiha", "Tamra City", "Tamra Village", "Tuba-Zangariyye", "Tur\'an", "Uzeir",
                      "Yafa an-Naseriyye", "Yanuh-Jat", "Yarka", "Zarzir", "Bani Suheila", "Beit Hanoun", "Beit Lahiya",
                      "Deir al-Balah", "Jabalia", "Khan Yunis", "Rafah"]

# Cities in Israel ('48 lands)
ISRAELI_CITIES = ["Acre", "Afula", "Arad", "Arraba", "Ashdod", "Ashkelon", "Baqa al-Gharbiyye", "Bat Yam", "Beersheba",
                  "Beit She\'an", "Beit Shemesh", "Bnei Brak", "Dimona", "Eilat", "El\'ad", "Giv\'at Shmuel",
                  "Givatayim",
                  "Hadera", "Haifa", "Herzliya", "Hod HaSharon", "Holon", "Jerusalem", "Kafr Qasim", "Karmiel",
                  "Kfar Saba",
                  "Kfar Yona", "Kiryat Ata", "Kiryat Bialik", "Kiryat Gat", "Kiryat Malakhi", "Kiryat Motzkin",
                  "Kiryat Ono",
                  "Kiryat Shmona", "Kiryat Yam", "Lod", "Ma\'alot-Tarshiha", "Migdal HaEmek",
                  "Modi\'in-Maccabim-Re\'ut",
                  "Nahariya", "Nazareth", "Nesher", "Ness Ziona", "Netanya", "Netivot", "Nof HaGalil", "Ofakim",
                  "Or Akiva",
                  "Or Yehuda", "Petah Tikva", "Qalansawe", "Ra\'anana", "Rahat", "Ramat Gan", "Ramat HaSharon", "Ramla",
                  "Rehovot", "Rishon LeZion", "Rosh HaAyin", "Safed", "Sakhnin", "Sderot", "Shefa-\'Amr", "Tamra",
                  "Tayibe",
                  "Tel Aviv-Yafo", "Tel Aviv", "Tiberias", "Tira", "Tirat Carmel", "Umm al-Fahm", "Yavne",
                  "Yehud-Monosson",
                  "Yokneam Illit"]

# Full keyword lists (identifiers followed by city names) that subject
# descriptors are compared against when guessing a victim's affiliation.
PALESTINE_MEMBER_AFFILIATIONS = PALESTINE_IDENTIFIERS + PALESTINIAN_CITIES
ISRAEL_MEMBER_AFFILIATIONS = ISRAEL_IDENTIFIERS + ISRAELI_CITIES

In [None]:
# Load the small English spaCy pipeline.
# NOTE(review): `nlp` is not referenced in the cells visible here; the
# annotation loop reads pre-computed parses from JSON instead.
nlp = spacy.load('en_core_web_sm')

# enumerate spacy subject types
# NOTE(review): not referenced in the cells visible here.
SUBJECTS = ["nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"]

# Penn Treebank verb tags (the numbers are their positions in the tag list):
# 27. VB  Verb, base form
# 28. VBD Verb, past tense
# 29. VBG Verb, gerund or present participle
# 30. VBN Verb, past participle
# 31. VBP Verb, non-3rd person singular present
# 32. VBZ Verb, 3rd person singular present
VERBS = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
# Only the plain adjective tag is matched (no comparative/superlative tags).
ADJECTIVES = ["JJ"]

# Enumerate common words related to death for automated sentence tagging
# These are compared against token lemmas in the annotation loop.
FATAL_ADJECTIVES = ["dead", "deceased", "buried", "killed"]
# Verbs whose grammatical subject is the victim ("She died").
FATAL_VERBS_PASSIVE = ["die", "decease"]
# Verbs whose object (or passive subject) is the victim ("He killed her").
FATAL_VERBS_ACTIVE = ["kill", "murder", "massacre", "shoot", "assassinate", "stab", "slash"]
FATAL_VERBS_ACTIVE_SPECIFIC = ["behead", "slaughter", "execute", "hang"]
ALL_FATAL_VERBS = FATAL_VERBS_PASSIVE + FATAL_VERBS_ACTIVE + FATAL_VERBS_ACTIVE_SPECIFIC

# NLTK helpers.
# NOTE(review): neither `stop_words` nor `lemmatizer` is referenced in the
# cells visible here.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Choosing a dataset
# NOTE(review): `sample_size` is not referenced in the cells visible here.
sample_size = 1

# Defining the dataset path
results_prefix = "./results"

# Directory listing of the results folder; raises if ./results does not exist.
# NOTE(review): `input_files` is not referenced in the cells visible here.
input_files = os.listdir(results_prefix + '/')

In [None]:
def follow_compound(dep_idx, dependencies_by_governor):
    """Walk a chain of compound relations and collect the glosses along it.

    Starting at token ``dep_idx``, repeatedly take the *first* dependency of
    the current token whose relation is ``compound`` or ``compound:prt``,
    record its ``dependentGloss`` and hop to its dependent. The walk ends when
    the current token has no compound dependency or when it reaches a token
    that was already visited.

    Args:
        dep_idx: index of the token to start from.
        dependencies_by_governor: sequence indexed by governor token index,
            each entry being the list of dependency dicts for that governor.

    Returns:
        set of glosses encountered on the chain (possibly empty).
    """
    compound_relations = ("compound:prt", "compound")
    seen = {dep_idx}
    glosses = set()
    node = dep_idx

    while True:
        # Only the first compound edge out of the current node is followed.
        link = next(
            (edge for edge in dependencies_by_governor[node] if edge["dep"] in compound_relations),
            None,
        )
        if link is None:
            break
        node = link["dependent"]
        glosses.add(link["dependentGloss"])
        if node in seen:
            # Cycle guard: the gloss is still recorded, but the walk stops.
            break
        seen.add(node)

    return glosses

In [None]:
def investigate_subject(subject, dependencies_by_governor, verbose=False):
    """Collect descriptors of a subject from the dependencies it governs.

    Descriptors are grouped into tiers by "closeness" to the subject:

    * tier 0 -- the subject's own gloss, its ``amod`` modifiers, and any
      ``appos`` gloss together with that appositive's compound chain;
    * tier 1 -- ``acl``/``acl:relcl`` clause heads, ``nmod*`` modifiers and
      ``advmod`` modifiers;
    * tier 2 -- subjects/objects of a descriptive clause, and ``amod``
      modifiers of an appositive.

    Args:
        subject: ``(token_index, gloss)`` tuple for the subject token.
        dependencies_by_governor: sequence indexed by governor token index,
            each entry being the list of dependency dicts (with keys
            ``"dep"``, ``"dependent"``, ``"dependentGloss"``) for that token.
        verbose: when true, print each dependency that contributes a
            descriptor (debugging aid; off by default).

    Returns:
        list ``[tier0, tier1, tier2, count]`` where the tiers are sets of
        glosses and ``count`` is 1 unless a ``nummod`` is present, in which
        case it is the parsed integer, or the raw gloss when the numeral
        cannot be parsed (e.g. "dozens").
    """
    # Use the index carried by `subject` itself. The previous version read a
    # global `subj_idx` set by the calling cell, which fails with NameError on
    # a fresh kernel and can silently use a stale index.
    subj_idx = subject[0]

    clause_relations = ("acl", "acl:relcl")
    clause_argument_relations = (
        "nsubj", "nsubj:pass", "nsubj:outer",
        "csubj", "csubj:pass", "csubj:outer",
        "obj",
    )
    nmod_relations = ("nmod", "nmod:npmod", "nmod:tmod", "nmod:poss")

    subject_descriptors = [set(), set(), set(), 1]
    subject_descriptors[0].add(subject[1])

    # AMOD takes precedence over NMOD? takes precedence over ACL
    for dep in dependencies_by_governor[subj_idx]:
        relation = dep["dep"]

        if relation == "amod":
            subject_descriptors[0].add(dep["dependentGloss"])
            if verbose:
                print(relation, dep["dependentGloss"], "\n")

        elif relation in clause_relations:
            subject_descriptors[1].add(dep["dependentGloss"])
            if verbose:
                print(relation, dep["dependentGloss"])
                print(dependencies_by_governor[dep["dependent"]])
                print()
            # Look inside the descriptive clause for its subject and object.
            for double_dep in dependencies_by_governor[dep["dependent"]]:
                if double_dep["dep"] in clause_argument_relations:
                    subject_descriptors[2].add(double_dep["dependentGloss"])

        elif relation in nmod_relations:
            subject_descriptors[1].add(dep["dependentGloss"])
            if verbose:
                print(relation, dep["dependentGloss"], "\n")

        elif relation == "advmod":
            subject_descriptors[1].add(dep["dependentGloss"])  # TODO: CHECK THIS
            if verbose:
                print(relation, dep["dependentGloss"], "\n")

        elif relation == "appos":
            subject_descriptors[0].add(dep["dependentGloss"])  # TODO: CHECK THIS
            # An appositive renames the subject, so its compound chain is
            # treated as equally close; its adjectives go to the outer tier.
            to_add = follow_compound(dep["dependent"], dependencies_by_governor)
            if verbose:
                print(relation, dep["dependentGloss"], "\n", to_add)
            subject_descriptors[0].update(to_add)
            for double_dep in dependencies_by_governor[dep["dependent"]]:
                if double_dep["dep"] == "amod":
                    subject_descriptors[2].add(double_dep["dependentGloss"])

        elif relation == "nummod":
            try:
                subject_descriptors[3] = locale.atoi(dep["dependentGloss"])
            except ValueError:
                # Spelled-out or vague numerals ("three", "dozens") are kept
                # verbatim rather than discarded.
                subject_descriptors[3] = dep["dependentGloss"]

    return subject_descriptors

In [None]:
def extract_sentences(sentences):
    """Rebuild the raw text of every sentence in an article.

    Each sentence's text is the concatenation of ``before + word + after``
    over its tokens, stored at the position given by the sentence's
    ``"index"`` field.

    Args:
        sentences: list of sentence dicts with ``"index"`` and ``"tokens"``.

    Returns:
        list of the same length as ``sentences``; slots whose index never
        occurs remain ``None``.
    """
    texts = [None] * len(sentences)
    for entry in sentences:
        position = entry["index"]
        pieces = [tok["before"] + tok["word"] + tok["after"] for tok in entry["tokens"]]
        texts[position] = "".join(pieces)
    return texts

In [None]:
from IPython.display import clear_output
def refresh_screen():
    """Wipe the current cell's output, then pause briefly before continuing."""
    pause_seconds = 0.02
    clear_output(wait=False)
    sleep(pause_seconds)

In [None]:
# Load the article summary table that drives the annotation loop; the loop
# reads its 'id', 'title', 'date', 'results_file' and 'article_file' columns.
# The path literal previously ended in `\'`, which escaped the closing quote
# and left the string unterminated (SyntaxError).
df = pd.read_csv('./data/summary_20231201_livefeeds.csv')
# Parse dates once so each row's date can be formatted as YYYY-MM-DD later.
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Interactive annotation loop.
#
# For every article in `df` that has no output CSV yet: scan its parsed
# sentences for fatal verbs/adjectives, guess the victim's affiliation from
# the dependency parse, ask the annotator to confirm a category, and write the
# confirmed mentions to fatality_counts/articles_<id>.csv.
root_directory = './'

# list of ids we have already analysed, recovered from the trailing integer in
# each output file name. Entries that do not end in an integer id (stray OS or
# editor files) are skipped instead of aborting the whole cell.
analysed_ids = []
for f in os.listdir(root_directory + "fatality_counts/"):
    if f in ['.gitkeep', 'archive', 'summary']:
        continue
    try:
        analysed_ids.append(int(f.split(".")[0].split("_")[-1]))
    except ValueError:
        continue

# init to handle duplicates: sentence text -> category already confirmed
assigned_sentences = {}

# short aliases accepted at the prompt
mapper = {'pal': 'palestine', 'is': 'israel'}

to_annotate = df.shape[0]

for index, row in df.iterrows():

    # Per-article accumulators; they stay index-aligned and become the columns
    # of the output CSV.
    _ids = []
    titles = []
    dates = []
    voices = []
    categories = []
    recorded_sentences = []

    results_file = row['results_file']
    article_file = row['article_file']
    _id = row['id']

    if _id in analysed_ids:
        continue

    # Open NLP-analyzed result
    filename = root_directory + results_file
    try:
        with open(filename) as d:
            print(filename)
            data = json.load(d)
    except FileNotFoundError:
        print('FILE NOT FOUND')
        continue

    # Open original text block from preprocessed data file
    original_filename = root_directory + article_file
    with open(original_filename, "r") as article_handle:
        article_text = article_handle.read()

    # Extract title and original date
    title, date = row['title'], row['date'].date().strftime('%Y-%m-%d')

    # Extract NLP results
    sentences = data["sentences"]
    text_all_sentences = extract_sentences(sentences)

    for sentence in sentences:
        sentence_index = sentence["index"]

        tokens = sentence["tokens"]
        dependencies = sentence["basicDependencies"]
        sentence_text = "".join(token["before"] + token["word"] + token["after"] for token in tokens)

        # create a data structure that maps governor dep_idx to dependencies
        # (+1 because index 0 is the governor of the root dependency)
        dependencies_by_governor = [[] for i in range(len(tokens) + 1)]
        for dep in dependencies:
            gov_idx = dep["governor"]
            dependencies_by_governor[gov_idx].append(dep)

        # create data structure for tokens
        tokens_by_idx = {}
        for token in tokens:
            tokens_by_idx[token["index"]] = token

        # Computed lazily, at most once per sentence.
        prepositional_information = None
        for token in tokens:
            pos = token["pos"]
            lemma = token["lemma"]
            dep_idx = token["index"]

            # determine whether the sentence contains a fatal mention, whether through verbs or adjective
            if pos in VERBS:
                if lemma not in ALL_FATAL_VERBS:
                    continue
            elif pos in ADJECTIVES:
                if lemma not in FATAL_ADJECTIVES:
                    continue
            else:
                continue

            verb_active = lemma not in FATAL_VERBS_PASSIVE

            # Find voice
            voice = None
            for dep in dependencies_by_governor[dep_idx]:
                # If the sentence is in active voice, a 'nsubj' dependency should exist.
                # If the sentence is in passive voice a 'nsubj:pass' dependency should exist
                if dep["dep"] == "nsubj" or dep["dep"] == "csubj":
                    voice = "ACTIVE"
                    break
                elif dep["dep"] == "nsubj:pass" or dep["dep"] == "csubj:pass" or dep["dep"] == "aux:pass":
                    voice = "PASSIVE"
                    break

            # Find the victim ("subject" below):
            # for ACTIVE verbs, we are looking for obj if used actively or nsubj:pass if used passively
            # for PASSIVE verbs, we are looking for nsubj if used actively or nsubj:pass if used passively
            subject = None
            for dep in dependencies_by_governor[dep_idx]:
                if voice == "ACTIVE" and not verb_active and dep["dep"] == "nsubj":
                    # Example --> She died
                    subject = (dep["dependent"], dep["dependentGloss"])
                    break
                if voice == "PASSIVE" and dep["dep"] == "nsubj:pass":
                    # Example --> She is deceased, She was killed
                    subject = (dep["dependent"], dep["dependentGloss"])
                    break
                if verb_active and dep["dep"] == "obj":
                    voice = "ACTIVE"
                    # Example --> He killed her
                    subject = (dep["dependent"], dep["dependentGloss"])
                    break

            # If subject found, find GUESS for subject's affiliation
            IS_MEMBER_PALESTINE = False
            IS_MEMBER_ISRAEL = False
            if subject is not None:
                # Kept as a module-level name on purpose: investigate_subject
                # has historically read `subj_idx` from the notebook globals.
                subj_idx = subject[0]
                subj_token = tokens_by_idx[subj_idx]
                print("SUBJECT", subject[1], subj_token["ner"])
                subject_descriptors = investigate_subject(subject, dependencies_by_governor)

                # Check prepositional phrases for this sentence:
                # token --> case --> preposition ("in" or "at")
                if prepositional_information is None:
                    prepositional_information = set()
                    for dep in dependencies:
                        if dep["dep"] == "case" and tokens_by_idx[dep["dependent"]]["lemma"] in ("at", "in"):
                            prepositional_information.add(dep["governorGloss"])
                for pi in prepositional_information:
                    subject_descriptors[2].add(pi)

                ### ---------------TODO: MUST CHANGE HERE WHEN DATASET UPDATES
                # Only the three descriptor tiers are checked; index 3 is the count.
                for j in range(3):
                    for sd in subject_descriptors[j]:
                        if sd in PALESTINE_MEMBER_AFFILIATIONS:
                            IS_MEMBER_PALESTINE = True
                        if sd in ISRAEL_MEMBER_AFFILIATIONS:
                            IS_MEMBER_ISRAEL = True

            # MANUALLY VALIDATE CATEGORY
            if sentence_text not in assigned_sentences:
                print(index, '/', to_annotate)
                print('Please assign a category to the VICTIM:')
                # Only offer a guess when exactly one side matched.
                if (IS_MEMBER_PALESTINE or IS_MEMBER_ISRAEL) and not (IS_MEMBER_PALESTINE and IS_MEMBER_ISRAEL):
                    category_guess = 'palestine' if IS_MEMBER_PALESTINE else 'israel'
                    print('My guess is the VICTIM is from', category_guess)
                assigned = False
                # Up to three sentences either side of the current one. Slice
                # ends are exclusive and clamp to the list length, so no upper
                # bound is needed; the old `len(sentences) - 1` bound dropped
                # the article's final sentence from the window.
                sentence_chunk = " ".join(
                    text_all_sentences[max(0, sentence_index - 3):sentence_index + 4])
                # Show progressively more context until the annotator decides.
                # (Loop variable renamed from `string`, which shadowed the
                # imported `string` module.)
                for context_text in [sentence_text, sentence_chunk, article_text]:
                    print(context_text)
                    while True:
                        category = input(
                            "Enter category ('palestine', 'israel', 'both', 'none', or 'next' for the next string): ").strip().lower()
                        print()
                        if category in mapper:
                            category = mapper[category]
                        print('You chose', category)
                        if category == 'palestine' or category == 'israel' or category == 'both':
                            categories.append(category)
                            assigned = True
                            break
                        elif category == 'none':
                            assigned = True
                            break
                        elif category == 'next':
                            break
                        else:
                            print("Invalid category. Please enter 'palestine', 'israel', 'both', 'none', or 'next'.")
                    if assigned:
                        print('Categorized as', category)
                        break

                refresh_screen()
                print()
            else:
                print('ASSIGNED TO EXISTING CATEGORY')
                assigned = True
                category = assigned_sentences[sentence_text]
                categories.append(category)

            if assigned and category != 'none':
                dates.append(date)
                voices.append(voice)
                recorded_sentences.append(sentence_text)

                # metadata
                titles.append(title)
                _ids.append(_id)

                # add to dictionary to handle duplicates
                assigned_sentences[sentence_text] = category

    # After all sentences are complete, save info for a file. A CSV is written
    # even when nothing was recorded; its presence is what marks the article
    # as analysed on the next run.
    file_dict = {
        'article_id': _ids,
        'article_title': titles,
        'article_date': dates,
        'sentence': recorded_sentences,
        'category': categories,
        'voice': voices
    }
    # Deliberately NOT named `df`: rebinding `df` here replaced the input
    # table, so re-running this cell iterated over the last article's output.
    fatality_df = pd.DataFrame(file_dict)
    fatality_df.to_csv(root_directory + "fatality_counts/" + 'articles_' + str(_id) + '.csv', index=False)
    print('Saved CSV file, moving on to next article...', index + 1)
    print('.......')
    refresh_screen()
    print()