# 1.04 - Witness Feature Extraction
## **Haunted Places Witness Count**

Numbers extracted using [number-parser](https://github.com/scrapinghub/number-parser)

**"Haunted_Places_Witness_Count" [int]**
- Format: int
- Default Value: 0

**NOTES**:
- Considerations:
    - Multiple witness accounts in the same entry (sum)?
    - How many is "Several"?
- Regex using pronouns
    - "I", "we", "me"
    - "Several", "some", "they"

    

In [None]:
# System Path #
import os
import sys 

# Pandas #
import pandas as pd
import time
import re

# Number Parser #
from number_parser import parse_ordinal
from number_parser import parse_number
from number_parser import parse

# Tika #
import tika as tk
from tika import parser



# Resolve the directory one level above the current working directory and
# append it to the import search path (presumably so project-local packages
# can be imported from this notebook -- confirm against the repo layout).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(parent_dir)


# Load the cleaned haunted-places table (tab-separated) into a DataFrame
df = pd.read_csv("../data/processed/haunted_places_cleaned.tab", sep = "\t")

### Common Witness Nouns

In [708]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet
from collections import Counter

# Download necessary resources (only need to run once)
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('averaged_perceptron_tagger_eng')

def extract_nouns(text):
    '''
    Return the noun tokens of a text, in their original order.

    The text is tokenized with word_tokenize and tagged with pos_tag; only
    tokens tagged NN, NNS, NNP or NNPS are kept.

    Input:
        [text]   - raw text
    Returns:
        [nouns]  - list of tokens carrying one of the noun tags
    '''
    noun_tags = ("NN", "NNS", "NNP", "NNPS")

    nouns = []
    for token, pos in pos_tag(word_tokenize(text)):
        if pos in noun_tags:
            nouns.append(token)

    return nouns


# Tally nouns and determiners over every description in ONE pass, so each
# description is whitespace-split and POS-tagged only once (previously the
# whole corpus was tagged twice, once per counter).
noun_counts = Counter()
quantifer_counts = Counter()

for text in df['description']:
    sequence = text.split()
    for word, tag in pos_tag(sequence):
        # Exploration aid: echo every description containing the exact token
        # 'vehicle'; such tokens are deliberately not added to noun_counts.
        if word in (['vehicle']):
            print(" ".join(sequence))
        # Noun tags: NN (singular), NNS (plural), NNP / NNPS (proper nouns)
        elif tag in ["NN", "NNS", "NNP", "NNPS"]:
            noun_counts[word.lower()] += 1

        # Determiners (DT) and pre-determiners (PDT) are counted independently
        # of the branch above, exactly as the separate second loop used to do.
        if tag in ["DT", "PDT"]:
            quantifer_counts[word.lower()] += 1

# Display the noun tally as the cell output
noun_counts

sacred heart academy statue mary hands raised air . . see drive front yard . urban legend goes 2 young boys playing statue . supposed come home streetlights come never . parents started looking found bloody message base statue said `` n't let kids play dark '' . fingers statue cut blood coming . go today 'll see fingers cut eyes gouged . august 2005 update believed hoax . visitor site reports hands statue intact .
story goes group kids playing chicken near denton road bridge one proved chicken . car swerved road reached bridge crashed river . many people claim see light come river chase vehicle end road stop center bridge night . november 2003 update road reconfigured area bridge longer exists . say hauntings longer occur .
mary said run girls bathroom second floor crying teacher screamed . tryed climb vent get school . didnt know vent old air conditioning vent turned automatically . said climbed screamed blades cut body died vent . dark stain beneath vent door stall removed .
right wi

Counter({'night': 3578,
         'people': 3190,
         'house': 2248,
         'room': 2179,
         'man': 2143,
         'ghost': 1928,
         'road': 1855,
         'school': 1725,
         'floor': 1697,
         'woman': 1486,
         'building': 1467,
         'lights': 1397,
         'cemetery': 1345,
         'years': 1316,
         'area': 1217,
         'car': 1204,
         'time': 1144,
         'girl': 1119,
         'doors': 1076,
         'place': 1016,
         'reports': 987,
         'bridge': 952,
         'children': 940,
         'things': 924,
         'day': 895,
         'someone': 871,
         'hall': 831,
         'noises': 744,
         'ghosts': 702,
         'story': 701,
         'students': 691,
         'voices': 662,
         'footsteps': 653,
         'home': 647,
         'door': 631,
         'times': 625,
         'woods': 591,
         'sightings': 586,
         'boy': 580,
         'rooms': 575,
         'heard': 564,
         'church': 54

In [706]:
# Look up the count stored under the key 'vehicle' in noun_counts
noun_counts['vehicle']

99

### Parsing First Person

test case:
- idx: 501

    told see light changes colors red white floats around one ever found unsolved mysteries could find i many time seen story older man went save two kids train pushed way got decapitated wonders woods looking head holding lantern
    - goal : 1
- idx: 761

    said area still used kkk bridge said people killed bridge put keys bridge 5 mins car start road bad tho really hilly also winter maintance alarmed we recentaly walkedup hill looks like path said top meeting ground we walking path i turned light camera woods we seen white figure we stick around long enough investigate october 2004 correction family died bridge early setlers area late 1700 's county began settled reported mother frantically looking childen underbrush hear rustling bushes plaintive call distance reported carries glowing taper lantern
    - goal : 2
- idx: 876

    strange noises figures seen i seen heard doors slam shut lights flicker early hours morning rooms become chillingly cold also enter building get feeling followed even though one around
    - goal : 1


In [None]:
import json
from itertools import chain
from dsci_550_a1.parsingFunctions import extractSequences

# Witness verb lexicon (JSON); its values are later flattened into one verb set
with open("../data/keywords/witness_verbs.json", "r") as verbs_file:
    witness_verbs = json.load(verbs_file)


## First Person Regex Check

def first_person_regex_check(text):
    '''
    Estimate the witness count implied by first-person pronouns in a text.

    The text is split on whitespace and grouped into sequences with
    extractSequences(tokens, '.'). Sequences are scanned in order; the first
    token that is exactly "i" or "we" (compared case-insensitively) and has a
    witness verb (any value in witness_verbs) at or after its position in the
    same sequence decides the result.

    Input:
        [text]  - raw text
    Returns:
        [count] - 2 if the deciding pronoun is "we", 1 if it is "i",
                  0 if no pronoun qualifies
    '''
    ## Extract sequences
    sequences = extractSequences(text.split(), '.')

    ## Init verb set ##
    verb_set = set(chain(*witness_verbs.values()))

    ## Witness count contributed by each first-person pronoun
    pronoun_counts = {'i': 1, 'we': 2}

    for sequence in sequences:
        for idx, token in enumerate(sequence):

            # Normalise case before the lookup. The previous version matched
            # with re.IGNORECASE but then compared the raw token to "i"/"we",
            # so capitalised pronouns ("I", "We") matched and were then dropped.
            count = pronoun_counts.get(token.lower())
            if count is None:
                continue

            # The pronoun only counts if a witness verb appears in the rest of
            # the sequence (slicing cannot raise, so no exception guard needed)
            if any(word in verb_set for word in sequence[idx:]):
                return count

    return 0


# (row index, expected witness count) pairs for the first-person check
test_cases = [(501, 1), (761, 2), (876,1)]


# Print each test description next to its expected and extracted witness count
for index,target in test_cases:
    description = df['description'][index]
    extracted_witnesses = first_person_regex_check(description)
    print("-"*50, f"Test_Case: {index}", description, f"Target Witnesses: {target} | Extracted Witnesses: {extracted_witnesses}", "-"*50, sep = "\n")



# Apply the first-person check to every description and show how often each count occurs
df['Haunted_Places_Witness_Count'] = df['description'].apply(first_person_regex_check)
df['Haunted_Places_Witness_Count'].value_counts()

### Parsing Ambiguous Quantifiers
- idx: 1012

    rumored ghost edgar allan poe still exists eutaw house restaurant historic inn/restaurant central pa history hauntings researching ghost stories pa article mother i went talk  haunting  shortly we left horse bells dining area door began ring **several people reported** feeling `` unusually uncomfortable watched '' went upstairs restrooms
    - goal : 3


In [None]:

# Word -> digit-string lookup for vague quantifiers:
# "pair"/"couple" stand for 2, every other listed word is treated as 3.
quantifiers = {
    'pair': "2",
    'couple': "2",
    'many': "3",
    'several': "3",
    'some': "3",
    'few': "3",
    'group': "3",
    'groups': "3",
}


def parse_ambiguous(text: str) -> str:
    '''
    Swap vague quantity words (e.g. "several", "couple") for digit strings.

    Input:
        [text]  - raw text; it is split on whitespace
    Returns:
        [text]  - the tokens re-joined with single spaces, where every token
                  found in `quantifiers` is replaced by its mapped number
    '''
    replaced = (quantifiers.get(word, word) for word in text.split())
    return " ".join(replaced)


# Preview the quantifier replacement on the example row (idx 1012)
parse_ambiguous(df['description'][1012])




### Eyewitness Extraction (no sliding window)

In [None]:
# Load the witness noun / verb lexicons. The context managers close each file
# handle as soon as it has been parsed (json.load(open(...)) left them open).
with open("../data/keywords/witness_nouns.json", "r") as nouns_file:
    witness_nouns = json.load(nouns_file)
with open("../data/keywords/witness_verbs.json", "r") as verbs_file:
    witness_verbs = json.load(verbs_file)

def extract_eyewitness_counts(text: str) -> int:
    '''
    Extract number of witnesses from a block of text.

    Steps:
    1. Tokenize on whitespace and group tokens with extractSequences(tokens, '.')
    2. In every sequence, find tokens matching a singular or plural witness noun
    3. Require a witness verb at or after the noun within the same sequence
    4. If the token right before the noun is a digit string, add that number
    5. Otherwise add the default: +1 for singular, +3 for plural

    Every qualifying noun token is counted, so one sequence can contribute
    several times.

    Input:
        [text]            - raw text with quantifiers converted to digits
    Returns:
        [witness_counts]  - Number of witnesses

    eg (result depends on the keyword files):
    >>> extract_eyewitness_counts("2 girls names elizabeth evelyn felt legs pulled sitting bleachers.")
    2
    '''
    ## Extract Tokens and Sequences ##
    # NOTE(review): unlike the sliding-window variant, periods are not padded
    # with spaces here, so a period glued to a word stays part of that token.
    sequences = extractSequences(text.split(), '.')

    ## Noun Regex Patterns. See witness_nouns.json ##
    singular_noun_pattern = re.compile(r"\b(" + "|".join(map(re.escape, list(chain(*witness_nouns['Singular'].values())))) + r")\b", re.IGNORECASE)
    plural_noun_pattern = re.compile(r"\b(" + "|".join(map(re.escape, list(chain(*witness_nouns['Plural'].values())))) + r")\b", re.IGNORECASE)

    ## (default count, pattern): singular nouns add 1, plural nouns add 3 ##
    noun_patterns = (
        (1, singular_noun_pattern),
        (3, plural_noun_pattern),
    )

    ## Verb Set ##
    verb_set = set(chain(*witness_verbs.values()))

    ## Initialize Witness Count ##
    witness_count = 0

    for sequence in sequences:
        for default_value, regex_pattern in noun_patterns:
            for idx, token in enumerate(sequence):

                # Only tokens that start with a witness noun are candidates
                if not regex_pattern.match(token):
                    continue

                # The noun must be followed (in this sequence) by a witness verb
                if not any(word in verb_set for word in sequence[idx:]):
                    continue

                # An explicit digit right before the noun overrides the default.
                # (Debug prints of the matched token were removed: this function
                # is a pure counter and should not write to stdout.)
                prev_token = sequence[idx - 1] if idx > 0 else None
                if prev_token and prev_token.isdigit():
                    witness_count += int(prev_token)
                else:
                    witness_count += default_value

    return witness_count
                    


### Eyewitness Extraction (with sliding window)

In [None]:
# Load the witness noun / verb lexicons. The context managers close each file
# handle as soon as it has been parsed (json.load(open(...)) left them open).
with open("../data/keywords/witness_nouns.json", "r") as nouns_file:
    witness_nouns = json.load(nouns_file)
with open("../data/keywords/witness_verbs.json", "r") as verbs_file:
    witness_verbs = json.load(verbs_file)

def extract_eyewitness_counts_slide(text: str, window: int = 2) -> tuple:
    '''
    Extract number of witnesses from a block of text using sliding window.

    Steps:
    1. Tokenize (periods are padded with spaces so they become their own
       tokens) and group tokens with extractSequences(tokens, '.')
       (presumably one sequence per sentence)
    2. Start the window at the first sentence
    3. If the sentence passes first_person_regex_check, add its result
       (1 or 2) and move on to the next sentence
    4. Otherwise look for the first singular witness noun in the sentence,
       falling back to the first plural witness noun
    5. Check the sentence and the following `window - 1` sentences for a
       witness-specific verb
    6. If both a noun and a verb are found, increment witness_count by the
       digit token preceding the noun when it is below 16, otherwise by the
       default (+1 for singular, +2 for plural)
    7. Move the sliding window to the sentence following the witness verb;
       if nothing was counted, move to the next sentence
    8. Loop until the final sentence

    Input:
        [text]            - raw text with quantifiers converted to digits
        [window]          - number of sentences, starting sentence included,
                            searched for a witness verb (default 2)
    Returns:
        [witness_counts]  - Number of witnesses
        [witnesses]       - list of [token, count] pairs for flagged tokens;
                            first-person hits are recorded as ['i|we', '1|2']

    eg (result depends on the keyword files):
    >>> extract_eyewitness_counts_slide("2 girls names elizabeth evelyn felt legs pulled sitting bleachers.")
    (2, [['girls', 2]])
    '''

    ## Extract Tokens ##
    tokens = text.replace('.', ' . ').split()

    ## Extract Sequences ##
    sequences = extractSequences(tokens, '.')

    ## Noun Regex Patterns. See witness_nouns.json ##
    singular_noun_pattern = re.compile(r"\b(" + "|".join(map(re.escape, list(chain(*witness_nouns['Singular'].values())))) + r")\b", re.IGNORECASE)
    plural_noun_pattern = re.compile(r"\b(" + "|".join(map(re.escape, list(chain(*witness_nouns['Plural'].values())))) + r")\b", re.IGNORECASE)

    ## (default count, pattern), tried in this order: singular, then plural ##
    noun_patterns = (
        (1, singular_noun_pattern),
        (2, plural_noun_pattern),
    )

    ## Verb Set ##
    verb_set = set(chain(*witness_verbs.values()))

    ## Initialize Witness Count and witness list ##
    witnesses = []
    witness_count = 0
    num_sequences = len(sequences)

    i = 0
    while i < num_sequences:
        starting_sequence = sequences[i]

        ## Check First Person Regex ##
        first_person_witnesses = first_person_regex_check(" ".join(starting_sequence))
        if first_person_witnesses != 0:
            witness_count += first_person_witnesses
            witnesses.append(['i|we', '1|2'])
            # Keep scanning from the next sentence. The previous `break` ended
            # the whole scan here, so every later sentence was ignored.
            i += 1
            continue

        ## Index of the first sentence inside the window that holds a witness verb ##
        verb_idx = next(
            (
                j
                for j in range(i, min(i + window, num_sequences))
                if any(word in verb_set for word in sequences[j])
            ),
            None,
        )

        ## First witness noun in the starting sentence (singular before plural) ##
        noun_hit = None
        if verb_idx is not None:
            for default_value, regex_pattern in noun_patterns:
                noun_idx = next(
                    (
                        idx
                        for idx, token in enumerate(starting_sequence)
                        if regex_pattern.match(token)
                    ),
                    None,
                )
                if noun_idx is not None:
                    noun_hit = (noun_idx, default_value)
                    break

        ## No noun/verb pairing: start again at the next sentence ##
        if noun_hit is None:
            i += 1
            continue

        noun_idx, default_value = noun_hit
        prev_token = starting_sequence[noun_idx - 1] if noun_idx > 0 else None

        # A digit right before the noun is used as the count;
        # quantifiers over 15 are filtered out and fall back to the default
        if prev_token and prev_token.isdigit() and int(prev_token) < 16:
            value = int(prev_token)
        else:
            value = default_value

        witness_count += value
        witnesses.append([starting_sequence[noun_idx], value])

        # Resume at the sentence right after the witness verb. Previously the
        # index was advanced once more after a match (skipping a sentence) and
        # the plural pass still ran against the already-consumed sentence.
        i = verb_idx + 1

    return witness_count, witnesses

### Test Cases

In [None]:
# [row index, expected witness count] pairs for the sliding-window extractor
test_cases = [[1176, 16], [1185,3], [1237,1], [1412,2], [1426, 0], [1428, 0], [501, 1], [761, 2], [876,1]]

for index,target in test_cases:
    description = df['description'][index]
    # Ambiguous quantifiers are replaced first, then number_parser's parse is applied
    description = parse(parse_ambiguous(description))
    # The extractor returns (count, flagged tokens)
    extracted_witnesses, witnesses = extract_eyewitness_counts_slide(description)
    # Print the parsed text with the expected count, extracted count and flagged tokens
    print("-"*50, f"Test_Case: {index}", description, f"Target Witnesses: {target} | Extracted Witnesses: {extracted_witnesses}", f"flagged tokens: {witnesses}", "-"*50, sep = "\n")


### Extraction

#### Parse Quantifiers 
    

In [None]:
from tqdm import tqdm
# Enable the progress_apply method used below
tqdm.pandas()

# Step 1: run number_parser's parse over every description (timed)
t1 = time.time()
parsed_decriptions = df["description"].progress_apply(parse)
t2 = time.time()
# Step 2: run parse_ambiguous over the step-1 output (timed)
# NOTE(review): `parsed_decriptions` (step 1 only) and `parsed_descriptions`
# (steps 1 + 2) are two distinct variables whose names differ by one letter.
parsed_descriptions = parsed_decriptions.progress_apply(parse_ambiguous)
t3 = time.time()

# Report how long each parsing step took
print("-" * 150, "Quantifiers Parsed", "-" * 150)
print(f"'number scraper' runtime: {t2 - t1:.6f} seconds", end = "\n")
print(f"'parse_ambiguous' runtime: {t3 - t2:.6f} seconds", end = "\n\n")
print("-" * 150)

In [None]:
# Run the sliding-window extractor over the fully parsed descriptions, i.e.
# `parsed_descriptions`, where number words AND ambiguous quantifiers have been
# converted to digits. Iterating `parsed_decriptions` (number_parser output
# only) silently skipped the parse_ambiguous step.
# Each stored element is the (count, witnesses) tuple returned by the extractor.
start = time.time()
df["Haunted_Places_Witness_Count"] = [
    extract_eyewitness_counts_slide(entry)
    for entry in tqdm(parsed_descriptions, desc = "Processing Entries")
]
end = time.time()


#### Unpack Outputs

In [None]:
# Witness Names
df["Haunted_Places_Witnesses"] = df["Haunted_Places_Witness_Count"].apply(lambda x: x[1])
# Witness Counts
df["Haunted_Places_Witness_Count"] = df["Haunted_Places_Witness_Count"].apply(lambda x: x[0])

In [None]:
# Tally how often each flagged witness token occurs across all entries;
# every entry is a list of [token, count] pairs and only the token is used.
witness_name_counter = Counter(
    name for name, _ in chain.from_iterable(df["Haunted_Places_Witnesses"])
)


In [None]:
# Summary report: runtime, distribution of witness counts, share of rows with
# a non-zero count, and the most / least frequent flagged witness tokens.
print("-" * 150, "Extraction Completed", "-" * 150)
print(f"Extraction Took: {end - start:.6f} seconds", end = "\n\n")
print("Value Counts", df["Haunted_Places_Witness_Count"].value_counts(), sep = "\n")
print(f"approximate coverage: {(df['Haunted_Places_Witness_Count'] != 0).sum() / df.shape[0]}")
print()
print("-" * 150)
print("Haunted Places Witnesses", "-" * 150, sep = "\n")
print("10 Most Common Witness Names:", witness_name_counter.most_common(10), sep = "\n")
# most_common() is ordered from most to least frequent, so the last 10 items
# are the least common ones. The earlier [-11:-1] slice dropped the final
# (rarest) entry and showed ranks 2-11 from the bottom instead.
print("10 Least Common Witness Names:", witness_name_counter.most_common()[-10:], sep = "\n")
print("-" * 150)