# 1.04 - Witness Feature Extraction
## **Haunted Places Witness Count**

Numbers extracted using [number-parser](https://github.com/scrapinghub/number-parser)

**"Haunted_Places_Witness_Count" [int]**
- Format: int
- Default Value: 0

**NOTES**:
- Considerations:
    - Multiple witness accounts in the same entry (sum)?
    - How many is "Several"?
- Regex using pronouns
    - "I", "we", "me"
    - "Several", "some", "they"

    

In [None]:
# System Path #
import os
import sys 

# Pandas #
import pandas as pd
import time
import re

# Number Parser #
from number_parser import parse_ordinal
from number_parser import parse_number
from number_parser import parse

# Tika #
import tika as tk
from tika import parser



# Put the directory above the current working directory on the import path.
working_dir = os.getcwd()
parent_dir = os.path.abspath(os.path.join(working_dir, ".."))
sys.path.append(parent_dir)

# Load the tab-separated haunted places file into a DataFrame.
df = pd.read_csv(
    "../data/processed/haunted_places_cleaned.tab",
    sep="\t",
)

In [None]:
# Column names, row count, and the 1-based position of the "description" column.
headers = list(df.columns)
num_rows = len(df)
column_number = 1 + headers.index("description")

# Each test case is an (entry_number, expected_output) pair.
test_cases = [
    [1177, 16],
    [1186, 3],
    [1238, 1],
    [1413, 3],
    [1427, 0],
    [1428, 0],
]

In [None]:
# Display the description stored under row label 561.
df.loc[561, 'description']

### Common Witness Nouns

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import wordnet
from collections import Counter

# Download necessary resources (only need to run once)
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('averaged_perceptron_tagger_eng')

def extract_nouns(text):
    """Return the noun tokens of *text*, in their original order.

    The text is tokenized with ``word_tokenize`` and tagged with ``pos_tag``;
    tokens tagged NN (singular), NNS (plural), NNP (proper noun, singular) or
    NNPS (proper noun, plural) are kept.
    """
    noun_tags = ["NN", "NNS", "NNP", "NNPS"]

    kept = []
    for token, pos in pos_tag(word_tokenize(text)):
        if pos in noun_tags:
            kept.append(token)
    return kept


# Tally nouns and determiners over every description in a single pass, so each
# entry is split and POS-tagged once instead of once per counter.
noun_tags = ("NN", "NNS", "NNP", "NNPS")
determiner_tags = ("DT", "PDT")

noun_counts = Counter()
quantifer_counts = Counter()
for description in df['description']:
    # Tokens come from a plain whitespace split (not word_tokenize).
    for word, tag in pos_tag(description.split()):
        if tag in noun_tags:
            noun_counts[word.lower()] += 1
        elif tag in determiner_tags:
            quantifer_counts[word.lower()] += 1

# Last expression of the cell: show the noun frequencies.
noun_counts

### Parsing First Person

test case:
- idx: 501

    told see light changes colors red white floats around one ever found unsolved mysteries could find i many time seen story older man went save two kids train pushed way got decapitated wonders woods looking head holding lantern
    - goal : 1
- idx: 761

    said area still used kkk bridge said people killed bridge put keys bridge 5 mins car start road bad tho really hilly also winter maintance alarmed we recentaly walkedup hill looks like path said top meeting ground we walking path i turned light camera woods we seen white figure we stick around long enough investigate october 2004 correction family died bridge early setlers area late 1700 's county began settled reported mother frantically looking childen underbrush hear rustling bushes plaintive call distance reported carries glowing taper lantern
    - goal : 2
- idx: 876

    strange noises figures seen i seen heard doors slam shut lights flicker early hours morning rooms become chillingly cold also enter building get feeling followed even though one around
    - goal : 1


In [None]:
import json
from itertools import chain
# Verb Dictionary
# The encoding is stated explicitly so the load does not depend on the
# platform's locale default.
with open("../data/keywords/witness_verbs.json", "r", encoding="utf-8") as file:
    witness_verbs = json.load(file)


## First Person Regex Check

# Matches a first-person pronoun inside a whitespace-delimited token, so
# "i", "We" and "i've" all qualify.
_FIRST_PERSON_PATTERN = re.compile(r"\bi\b|\bwe\b|\bme\b", re.IGNORECASE)


def first_person_regex_check(text, verbs=None):
    """Estimate a witness count from first-person pronouns in *text*.

    A pronoun ("i", "we", "me") only counts when a witness verb appears in the
    five-token window that starts at the pronoun itself. The first pronoun
    that satisfies this decides the result.

    Args:
        text: Description to scan. Non-string values (e.g. NaN from a missing
            cell) yield 0.
        verbs: Optional mapping whose values are iterables of verb strings.
            Defaults to the module-level ``witness_verbs`` dictionary.

    Returns:
        3 when the qualifying pronoun is "we" (first person plural),
        1 for "i" or "me" (first person singular), and 0 when no pronoun is
        followed closely by a witness verb.
    """
    if not isinstance(text, str):
        return 0
    if verbs is None:
        verbs = witness_verbs

    # Flatten the verb groups once per call instead of once per token.
    verb_lookup = set(chain.from_iterable(verbs.values()))

    tokens = text.split()
    for idx, token in enumerate(tokens):
        pronoun = _FIRST_PERSON_PATTERN.search(token)
        if pronoun is None:
            continue
        # Slicing past the end of the list is safe, so no IndexError handling
        # is needed. An empty overlap means this pronoun is not a sighting.
        if verb_lookup.isdisjoint(tokens[idx:idx + 5]):
            continue
        # Compare the matched pronoun, not the raw token, so "We" and "we,"
        # are still recognised as plural.
        return 3 if pronoun.group(0).lower() == "we" else 1
    return 0

# Score every description, then show how often each witness count occurs.
witness_column = 'Haunted_Places_Witness_Count'
df[witness_column] = df['description'].apply(first_person_regex_check)
df[witness_column].value_counts()

Haunted_Places_Witness_Count
0    10550
1      268
3      173
Name: count, dtype: int64

### How Many is "Several?"
- idx: 1012

    rumored ghost edgar allan poe still exists eutaw house restaurant historic inn/restaurant central pa history hauntings researching ghost stories pa article mother i went talk  haunting  shortly we left horse bells dining area door began ring **several people reported** feeling `` unusually uncomfortable watched '' went upstairs restrooms
    - goal : 3


In [None]:

# Vague quantity words, keyed by the digit string that replaces them.
quantifiers = {
    "2": ['pair', 'couple'],
    "3": ['many', 'several', 'some', 'few', 'group', 'groups'],
}


def _whole_word_regex(words):
    """Compile a case-insensitive pattern matching any of *words* as a whole word."""
    alternatives = "|".join(re.escape(word) for word in words)
    return re.compile(r"\b(" + alternatives + r")\b", re.IGNORECASE)


# One compiled pattern per replacement digit.
quantifiers_regex = {
    digit: _whole_word_regex(words) for digit, words in quantifiers.items()
}

def parse_numbers(text):
    """Return *text* with quantity words and number words replaced.

    Each whitespace-delimited token that matches an entry of
    ``quantifiers_regex`` is replaced by that entry's digit string; every
    token is then passed through ``parse`` and the tokens are re-joined with
    single spaces.
    """

    def _replace_quantifier(token):
        # When several patterns match, the last one in the mapping wins.
        replacement = token
        for number, regex in quantifiers_regex.items():
            if regex.search(token):
                replacement = number
        return replacement

    substituted = [_replace_quantifier(token) for token in text.split()]
    return " ".join([parse(token) for token in substituted])

# Try the parser on the entry with row label 1012.
parse_numbers(df.loc[1012, 'description'])




"rumored ghost edgar allan poe still exists eutaw house restaurant historic inn/restaurant central pa history hauntings researching ghost stories pa article mother i went talk `` haunting '' shortly we left horse bells dining area door began ring 3 people reported feeling `` unusually uncomfortable watched '' went upstairs restrooms"

### Noun Parser

In [None]:
# Load the witness noun dictionary inside a context manager so the file handle
# is closed, with the encoding stated rather than left to the locale default.
with open("../data/keywords/witness_nouns.json", "r", encoding="utf-8") as nouns_file:
    witness_nouns = json.load(nouns_file)

# parse nouns

# parse numbers

# parse singular pronouns
test_cases = [[1177, 16], [1186, 3], [1238, 1], [1413, 3], [1427, 0], [1428, 0]]

def extract_eyewitness_counts(text, nouns=None, verbs=None):
    """Estimate a witness count from the first witness noun near a witness verb.

    A token qualifies when it contains a witness noun and a witness verb
    appears in the five-token window that starts at that token. The first
    qualifying token decides the result.

    Args:
        text: Description to scan. Non-string values (e.g. NaN from a missing
            cell) yield 0.
        nouns: Optional mapping whose values are iterables of noun strings.
            Defaults to the module-level ``witness_nouns`` dictionary.
        verbs: Optional mapping whose values are iterables of verb strings.
            Defaults to the module-level ``witness_verbs`` dictionary.

    Returns:
        The integer immediately before the noun when that token is all digits;
        otherwise 2 if the noun token ends in "s" and 1 if it does not.
        Returns 0 when no noun qualifies.
    """
    if not isinstance(text, str):
        return 0
    if nouns is None:
        nouns = witness_nouns
    if verbs is None:
        verbs = witness_verbs

    noun_words = list(chain.from_iterable(nouns.values()))
    if not noun_words:
        # An empty alternation would match every token.
        return 0
    noun_pattern = re.compile(
        r"\b(" + "|".join(map(re.escape, noun_words)) + r")\b",
        re.IGNORECASE,
    )
    verb_lookup = set(chain.from_iterable(verbs.values()))

    tokens = text.split()
    for idx, token in enumerate(tokens):
        if not noun_pattern.search(token):
            continue
        if verb_lookup.isdisjoint(tokens[idx:idx + 5]):
            continue
        # Only look at a preceding token when one exists: at idx == 0 the
        # index -1 would wrap around to the last token of the text.
        if idx > 0 and tokens[idx - 1].isdigit():
            return int(tokens[idx - 1])
        return 2 if token.endswith('s') else 1
    return 0
                        


# Normalise the numbers in the entry with row label 1412, then count witnesses.
sample_description = df.loc[1412, 'description']
test_str = parse_numbers(sample_description)
extract_eyewitness_counts(test_str)


In [None]:
from dsci_550_a1.parsingFunctions import extractSequences

'/Users/dgottschalk/Desktop/SP_25/DSCI_550/Assignments/dsci_550_a1/notebooks'

In [None]:
def extract_eyewitness(text):
    """Return the result of passing *text* through ``parse``."""
    parsed_text = parse(text)
    return parsed_text