# **Data PreProcessing**
Preprocessing data before extracting haunted features. 

Descriptions are parsed using [tika](https://github.com/chrismattmann/tika-python) with its default parser, `org.apache.tika.parser.DefaultParser`.



## **Clean Stop Words and Punctuation**
We found that datefinder and number parser work best when strings are cleaned of stopwords.

1. Stopwords cleaned using [natural_language_toolkit_python](https://www.nltk.org/)
    - Stopword Set: nltk.corpus.stopwords
2. Punctuation removed using the Python [string module](https://docs.python.org/3/library/string.html)
    - Punctuation Set: `string.punctuation` with `.` excluded (periods are kept in the cleaned text), i.e. ``!"#$%&'()*+,-/:;<=>?@[\]^_`{|}~``


In [None]:
import tika as tk
from tika import parser
import pandas as pd

## Extract Text from File ##

def extract_text(file):
    """
    Run Tika on a file, print the metadata it reports and return the text.

    Input:
        [file]  - path passed straight to tika's ``parser.from_file``
    Returns:
        the value stored under the "content" key of the Tika parse result
    """
    tika_result = tk.parser.from_file(file)

    # Look up both entries first so a missing key fails before anything is printed
    text, metadata = tika_result["content"], tika_result["metadata"]
    print(metadata)

    return text

## Pull the Raw Text Through Tika and Break it on Tabs ##
extracted_text = extract_text("../data/raw/haunted_places.tab")
extracted_text_list = extracted_text.split("\t")

## Load the Same File With Pandas to Locate the "description" Column ##
df = pd.read_csv("../data/raw/haunted_places.tab", delimiter="\t")
headers = list(df.columns)
num_rows = len(df)
# 0-based position of "description" in the header, shifted by one
column_idx = headers.index("description") + 1

## Pick One Cell per Data Row, Then Lowercase and Trim it ##
# NOTE(review): the +1 offset and the len(headers) stride assume a specific
# layout of Tika's tab-split output -- confirm against the actual parse result.
row_width = len(headers)
description_cells = [
    extracted_text_list[column_idx + row * row_width]
    for row in range(1, num_rows + 1)
]
extracted_descriptions = [cell.lower().strip() for cell in description_cells]

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

## Download StopWords and Tokenizer ##
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words("english"))
# Keep "I" and "We" for witness feature
stop_words.discard('i')
stop_words.discard('we')

# Punctuation to strip; '.' is left out so periods stay in the cleaned text.
# Built once here rather than once per token inside the cleaning loop.
punctuation_to_drop = string.punctuation.replace('.', '')


def _clean_description(text):
    """Return `text` re-joined from its tokens minus stopwords and punctuation tokens."""
    # NOTE(review): `word not in punctuation_to_drop` is a substring test
    # against a string, not a per-character set lookup. It is kept as-is so the
    # cleaned output is unchanged.
    kept = [
        word.lower().strip()
        for word in word_tokenize(text)
        if word not in stop_words and word not in punctuation_to_drop
    ]
    return " ".join(kept)


## Clean Every Description ##
cleaned_descriptions = [_clean_description(text) for text in extracted_descriptions]

## Write Cleaned Descriptions To df_out ##
# One column assignment instead of a per-cell .iloc write for every row;
# extracted_descriptions holds exactly one entry per row of df.
df_out = df.copy()
df_out["description"] = cleaned_descriptions

## Fill NAN Values ##
df_out["description"] = df_out["description"].fillna("Invalid_Description").astype(str)

# Removing Invalid Descriptions
# NOTE(review): 1063 is a hard-coded row label -- confirm it still points at
# the invalid row whenever the raw file changes.
df_out.drop(1063, inplace = True)
# df_out.to_csv("../data/processed/haunted_places_cleaned.tab", index = False, header = True, sep = "\t")

In [None]:

## Clean a Single Description (index 1) ##
# A space is inserted after every '.' before tokenizing -- presumably so a
# period glued to the next word still becomes its own token (confirm).
sample_words = word_tokenize(extracted_descriptions[1].replace('.', '. '))
punct_without_period = string.punctuation.replace('.', '')
cleaned_text = " ".join(
    word.lower().strip()
    for word in sample_words
    if not (word in stop_words or word in punct_without_period)
)
tokens = cleaned_text.split()

## Cut the Token Stream After Every '.' ##
sequences = []
current_seq = []
for token in tokens:
    current_seq.append(token)
    if token == '.':
        # The period closes the running sequence; start a fresh one
        sequences.append(current_seq)
        current_seq = []



In [67]:
def extractSequences(tokens : list[str], sepChar: str) -> list[list[str]]:
    '''
    Splits a flat list of tokens into sequences that each end at a separator token.
    Input:
        [tokens]    - List of tokens
        [sepChar]   - Token that closes a sequence; it is kept as the last
                      element of the sequence it closes
    Returns:
        Sequences   - list of token lists. Tokens that follow the final
                      separator are returned as one last, unterminated
                      sequence instead of being dropped
    '''
    Sequences = []
    currentSequence = []

    for token in tokens:
        currentSequence.append(token)
        # Compare against the caller's separator rather than a hard-coded '.'
        if token == sepChar:
            Sequences.append(currentSequence)
            currentSequence = []

    # Keep trailing tokens when the input does not end with the separator
    if currentSequence:
        Sequences.append(currentSequence)

    return Sequences
# Group the cleaned tokens into sequences, splitting on the '.' token
sequences = extractSequences(tokens, '.')
# Bare expression so the notebook shows the result as the cell output
sequences

[['little',
  'girl',
  'killed',
  'suddenly',
  'waiting',
  'school',
  'bus',
  'speeding',
  'car',
  '.'],
 ['since', 'death', 'things', 'happened', '.'],
 ['day', 'funeral', 'started', 'knocking', 'doors', 'house', 'born', '.'],
 ['1',
  'month',
  'later',
  'clothes',
  'removed',
  'doors',
  'home',
  'flew',
  'open',
  'reason',
  '.'],
 ['cold',
  'spots',
  'felt',
  'knocking',
  'door',
  'still',
  'present',
  'day',
  'home',
  '.'],
 ['also',
  'swings',
  'yard',
  'could',
  'see',
  'swing',
  'real',
  'high',
  'someone',
  'swinging',
  '.'],
 ['lights',
  'like',
  'go',
  'home',
  'move',
  'things',
  'let',
  'someone',
  'know',
  '.'],
 ['cemetery', 'child', 'lies', 'alot', 'activity', 'also', '.'],
 ['visit',
  'grave',
  'twirly',
  'start',
  'going',
  'around',
  'round',
  'reason',
  'wind',
  'blowing',
  'let',
  'know',
  'present',
  '.'],
 ['said',
  'spotted',
  'walking',
  'cemetery',
  'restless',
  "n't",
  'realize',
  'passed',
  '.'