# Text preprocessing

### This notebook serves as an exercise to practice all text preprocessing steps on a given text document using 3 different libraries

### Load and explore the data

In [None]:
# Read the whole document into memory; the context manager closes the file.
# The encoding is stated explicitly so the result does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes data.txt is UTF-8 -- adjust if the file uses another encoding.
with open('data.txt', 'r', encoding='utf-8') as inputfile:
    data = inputfile.read()

In [None]:
# Show the raw text exactly as it was read from the file.
print(data)

### Data cleaning

In [None]:
# Strip bracketed markers such as "[12]" from the raw text.
import re

# Raw string: "\[" is not a valid Python string escape, so the pattern must
# not be written as a plain string literal.
# Negated character class: each match stays inside one pair of brackets on a
# single line, so text between two tags on the same line is preserved
# (a greedy ".+" would delete everything from the first "[" to the last "]").
data_clean = re.sub(r"\[[^\[\]\n]+\]", "", data)  # remove [NUM] tags

In [None]:
# Display the cleaned text (the value of a cell's last expression is shown).
data_clean

#### We can still see unneeded new-line (\n) characters, but the tokenizer will take care of those

### Tokenization

In [None]:
from nltk.tokenize import word_tokenize

# Split the cleaned text into one flat list of tokens.
data_clean_word_tokenized = word_tokenize(data_clean)
# Preview the first ten tokens.
data_clean_word_tokenized[:10]

#### Side note: some NLP tasks need the sentence boundaries to be preserved. In that case, first split the text into sentences, and then split each sentence into tokens

In [None]:
from nltk.tokenize import sent_tokenize

# Split the cleaned text into a list of sentences.
data_clean_sent_tokenized = sent_tokenize(data_clean)
# Preview the first two sentences.
data_clean_sent_tokenized[:2]

In [None]:
# Tokenize each sentence separately: the result is a list with one token list
# per sentence, so the sentence boundaries are kept.
data_clean_word_sent_tokenized = [word_tokenize(sentence) for sentence in data_clean_sent_tokenized]
# Preview the tokens of the first sentence.
data_clean_word_sent_tokenized[0]

### Remove punctuation and lowercase

In [None]:
# Keep only purely alphabetic tokens and lowercase them.
# NOTE: str.isalpha() drops more than punctuation -- any token containing a
# digit, hyphen or apostrophe is removed as well.
# This rebinds data_clean_word_tokenized, so re-running the cell operates on
# the already-filtered list.
data_clean_word_tokenized = [word.lower() for word in data_clean_word_tokenized if word.isalpha()]
# Preview the first ten remaining tokens.
data_clean_word_tokenized[:10]

### Removing stopwords

In [None]:
from nltk.corpus import stopwords

# Load the stopword list once and keep it in a set. Evaluating
# stopwords.words('english') inside the comprehension would rebuild the list
# for every token and then search it linearly.
english_stopwords = set(stopwords.words('english'))

# Keep only the tokens that are not English stopwords.
data_clean_word_tokenized = [word for word in data_clean_word_tokenized if word not in english_stopwords]
# Preview the first ten remaining tokens.
data_clean_word_tokenized[:10]

### Lemmatization / POS tagging

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import word_tokenize, pos_tag

def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    The decision is made on the tag's first letter:
    'J' -> wordnet.ADJ, 'V' -> wordnet.VERB, 'N' -> wordnet.NOUN,
    'R' -> wordnet.ADV. Any other tag yields the empty string.
    """
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for prefix, constant_name in prefix_table:
        if treebank_tag.startswith(prefix):
            # Look the constant up only for the matching prefix, so wordnet
            # is not touched at all for tags without a counterpart.
            return getattr(wordnet, constant_name)
    return ''
    
lemmatizer = WordNetLemmatizer()

# Tag the whole token list in a single call. Calling pos_tag([word]) once per
# token repeats the tagger set-up for every word and hides each word's
# neighbours from the tagger.
# NOTE: because the tagger now sees neighbouring tokens, some tags (and
# therefore some lemmas) may differ from word-by-word tagging.
tagged_tokens = pos_tag(data_clean_word_tokenized)

data_clean_word_lemmatized = []

for word, treebank_tag in tagged_tokens:
    pos = get_wordnet_pos(treebank_tag)
    if pos:
        data_clean_word_lemmatized.append(lemmatizer.lemmatize(word, pos))
    else:
        # No WordNet counterpart for this tag: keep the word unchanged.
        data_clean_word_lemmatized.append(word)

# Preview the first ten lemmatized tokens.
data_clean_word_lemmatized[:10]

## Now we have a dataset of pre-processed words
