Data preprocessing
===

---

In [57]:
import pandas as pd
import os
import re
import inflect
from nltk import ConcordanceIndex
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# import nltk # TODO really needed?


# Root directory that is searched recursively for the *.txt source files.
TEXT_PATH = "./data/text"


# lemma = WordNetLemmatizer() # TODO really needed?
# p = inflect.engine() # TODO really needed?

# Import stopwords

Text analysis is usually more accurate when the most common words are removed from the list of tokens.
Such very common words are called stop words, and stop-word lists are available for many languages. Here we import the English stop words from the `nltk` module.

In [58]:
from nltk.corpus import stopwords

# nltk.download('stopwords')

# English stop-word list from NLTK; the "stopwords" corpus must already be
# downloaded (see the commented-out nltk.download call above).
stop_words = stopwords.words("english")
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

# Collect texts from files and create a DataFrame for the dataset

In [59]:
from glob import glob


def create_dataframe_from_path(path):
    """
    Searches the entire directory structure for textfiles
    and stores the content in a pandas dataframe.

    File names are split on "-"; the first three fields become the
    ``author``, ``year`` and ``title`` columns. Because only the third
    field is used, ``title`` ends at the next hyphen (if any) and still
    carries the ``.txt`` extension.

    Parameters
    ----------
    path : str
        Root directory; every ``*.txt`` file below it, at any depth, is read
        as UTF-8.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns ``file``, ``author``, ``year``,
        ``title`` and ``text``. Empty (no columns) when no file is found.

    Raises
    ------
    IndexError
        If a file name contains fewer than two hyphens.
    """

    data = []

    filepaths = glob(path + "/**/*.txt", recursive=True)

    for filepath in filepaths:
        filename = os.path.basename(filepath)

        # Split once and reuse the fields instead of re-splitting per column.
        fields = filename.split("-")
        author, year, title = fields[0], fields[1], fields[2]

        # The context manager closes the handle right after reading, so a
        # large corpus does not accumulate open file descriptors.
        with open(filepath, encoding="utf-8") as text_file:
            text = text_file.read()

        data.append(
            {
                "file": filename,
                "author": author,
                "year": year,
                "title": title,
                "text": text,
            }
        )

    return pd.DataFrame(data)


# Build the corpus dataframe from every text file found below TEXT_PATH.
df = create_dataframe_from_path(TEXT_PATH)

# Preview the first rows.
df.head()

Unnamed: 0,file,author,year,title,text
0,CareyGeorge Saville-1770-Analects in verse an2...,CareyGeorge Saville,1770,Analects in verse an22.txt,\n\nA Room in Brainly's House.\n\n\n\n\n\n\n\n...
1,CentlivreSusanna-1707-The platonick lady 26.txt,CentlivreSusanna,1707,The platonick lady 26.txt,"\n\n\n\nAS I was saying, Sir, I have advanc'd ..."
2,KingThomas-1769-Wit's last stake A 19.txt,KingThomas,1769,Wit's last stake A 19.txt,"MARTIN and LUCETTA meeting.\n\n\n\nMR. Martin,..."
3,PhilipsWilliam-1700-St StephensGreen 213.txt,PhilipsWilliam,1700,St StephensGreen 213.txt,"\n\n\n\n\nWELL, well, Aemilia, You may pre∣ten..."
4,Middleton Thomas-1606-The Revenger's Tragedy.txt,Middleton Thomas,1606,The Revenger's Tragedy.txt,duke royal lecher go grey haired adultery \nan...


# Functions to clean the text and create sentences


In [65]:
def clean_lowercase(input):
    """
    Convert the given value to its string form and return that string
    with every cased character in lower case.
    """
    return str(input).lower()


def text_to_sentences(input):
    """
    Cut a text into sentence-like pieces.

    Every em dash is removed first; the remaining text is then split at each
    period, colon, comma, exclamation mark, question mark and newline.
    Empty strings between adjacent delimiters are kept in the returned list.
    """
    without_dashes = re.sub("—", "", input)
    return re.split("[.:,!?\n]", without_dashes)


def clean_divide(text):
    """
    Lower-case the text with clean_lowercase, then split the result into
    pieces with text_to_sentences and return that list.
    """
    return text_to_sentences(clean_lowercase(text))

## Apply the functions to clean the text and divide it into sentences

In [66]:
# Work on a copy so the raw dataframe `df` stays untouched.
df2 = df.copy()
# Replace each raw text with the list of lower-cased pieces from clean_divide.
df2["text"] = df2["text"].apply(clean_divide)
df2

Unnamed: 0,file,author,year,title,text
0,CareyGeorge Saville-1770-Analects in verse an2...,CareyGeorge Saville,1770,Analects in verse an22.txt,"[, , a room in brainly's house, , , , , , , , ..."
1,CentlivreSusanna-1707-The platonick lady 26.txt,CentlivreSusanna,1707,The platonick lady 26.txt,"[, , , , as i was saying, sir, i have advanc..."
2,KingThomas-1769-Wit's last stake A 19.txt,KingThomas,1769,Wit's last stake A 19.txt,"[martin and lucetta meeting, , , , , mr, mart..."
3,PhilipsWilliam-1700-St StephensGreen 213.txt,PhilipsWilliam,1700,St StephensGreen 213.txt,"[, , , , , well, well, aemilia, you may pre..."
4,Middleton Thomas-1606-The Revenger's Tragedy.txt,Middleton Thomas,1606,The Revenger's Tragedy.txt,"[duke royal lecher go grey haired adultery , a..."
...,...,...,...,...,...
927,MountfortWilliam-1688-The injur'd lovers 188.txt,MountfortWilliam,1688,The injur'd lovers 188.txt,"[, , discovers the king lying on a couch; afte..."
928,PhilipsWilliam-1698-The revengeful queen214.txt,PhilipsWilliam,1698,The revengeful queen214.txt,"[, , , , , how has longinus dar'd to offer thi..."
929,WilliamShakespeare-1593-Titus Andronicus.txt,WilliamShakespeare,1593,Titus Andronicus.txt,"[, , , , andronicus and senators aloft, and t..."
930,ShadwellThomas-1668-The sullen lovers o280.txt,ShadwellThomas,1668,The sullen lovers o280.txt,"[, , , in what unlucky minute was i born, , to..."


# Functions for part-of-speech tagging

In [68]:
def tag_pos(text):
    """
    Tag the words of a text according to their part of speech.

    The text is tokenized with word_tokenize and the tokens are re-joined
    with single spaces (which also eliminates extra spaces) before the
    result is handed to the module-level ``tagger``. Whatever
    ``tagger.tag_text`` returns is passed back unchanged.
    """
    normalized_text = " ".join(word_tokenize(text))
    return tagger.tag_text(normalized_text)


def part_of_speech(input1):
    """
    Run tag_pos on every sentence in ``input1`` and return all tagging
    results concatenated into one flat list.

    Side effects: increments the module-level ``apply_counter1`` by one
    per call and prints the new value as a progress indicator, so the
    counter must be defined before this function is called.
    """
    global apply_counter1
    tagged_sentences = [
        tagged_item for sentence in input1 for tagged_item in tag_pos(sentence)
    ]
    apply_counter1 += 1
    print(apply_counter1)
    return tagged_sentences

# Open tagger


<div class="alert alert-warning">Clarify how tree-tagger should be downloaded for the 3 main OS.</div>

In [None]:
from sys import platform
import treetaggerwrapper as ttpw


# Select the bundled TreeTagger installation that matches the current
# operating system and open the tagger on it.
if platform == "darwin":
    tag_dir = "./osfstorage-archive/Pipeline2.0 Frequencies/tree-tagger-MacOSX-3.2.3"
elif platform.startswith("win"):
    # sys.platform is "win32" on Windows, never plain "win", so an equality
    # test against "win" sent Windows users to the "Unknown platform" branch.
    tag_dir = "./osfstorage-archive/Pipeline2.0 Frequencies/tree-tagger-windows-3.2.2/TreeTagger"
elif platform == "linux":
    raise Exception("No tree-tagger available for platform " + platform)
else:
    raise Exception("Unknown platform:" + platform)

tagger = ttpw.TreeTagger(
    TAGLANG="en",
    TAGDIR=tag_dir,
)

# Tag the words of every sentence with their part of speech (POS)

In [None]:
df3 = df2.copy()

# Progress counter that part_of_speech increments and prints once per row;
# it has to exist before the apply below runs.
apply_counter1 = 0

# Replace each list of sentence pieces with the flat list of tagger output.
df3['text'] = df3['text'].apply(part_of_speech)

# Save the tagged corpus to disk.
df3.to_csv('1.1_output_preprocessed.csv', encoding='utf-8-sig')

df3.head()

## Function to filter words

In [None]:
def filter_words(sentence):
    """
    Reduce a list of tagger output lines to the lemmas worth keeping.

    Each item is expected to be a tab-separated string whose second field
    is the part-of-speech tag and whose third field is the lemma. A lemma is
    kept when it is not in ``stop_words``, is longer than two characters, and
    either its tag starts with "N" or "J" (presumably nouns and adjectives in
    the tagger's tag set -- confirm) or it is one of the love-related
    keywords.

    Items with fewer than three tab-separated fields are skipped instead of
    raising IndexError.
    NOTE(review): the tagger wrapper appears to emit such single-field
    entries for replaced tokens (e.g. URLs) -- confirm against its docs.

    Side effects: increments the module-level ``apply_counter2`` by one per
    call and prints the new value as a progress indicator.

    Returns the kept lemmas in their original order, duplicates included.
    """
    global apply_counter2

    keyword_lemmas = ["love", "lover", "loving", "beloved"]
    kept_pos_initials = ["N", "J"]

    filtered_list = []
    for item in sentence:
        fields = item.split("\t")
        if len(fields) < 3:
            # No lemma column available, so there is nothing to filter on.
            continue

        lemma = fields[2]
        # Only the first letter of the tag decides the word class.
        pos = fields[1][:1]

        if (lemma not in stop_words) and (len(lemma) > 2):
            if (pos in kept_pos_initials) or (lemma in keyword_lemmas):
                filtered_list.append(lemma)

    apply_counter2 += 1

    print(apply_counter2)

    return filtered_list


# apply function
df4 = df3.copy()
# Progress counter that filter_words increments and prints once per row;
# it has to exist before the apply below runs.
apply_counter2 = 0
# Replace each list of tagger output lines with the list of kept lemmas.
df4['text'] = df4['text'].apply(filter_words)

## Clean filtered dataset

In [None]:
# No further cleaning is applied in this cell; df5 is a plain copy of df4.
df5 = df4.copy()
df5.head()

## Save to disk

as `1.2_output_filtered.csv`

In [24]:
# Write the filtered dataframe to CSV; the utf-8-sig codec prepends a byte order mark.
df5.to_csv('1.2_output_filtered.csv', encoding='utf-8-sig')