# 1 - Data Preprocessing

This notebook reads plain-text files of Paris Review articles, preprocesses them, and exports them as a Bag-of-Words vector representation for subsequent topic modelling in Notebook 2.

In [1]:
from pathlib import Path
import re
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

Settings:

In [2]:
# --- Input / output locations ---
# Root data folder: raw articles live in its "raw" subfolder, processed output
# goes to its "proc" subfolder.
dir_data = Path("../data")
dir_raw = dir_data.joinpath("raw")
dir_out = dir_data.joinpath("proc")
# Prefix of the output file name
out_prefix = "paris"
# Auxiliary input files (bare file names, resolved against the working directory)
stoplist_file, lem_file = "stopwords.txt", "lemmatization-en.txt"

# --- Filtering thresholds ---
# A term (word) must have at least this many characters
min_term_length = 2
# A term must occur in at least this many documents
min_df = 10
# A document must have at least this many characters
min_doc_length = 80

# --- Vectorization options ---
# Largest n-gram size (1 means unigrams only)
max_ngram = 1
# Whether to apply TF-IDF weighting and vector normalization
apply_tfidf, apply_norm = True, True

## Data Loading

Find all relevant text files, where each file corresponds to a Paris Review article:

In [3]:
# Recursively collect every .txt file below the raw data directory, in sorted order
file_paths = sorted(dir_raw.glob('**/*.txt'))
print("Found %d text files to preprocess" % len(file_paths))

Found 404 text files to preprocess


Read the input files:

In [4]:
# Loose URL matcher: "http" or "https", an optional ":" or ";", up to two
# slashes, then everything up to the next whitespace character.
# Written as a raw string so that "\S" reaches the regex engine untouched
# instead of being treated as an (invalid) string escape sequence.
url_pattern = re.compile(r'https?[:;]?/?/?\S*')

def read_text(in_path):
    """Read a text file and return its cleaned-up body.

    Each line is stripped of surrounding whitespace and has URLs removed.
    Lines with at most one character left after that are dropped. The
    surviving lines are returned joined together, each terminated by a
    newline character.

    Parameters
    ----------
    in_path : str or path-like
        Location of the UTF-8 text file. Bytes that cannot be decoded are
        silently ignored.

    Returns
    -------
    str
        The cleaned text; an empty string if no line survives.
    """
    kept_lines = []
    with open(in_path, 'r', encoding="utf8", errors='ignore') as fin:
        for line in fin:
            # Remove URIs at this point
            normalized_line = url_pattern.sub('', line.strip())
            if len(normalized_line) > 1:
                kept_lines.append(normalized_line + "\n")
    # Join once at the end rather than growing a string line by line
    return "".join(kept_lines)

In [5]:
# Read every input file, keeping only documents whose cleaned body is at least
# min_doc_length characters long. documents[i] and document_ids[i] refer to
# the same file.
documents, document_ids = [], []
num_short_documents = 0
for in_path in file_paths:
    # create the document ID (the file path, as a string)
    doc_id = str(in_path)
    # process the body text
    body = read_text(in_path)
    if len(body) < min_doc_length:
        num_short_documents += 1
        continue
    documents.append(body)
    document_ids.append(doc_id)
print("Kept %d documents. Skipped %d documents with length < %d" % 
    (len(documents), num_short_documents, min_doc_length))

Kept 404 documents. Skipped 0 documents with length < 80


Load stopwords:

In [6]:
# Read one stopword per line, lowercased, skipping blank lines.
# The encoding is given explicitly, as for the other input files read in this
# notebook, so the result does not depend on the platform's default encoding.
stopwords = []
with open(stoplist_file, 'r', encoding="utf8", errors='ignore') as fin:
    for line in fin:
        word = line.strip().lower()
        if word:
            stopwords.append(word)
print("Using %d stopwords from %s" % (len(stopwords), stoplist_file))

Using 610 stopwords from stopwords.txt


Load lemmatization dictionary and apply it to our stopwords:

In [7]:
class DictLemmatizer:
    """Dictionary-based lemmatizer backed by a tab-separated lookup file.

    Each usable line of the file holds the lemma in its first column and an
    inflected form in its second column. Both are lowercased when loaded and
    stored in ``term_map`` as ``form -> lemma``.
    """

    def __init__(self, in_path, min_length=None):
        """Load the lookup table from ``in_path``.

        Parameters
        ----------
        in_path : str or path-like
            Location of the tab-separated dictionary file.
        min_length : int, optional
            Minimum number of characters that both the form and the lemma
            must have for an entry to be kept. When omitted, the notebook's
            global ``min_term_length`` setting is used.
        """
        if min_length is None:
            min_length = min_term_length
        self.term_map = {}
        # "utf-8-sig" reads plain UTF-8 and additionally drops a leading
        # byte-order mark, if present, so it cannot end up glued to the
        # first lemma in the file.
        with open(in_path, 'r', encoding="utf-8-sig", errors='ignore') as fin:
            for line in fin:
                parts = line.strip().lower().split("\t")
                # Skip blank or malformed lines that lack a second column
                if len(parts) < 2:
                    continue
                stem = parts[0].strip()
                term = parts[1].strip()
                if len(term) >= min_length and len(stem) >= min_length:
                    self.term_map[term] = stem

    def apply(self, s):
        """Return the lemma recorded for ``s``, or ``s`` itself if it is unknown."""
        return self.term_map.get(s, s)

In [8]:
print("Loading lemmatization dictionary from %s ..." % lem_file)
lemmatizer = DictLemmatizer(lem_file)
# Map every stopword to its lemma; the set collapses stopwords that share one
extra_stopwords = {lemmatizer.apply(word) for word in stopwords}
stopwords = list(extra_stopwords)
print("Using %d stopwords after lemmatization" % len(stopwords))

Loading lemmatization dictionary from lemmatization-en.txt ...
Using 498 stopwords after lemmatization


## Bag-of-Words Preprocessing

Define our word tokenizer:

In [9]:
# A token is a run of two or more word characters delimited by word boundaries
token_pattern = re.compile(r"\b\w\w+\b", re.U)

def custom_tokenizer(s):
    """Split ``s`` into lowercased tokens of at least ``min_term_length`` characters."""
    return [x.lower() for x in token_pattern.findall(s) if len(x) >= min_term_length ]

def unigram_tokenizer(s):
    """Tokenize ``s`` and map every token to its lemma.

    The text is lowercased and split with ``custom_tokenizer``. Each token is
    then passed through the global ``lemmatizer``; lemmas shorter than
    ``min_term_length`` are discarded. When ``lemmatizer`` is ``None`` the
    plain tokens are returned unchanged.

    Returns
    -------
    list of str
    """
    tokens = custom_tokenizer(s.lower())
    if lemmatizer is None:
        # No lemmatizer configured: return the tokens as they are, not None,
        # so callers always receive a list
        return tokens
    lemmas = (lemmatizer.apply(token) for token in tokens)
    return [lemma for lemma in lemmas if len(lemma) >= min_term_length]

Convert the documents to a vector representation:

In [10]:
print("Preprocessing data (%d stopwords, tfidf=%s, normalize=%s, min_df=%d, max_ngram=%d) ..." % 
    (len(stopwords), apply_tfidf, apply_norm, min_df, max_ngram))

# L2-normalize each document vector only when requested
norm_function = "l2" if apply_norm else None

# build the Vector Space Model, with optional IDF weighting and normalization, all in one call
tfidf = TfidfVectorizer(
    stop_words=stopwords,
    lowercase=True,
    strip_accents="unicode",
    token_pattern=None,
    tokenizer=unigram_tokenizer,
    use_idf=apply_tfidf,
    norm=norm_function,
    min_df=min_df,
    ngram_range=(1, max_ngram),
)
X = tfidf.fit_transform(documents)

# invert the vocabulary map (term -> column index) into a list of terms
# ordered by column index, so that terms[j] labels column j of X
v = tfidf.vocabulary_
terms = sorted(v, key=v.get)

print("Built document-term matrix: %d documents, %d terms" % (X.shape[0], X.shape[1]))

Preprocessing data (498 stopwords, tfidf=True, normalize=True, min_df=10, max_ngram=1) ...
Built document-term matrix: 404 documents, 7061 terms


Save the preprocessed data in binary format:

In [11]:
# Make sure the output directory exists before writing into it
dir_out.mkdir(parents=True, exist_ok=True)
fname = "%s.pkl" % out_prefix
out_path = dir_out.joinpath(fname)
print("Saving data to %s" % out_path)
# Store the matrix, the term list and the document IDs together as one tuple
joblib.dump((X, terms, document_ids), out_path)

Saving data to ../data/proc/paris.pkl


['../data/proc/paris.pkl']