Class for corpus preprocessing:

In [2]:
import string

from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import sent_tokenize
from nltk import pos_tag

from sklearn.base import BaseEstimator, TransformerMixin

# Words that NLTKPreprocessor.__init__ removes from its stopword set (via
# difference_update), so these first-person pronouns are NOT filtered out as
# stopwords during tokenization.
swDiff = {'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves'}


class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """Transformer that turns raw documents into lists of lemmatized tokens.

    Each document is split into sentences, every sentence is tokenized and
    part-of-speech tagged, and each token is optionally lower-cased and
    stripped, dropped if it is a stopword or consists only of punctuation
    characters, and finally lemmatized with WordNet.
    """

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        """Store the configuration and build the lemmatizer.

        stopwords -- set of tokens to discard; a falsy value selects NLTK's
                     English stopword list. The words in ``swDiff`` are
                     removed from this set IN PLACE, so a set passed in by
                     the caller is modified.
        punct     -- collection of punctuation characters; a falsy value
                     selects ``string.punctuation``.
        lower     -- lower-case every token when true.
        strip     -- strip whitespace, then '_', then '*' from both ends of
                     every token when true.
        """
        self.lower = lower
        self.strip = strip

        if not stopwords:
            stopwords = set(sw.words('english'))
        self.stopwords = stopwords
        self.stopwords.difference_update(swDiff)

        if not punct:
            punct = set(string.punctuation)
        self.punct = punct

        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """Do nothing and return the transformer itself."""
        return self

    def inverse_transform(self, X):
        """Join each token list in X back into one space-separated string."""
        return list(map(" ".join, X))

    def transform(self, X):
        """Return one list of processed tokens per document in X."""
        processed = []
        for document in X:
            processed.append(list(self.tokenize(document)))
        return processed

    def tokenize(self, document):
        """Yield the lemmatized, filtered tokens of a single document."""
        for sentence in sent_tokenize(document):
            # Tag on the raw tokens, before any normalization is applied.
            tagged_words = pos_tag(wordpunct_tokenize(sentence))
            for word, pos in tagged_words:
                if self.lower:
                    word = word.lower()
                if self.strip:
                    word = word.strip()
                if self.strip:
                    word = word.strip('_')
                if self.strip:
                    word = word.strip('*')

                # Stopwords are skipped outright.
                if word in self.stopwords:
                    continue

                # So are tokens made up only of punctuation characters; a
                # token stripped down to '' is skipped here as well, because
                # all() over an empty string is True.
                only_punct = all(ch in self.punct for ch in word)
                if only_punct:
                    continue

                yield self.lemmatize(word, pos)

    def lemmatize(self, token, tag):
        """Lemmatize token using the WordNet POS derived from the tag.

        Only the first character of the tag is inspected; anything other
        than 'N', 'V', 'R' or 'J' is treated as a noun.
        """
        initial = tag[0]
        if initial == 'V':
            wordnet_pos = wn.VERB
        elif initial == 'R':
            wordnet_pos = wn.ADV
        elif initial == 'J':
            wordnet_pos = wn.ADJ
        else:
            # Covers 'N' as well as every unrecognized tag.
            wordnet_pos = wn.NOUN

        return self.lemmatizer.lemmatize(token, wordnet_pos)

This part of the code loads the data corpus from multiple files into the lists X (texts) and y (labels), with one entry per user:

In [6]:
from lxml import etree
import os
import re

# Locations of the corpus directories and the label file, all resolved
# relative to the current working directory.
cwd = os.getcwd()
POSITIVE_PATH = os.path.join(cwd, "positive_examples_anonymous")
NEGATIVE_PATH = os.path.join(cwd, "negative_examples_anonymous")
LABELS_PATH = os.path.join(cwd, 'risk_golden_truth.txt')
# Posts containing this marker are skipped by the corpus loading loop.
IMAGE_STR = 'data:image'

# Map each identifier in the label file to its label string. Every non-blank
# line must hold exactly two whitespace-separated fields: identifier, label.
label_dict = {}
# The with-block closes the file even if a malformed line raises.
with open(LABELS_PATH, 'r') as labels_file:
    for line in labels_file:
        fields = line.split()
        if not fields:
            # Blank line (e.g. trailing newline at end of file): nothing to read.
            continue
        # split() with no argument also drops the trailing newline and copes
        # with tabs or repeated spaces between the two fields.
        xml_file, label = fields
        label_dict[xml_file] = label

# X collects one concatenated text per user file, y the matching integer label.
X = []
y = []

for listOfEntries in [os.scandir(POSITIVE_PATH), os.scandir(NEGATIVE_PATH)]:
    for entry in listOfEntries:
        root = etree.parse(entry.path).getroot()
        # The text of the root's first child is the key used to look up the
        # label (presumably the user's ID element -- confirm against the data).
        userId = root[0].text

        userPosts = []
        # All TITLE elements are visited first, then all TEXT elements.
        for post in root.findall('.//TITLE') + root.findall('.//TEXT'):
            # An element without text content has .text == None; treat it as
            # an empty post instead of failing on None.strip().
            post = (post.text or '').strip()
            if not post or IMAGE_STR in post:
                continue
            # Drop URLs, then keep the post prefixed by a single space so the
            # joined text matches the ' ' + post concatenation format.
            userPosts.append(' ' + re.sub(r"http\S+", "", post))
        userText = ''.join(userPosts)

        X.append(userText)
        y.append(int(label_dict[userId]))

We use the list X as input to the NLTKPreprocessor class, which outputs a list of preprocessed, tokenized texts:

In [None]:
# Build a preprocessor with its default arguments (no explicit stopwords or
# punctuation, lower=True, strip=True).
preprocessor = NLTKPreprocessor()
# Tokenize every text in X; the result is a list with one token list per
# document. The return value is not assigned to a name here.
preprocessor.transform(X)