In [56]:
import string
import os
import regex as re

from lxml import etree
from operator import itemgetter

from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import SnowballStemmer
from nltk import sent_tokenize
from nltk import pos_tag

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

Here are some utility functions:

In [12]:
def identity(arg):
    """
    Return the argument unchanged.

    Handy as a no-op callable wherever an API insists on a function, e.g. as
    a tokenizer for documents that are already lists of tokens.
    """
    return arg

def show_most_informative_features(vectorizer, classifier, text=None, n=20):
    """
    Build a two-column report of the n highest and n lowest weighted features.

    Parameters
    ----------
    vectorizer : fitted vectorizer exposing ``transform`` and either
        ``get_feature_names_out`` or ``get_feature_names``.
    classifier : fitted linear model exposing ``coef_`` and ``predict``.
    text : optional document. When given, it is vectorized and the report
        ranks the features by their weight in that document's vector (not by
        the model coefficients); the prediction for the document is prepended.
        The document must be in the form the vectorizer expects.
    n : number of features to show in each column.

    Returns
    -------
    str : one line per feature pair, left column holding the largest values
        and right column the smallest.

    Raises
    ------
    TypeError : if the classifier has no ``coef_`` attribute.

    Only the first row of the weight matrix is used.
    """

    # Check to make sure that we can perform this computation
    if not hasattr(classifier, 'coef_'):
        raise TypeError(
            "Cannot compute most informative features on {} model.".format(
                classifier.__class__.__name__
            )
        )

    if text is not None:
        # Vectorize once; the same matrix feeds both the ranking and predict.
        # (The vectorizer, not the classifier, knows how to transform text.)
        text_matrix = vectorizer.transform([text])
        tvec = text_matrix.toarray()
    else:
        # Otherwise simply use the model coefficients
        text_matrix = None
        tvec = classifier.coef_

    # get_feature_names() was removed from recent scikit-learn releases;
    # prefer the replacement when the vectorizer provides it.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Pair each weight with its feature name, largest weight first
    coefs = sorted(
        zip(tvec[0], feature_names),
        key=itemgetter(0), reverse=True
    )

    # n largest paired with n smallest (the reversed slice walks from the end)
    topn = zip(coefs[:n], coefs[:-(n+1):-1])

    # Create the output string to return
    output = []

    # If text, add the predicted value to the output.
    if text is not None:
        output.append("\"{}\"".format(text))
        output.append(
            "Classified as: {}".format(classifier.predict(text_matrix))
        )
        output.append("")

    # Two columns: most positive features on the left, most negative on the right
    for (cp, fnp), (cn, fnn) in topn:
        output.append(
            "{:0.4f}{: >15}    {:0.4f}{: >15}".format(cp, fnp, cn, fnn)
        )

    return "\n".join(output)

Class for Corpus preprocessing:

In [19]:
# First-person pronouns that are removed from the stopword list, so they
# survive preprocessing as tokens (presumably because they carry signal for
# this task -- confirm with the author).
swDiff = {'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves'}

class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """
    Turn raw documents into lists of normalized tokens.

    Each token is optionally lower-cased and stripped, dropped if it is a
    stopword or consists only of punctuation characters, and finally either
    lemmatized (WordNet) or stemmed (Snowball), depending on the ``method``
    passed to ``transform`` / ``tokenize``.
    """

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        """
        stopwords : iterable of words to drop; defaults to NLTK's English
            list. The entries of ``swDiff`` are always removed from it.
        punct : collection of punctuation characters; defaults to
            ``string.punctuation``.
        lower : lower-case every token when true.
        strip : strip whitespace, then '_', then '*' from token ends when true.
        """
        self.lower      = lower
        self.strip      = strip
        # Build a fresh set so a caller-supplied collection is never mutated
        # (and so lists / frozensets are accepted as well).
        self.stopwords  = set(stopwords or sw.words('english')) - swDiff
        self.punct      = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = SnowballStemmer(language='english')

    def fit(self, X, y=None):
        """Nothing to learn; returns self."""
        return self

    def inverse_transform(self, X):
        """Join each token list back into a single space-separated string."""
        return [" ".join(doc) for doc in X]

    def transform(self, X, method='lem'):
        """
        Tokenize every document in X.

        Returns a list with one token list per document. ``method`` is
        forwarded to ``tokenize``.
        """
        return [
            list(self.tokenize(doc, method)) for doc in X
        ]

    def tokenize(self, document, method='lem'):
        """
        Yield the normalized tokens of a single document.

        method='lem'  : sentence-split, POS-tag and lemmatize each token.
        method='stem' : tokenize the whole document and stem each token.

        Raises ValueError for any other method. Because this is a generator,
        the error surfaces when iteration starts, not at call time.
        """
        if method == 'lem':
            # Break the document into sentences
            for sent in sent_tokenize(document):
                # Break the sentence into part of speech tagged tokens
                for token, tag in pos_tag(wordpunct_tokenize(sent)):
                    # Apply preprocessing to the token
                    token = self.process_token(token)
                    if not self.is_valid_token(token):
                        continue

                    # The POS tag selects the WordNet lemma to use
                    yield self.lemmatize(token, tag)

        elif method == 'stem':
            # Stemming needs no POS tags, so sentence splitting is skipped
            for token in wordpunct_tokenize(document):
                # Apply preprocessing to the token
                token = self.process_token(token)
                if not self.is_valid_token(token):
                    continue

                yield self.stem(token)
        else:
            raise ValueError('Unknown method type.')

    def lemmatize(self, token, tag):
        """
        Lemmatize a token, choosing the WordNet part of speech from the first
        letter of ``tag``; unknown tags are treated as nouns.
        """
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)

    def stem(self, token):
        """Return the Snowball stem of the token."""
        return self.stemmer.stem(token)

    def process_token(self, token):
        """Apply the configured lower-casing and stripping to one token."""
        if self.lower:
            token = token.lower()
        if self.strip:
            # Sequential strips, in this order: whitespace, then '_', then '*'
            token = token.strip().strip('_').strip('*')
        return token

    def is_valid_token(self, token):
        """
        Return False for stopwords and for tokens made up solely of
        punctuation characters (which includes the empty string).
        """
        # If stopword, token is invalid
        if token in self.stopwords:
            return False

        # If punctuation, token is invalid
        if all(char in self.punct for char in token):
            return False

        return True

This part of the code loads the data corpus from multiple files into the lists X_raw (texts) and y (labels), with one entry per user:

In [20]:
cwd = os.getcwd()
POSITIVE_PATH = os.path.join(cwd, "positive_examples_anonymous")
NEGATIVE_PATH = os.path.join(cwd, "negative_examples_anonymous")
LABELS_PATH = os.path.join(cwd, 'risk_golden_truth.txt')
# Posts containing this marker are skipped entirely (see the loop below)
IMAGE_STR = 'data:image'

# Each non-empty line of the labels file is "<user id> <label>"
label_dict = {}
with open(LABELS_PATH, 'r') as labels_file:
    for line in labels_file:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        xml_file, label = fields
        label_dict[xml_file] = label

X_raw = []
y = []

for examples_path in (POSITIVE_PATH, NEGATIVE_PATH):
    # scandir yields entries in arbitrary order; sort by file name so the
    # order of X_raw / y is the same on every run and every machine.
    for entry in sorted(os.scandir(examples_path), key=lambda e: e.name):
        root = etree.parse(entry.path).getroot()
        # The first child element of the root holds the user id
        userId = root[0].text

        posts = []
        for node in root.findall('.//TITLE') + root.findall('.//TEXT'):
            # An empty element has text None; treat it like an empty post
            post = (node.text or '').strip()
            if post == '' or IMAGE_STR in post:
                continue
            post = re.sub(r"http\S+", " ", post)   # URLs
            post = re.sub(r"\d+", " ", post)       # digit runs
            post = re.sub(u"\xa0", " ", post)      # non-breaking spaces
            post = re.sub(u"\\p{P}+", " ", post)   # Unicode punctuation
            posts.append(post)

        # One document per user: every kept post preceded by a single space
        userText = ''.join(' ' + post for post in posts)
        X_raw.append(userText)
        y.append(int(label_dict[userId]))

We pass the X_raw list to the NLTKPreprocessor class, which outputs a list of preprocessed, tokenized texts:

In [97]:
# Default settings: English stopwords, lower-casing and stripping enabled
preprocessor = NLTKPreprocessor()
# method='stem' selects the stemming branch of tokenize() instead of the
# default lemmatization; the result is one list of tokens per user document
X_prep = preprocessor.transform(X_raw, method='stem')

We use a tf-idf vectorizer to build vector representations of the documents:

In [98]:
# The documents are already token lists, so tokenization is a passthrough
# and the vectorizer's own lower-casing is switched off.
vect = TfidfVectorizer(
    tokenizer=identity,
    preprocessor=None,
    lowercase=False,
    ngram_range=(1, 2),   # unigrams and bigrams
    min_df=20,            # drop terms found in fewer than 20 documents
)
X = vect.fit_transform(X_prep, y)

In [99]:
#print(vect.get_feature_names())

Building and evaluating the model:

In [149]:
print("Building for evaluation")

# Fixed seed: without it both the stratified split and SGD's shuffling change
# on every run, so the report below could not be reproduced.
EVAL_SEED = 42

# Hold out 20% of the users, keeping the class ratio equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=EVAL_SEED
)

model = SGDClassifier(random_state=EVAL_SEED)
model.fit(X_train, y_train)

print("Evaluation model fit")
print("Classification Report:\n")

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

Building for evaluation
Evaluation model fit
Classification Report:

             precision    recall  f1-score   support

          0       0.90      0.98      0.93        81
          1       0.80      0.47      0.59        17

avg / total       0.88      0.89      0.88        98





Building the complete model on the whole dataset:

In [150]:
# Seeded so the fitted coefficients (and any feature ranking derived from
# them) are the same on every run.
model_complete = SGDClassifier(random_state=42)
model_complete.fit(X, y)

print("Complete model fit")

Complete model fit




Most informative features:

In [151]:
# No text argument, so the features are ranked by the model's coefficients:
# left column holds the largest values, right column the smallest.
print(show_most_informative_features(vect, model_complete))

6.4508        depress    -4.3481           look
4.7115           feel    -4.2268             we
3.6593         someon    -4.0673            car
3.6370           help    -4.0299           book
3.3272              i    -3.9982           movi
3.2043           hero    -3.3990            new
3.1567         spider    -3.2930           read
2.8434          peopl    -3.2096          write
2.8132           work    -3.1326          berni
2.8077             me    -3.1311          great
2.7300         person    -3.0551           song
2.5004          arrow    -2.8989            see
2.4717        anxieti    -2.6558        favorit
2.4227            tri    -2.4534            use
2.4058          medic    -2.4193          place
2.3212           girl    -2.4040   anti depress
2.2551            med    -2.3842            man
2.2367         friend    -2.3761         sander
2.1843         gluten    -2.2020           made
2.1814         myself    -2.1590          would
