In [40]:
import string
import os
import regex as re
import numpy as np

from lxml import etree
from operator import itemgetter

from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import SnowballStemmer
from nltk import sent_tokenize
from nltk import pos_tag

from sklearn.preprocessing import MaxAbsScaler, StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import VotingClassifier

from scipy.sparse import vstack, hstack, coo_matrix

from empath import Empath

Here are some utility functions:

In [2]:
def identity(arg):
    """
    Pass-through helper: hands the argument back untouched.

    The vectorizers in this notebook receive it as their ``tokenizer`` so
    that documents which are already token lists are not tokenized again.
    """
    return arg

def show_most_informative_features(vectorizer, classifier, text=None, n=20):
    """
    Render the n highest- and n lowest-weighted features as a two-column table.

    Parameters
    ----------
    vectorizer : fitted vectorizer exposing ``get_feature_names_out()`` (or the
        older ``get_feature_names()``) and ``transform()``.
    classifier : fitted linear model exposing ``coef_`` and ``predict()``.
    text : optional single document in the form the vectorizer expects. When
        given, the document's own feature weights (its vectorized row) are
        ranked instead of the model coefficients, and the predicted label is
        added to the output.
    n : number of features to show per column.

    Returns
    -------
    str : one line per feature pair, preceded by the text and its predicted
        label when ``text`` is given.

    Raises
    ------
    TypeError : if the classifier has no ``coef_`` attribute.
    """

    # Check to make sure that we can perform this computation
    if not hasattr(classifier, 'coef_'):
        raise TypeError(
            "Cannot compute most informative features on {} model.".format(
                classifier.__class__.__name__
            )
        )

    text_vec = None
    if text is not None:
        # The vectorizer (not the classifier) turns a document into features
        text_vec = vectorizer.transform([text])
        tvec = text_vec.toarray()
    else:
        # Otherwise simply use the coefficients
        tvec = classifier.coef_

    # get_feature_names() was removed from newer scikit-learn releases
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    # Zip the feature names with the weights and sort, largest weight first.
    # zip() stops at the shorter input, so weights without a matching
    # feature name are ignored.
    coefs = sorted(
        zip(tvec[0], feature_names),
        key=itemgetter(0), reverse=True
    )

    # Pair the i-th largest weight with the i-th smallest weight
    topn = zip(coefs[:n], coefs[:-(n+1):-1])

    # Create the output string to return
    output = []

    # If text, add the predicted value to the output.
    if text is not None:
        output.append("\"{}\"".format(text))
        output.append("Classified as: {}".format(classifier.predict(text_vec)))
        output.append("")

    # Left column: largest weights; right column: smallest weights.
    for (cp, fnp), (cn, fnn) in topn:
        output.append(
            "{:0.4f}{: >15}    {:0.4f}{: >15}".format(cp, fnp, cn, fnn)
        )

    return "\n".join(output)

Class for Corpus preprocessing:

In [3]:
# First-person pronouns that are removed from the stopword set, so they
# survive preprocessing as ordinary tokens.
sw_diff = {'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves'}

class NLTKPreprocessor(BaseEstimator, TransformerMixin):
    """
    Transformer that turns raw documents into lists of normalized tokens.

    Each token is optionally lower-cased and stripped, dropped when it is a
    stopword or consists only of punctuation characters, and then either
    lemmatized (WordNet, using the POS tag) or stemmed (Snowball, English).

    Parameters
    ----------
    stopwords : iterable of str, optional
        Tokens to drop. Defaults to NLTK's English stopword list. The entries
        of ``sw_diff`` are always removed from the set that is used.
    punct : collection of str, optional
        Characters regarded as punctuation. Defaults to ``string.punctuation``.
    lower : bool
        Lower-case every token.
    strip : bool
        Strip surrounding whitespace, then ``_``, then ``*`` from every token.
    """

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        self.lower      = lower
        self.strip      = strip
        # Work on a private copy so removing the pronouns never mutates a
        # collection owned by the caller.
        self.stopwords  = set(stopwords) if stopwords else set(sw.words('english'))
        self.stopwords.difference_update(sw_diff)
        self.punct      = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = SnowballStemmer(language='english')

    def fit(self, X, y=None):
        """No-op: nothing is learned from the data. Returns self."""
        return self

    def inverse_transform(self, X):
        """Join every token list back into one space-separated string."""
        return [" ".join(doc) for doc in X]

    def transform(self, X, method='lem'):
        """
        Tokenize every document in X.

        ``method`` is 'lem' (lemmatize) or 'stem' (stem); any other value
        raises ValueError. Returns a list with one token list per document.
        """
        return [
            list(self.tokenize(doc, method)) for doc in X
        ]

    def tokenize(self, document, method='lem'):
        """
        Generator over the normalized tokens of a single document.

        Raises ValueError (on first iteration) for an unknown ``method``.
        """
        if method == 'lem':
            # Break the document into sentences
            for sent in sent_tokenize(document):
                # Break the sentence into part of speech tagged tokens
                for token, tag in pos_tag(wordpunct_tokenize(sent)):
                    # Apply preprocessing to the token
                    token = self.process_token(token)
                    if not self.is_valid_token(token):
                        continue

                    # Lemmatize the token and yield
                    yield self.lemmatize(token, tag)

        elif method == 'stem':
            # Break the document into tokens
            for token in wordpunct_tokenize(document):
                # Apply preprocessing to the token
                token = self.process_token(token)
                if not self.is_valid_token(token):
                    continue

                yield self.stem(token)
        else:
            raise ValueError('Unknown method type.')

    def lemmatize(self, token, tag):
        """Lemmatize ``token``, mapping the POS tag's first letter to a WordNet class."""
        # Any tag not starting with N/V/R/J is treated as a noun
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)

    def stem(self, token):
        """Return the Snowball stem of ``token``."""
        return self.stemmer.stem(token)

    def process_token(self, token):
        """Apply the configured lower-casing and stripping to one token."""
        if self.lower:
            token = token.lower()
        if self.strip:
            # Same order as before: whitespace first, then '_' and '*'
            token = token.strip().strip('_').strip('*')
        return token

    def is_valid_token(self, token):
        """Return False for stopwords and for tokens made only of punctuation."""
        # If stopword, token is invalid
        if token in self.stopwords:
            return False

        # If punctuation, token is invalid (also true for the empty string,
        # because all() of an empty sequence is True)
        if all(char in self.punct for char in token):
            return False

        return True

This part of the code loads the data corpus from multiple files into the lists X (texts) and y (labels), with one entry per user:

In [4]:
def read_entries(X, y, path_list, label_dict=None, default_label=0):
    """
    Append one cleaned text and one label per XML file to X and y (in place).

    Parameters
    ----------
    X : list that receives one concatenated text per file.
    y : list that receives one integer label per file.
    path_list : directories whose entries are parsed as XML files.
    label_dict : optional mapping from user id (text of the root's first
        child) to a label convertible with int(). When empty or None,
        ``default_label`` is used for every file.
    default_label : label used when no ``label_dict`` is given.

    For every file the texts of all TITLE elements, then all TEXT elements,
    are cleaned (URLs, digit runs, non-breaking spaces and punctuation runs
    become a single space each) and concatenated, each preceded by a space.
    Posts that are empty or contain 'data:image' are skipped.
    """
    entry_lists = [os.scandir(path) for path in path_list]

    IMAGE_STR = 'data:image'

    for list_of_entries in entry_lists:
        for entry in list_of_entries:
            root = etree.parse(entry.path).getroot()
            user_id = root[0].text

            fragments = []
            for post in root.findall('.//TITLE') + root.findall('.//TEXT'):
                # .text is None for an element without content
                post = (post.text or '').strip()
                if post == '' or IMAGE_STR in post:
                    continue
                post = re.sub(r"http\S+", " ", post)
                post = re.sub(r"\d+", " ", post)
                post = re.sub(u"\xa0", " ", post)
                # \p{P} needs the third-party `regex` module, which this
                # notebook imports under the name `re`
                post = re.sub(u"\\p{P}+", " ", post)
                fragments.append(' ' + post)

            # Join once instead of growing a string inside the loop
            X.append(''.join(fragments))
            label = int(label_dict[user_id]) if label_dict else default_label
            y.append(label)

Utility methods for extracting features:

In [5]:
def get_avg_sentence_length(sentences):
    """
    Mean number of non-space characters per sentence.

    Returns 0 when ``sentences`` is empty.
    """
    if not sentences:
        return 0
    char_total = 0
    for text in sentences:
        char_total += len(text.replace(' ', ''))
    return char_total / len(sentences)

In [94]:
def read_features(X_times, X_sentence_lengths, X_post_cnt, path_list):
    """
    Append hand-crafted per-user features to the three output lists (in place).

    For every XML file found in the directories of ``path_list``:

    * ``X_sentence_lengths`` receives ``[avg]``, the average sentence length
      (see get_avg_sentence_length) over all non-empty TEXT elements.
    * ``X_post_cnt`` receives ``[count]``, the number of non-empty DATE
      elements.
    * ``X_times`` receives a one-hot list of 8 three-hour buckets marking the
      bucket of the average posting hour. When no date has a parseable hour
      the list stays all zeros.
    """
    entry_lists = [os.scandir(path) for path in path_list]

    for list_of_entries in entry_lists:
        for entry in list_of_entries:
            root = etree.parse(entry.path).getroot()

            sentences = []
            for post in root.findall('.//TEXT'):
                # .text is None for an element without content
                post = (post.text or '').strip()
                if post != '':
                    sentences.extend(sent_tokenize(post))
            avg_sentence_length = get_avg_sentence_length(sentences)

            sum_hours = 0
            post_cnt = 0   # non-empty DATE elements
            hour_cnt = 0   # dates whose hour could be parsed
            for date in root.findall('.//DATE'):
                date = (date.text or '').strip()
                if date == '':
                    continue
                post_cnt += 1
                m = re.match(r'\d{4}-\d{2}-\d{2} (\d{2}).*', date)
                if m is None:
                    # Unexpected date format: counted as a post, but left
                    # out of the average hour
                    continue
                sum_hours += int(m.group(1))
                hour_cnt += 1

            time = [0] * 8
            if hour_cnt:
                avg_hour = sum_hours / hour_cnt
                # Clamp so a malformed hour can never index past the last bucket
                index = min(int(avg_hour // 3), len(time) - 1)
                time[index] = 1

            X_post_cnt.append([post_cnt])
            X_sentence_lengths.append([avg_sentence_length])
            X_times.append(time)

Reading input files:

In [7]:
# All data locations are resolved relative to the current working directory.
cwd = os.getcwd()
TRAIN_PATH = os.path.join(cwd, "reddit-training-ready-to-share")
TEST_PATH = os.path.join(cwd, "reddit-test-data-ready-to-share")

TRAIN_POSITIVE_PATH = os.path.join(TRAIN_PATH, "positive_examples_anonymous")
TRAIN_NEGATIVE_PATH = os.path.join(TRAIN_PATH, "negative_examples_anonymous")

TEST_POSITIVE_PATH = os.path.join(TEST_PATH, "positive_examples_anonymous")
TEST_NEGATIVE_PATH = os.path.join(TEST_PATH, "negative_examples_anonymous")

TRAIN_LABELS_PATH = os.path.join(cwd, 'risk_golden_truth.txt')

IMAGE_STR = 'data:image'

# Each non-blank line of the labels file holds two whitespace-separated
# fields: the key used to look up a user, then that user's label.
train_label_dict = {}
with open(TRAIN_LABELS_PATH, 'r') as train_labels_file:
    for line in train_labels_file:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        xml_file, label = fields
        train_label_dict[xml_file] = label

In [8]:
# Directories to read; the test set is split by class because its labels
# come from the directory, not from the labels file.
train_entry_path_list = [TRAIN_POSITIVE_PATH, TRAIN_NEGATIVE_PATH]
test_pos_entry_path_list = [TEST_POSITIVE_PATH]
test_neg_entry_path_list = [TEST_NEGATIVE_PATH]

# Output containers, filled in place by read_entries
X_train_raw, y_train = [], []
X_test_raw, y_test = [], []

read_entries(
    X=X_train_raw, y=y_train,
    path_list=train_entry_path_list, label_dict=train_label_dict,
)
# Positive test users first, then negative ones
read_entries(X=X_test_raw, y=y_test,
             path_list=test_pos_entry_path_list, default_label=1)
read_entries(X=X_test_raw, y=y_test,
             path_list=test_neg_entry_path_list, default_label=0)

Extracting features:

In [None]:
# Output containers, filled in place by read_features
X_times_train, X_times_test = [], []
X_sentence_len_train, X_sentence_len_test = [], []
X_post_cnt_train, X_post_cnt_test = [], []

read_features(
    X_times=X_times_train,
    X_sentence_lengths=X_sentence_len_train,
    X_post_cnt=X_post_cnt_train,
    path_list=train_entry_path_list,
)

# Same directory order as the text loading: positive test users, then negative
for test_path_list in (test_pos_entry_path_list, test_neg_entry_path_list):
    read_features(
        X_times=X_times_test,
        X_sentence_lengths=X_sentence_len_test,
        X_post_cnt=X_post_cnt_test,
        path_list=test_path_list,
    )

In [64]:
def get_semantic_features(X, categories=None):
    """
    Compute normalized Empath category scores for every text in X.

    Parameters
    ----------
    X : iterable of str
        Raw texts, one per user.
    categories : list of str, optional
        Empath categories to score. Defaults to the hand-picked list below;
        pass e.g. ``['negative_emotion']`` to use a single category.

    Returns
    -------
    list of lists : one row per text, holding the category scores ordered by
        category name. A text for which Empath returns no result gets a row
        of zeros.
    """
    # Author notes on category choice:
    #   not so good: speaking, business
    #   good for LR: negative_emotion, positive_emotion, communication,
    #                violence, business, nervousness, body, pain, sexual,
    #                healing, internet
    #   maybe: government, kill, politics, envy
    if categories is None:
        categories = ['negative_emotion', 'positive_emotion', 'communication',
                      'violence', 'business', 'nervousness', 'body', 'pain',
                      'internet', 'work', 'shame', 'poor']

    lexicon = Empath()
    feature_mat = []
    for text in X:
        d = lexicon.analyze(text, categories=categories, normalize=True)
        if d is None:
            # Empath's analyze() returns None instead of a dict when
            # normalize=True and the text has no tokens; keep the row width.
            feature_mat.append([0.0] * len(set(categories)))
            continue
        # With several features, sort by category name so the column order
        # is the same for every row
        feature_mat.append([d[key] for key in sorted(d.keys(), reverse=False)])
    return feature_mat

In [65]:
# Empath category scores for the raw training and test texts
X_sem_feat_train, X_sem_feat_test = (
    get_semantic_features(raw_texts)
    for raw_texts in (X_train_raw, X_test_raw)
)

We pass the list X to the NLTKPreprocessor class, which outputs a list of preprocessed, tokenized texts:

In [12]:
# 'stem' selects Snowball stemming; 'lem' would select WordNet lemmatization
preprocess_method = 'stem'
preprocessor = NLTKPreprocessor()

X_train_prep, X_test_prep = (
    preprocessor.transform(raw_texts, method=preprocess_method)
    for raw_texts in (X_train_raw, X_test_raw)
)

We use a tf-idf vectorizer for the vector representation of the documents:

In [87]:
# The documents are already token lists, so tokenization and lower-casing
# are switched off; only unigrams found in at least 30 documents are kept.
vect = TfidfVectorizer(
    tokenizer=identity,
    preprocessor=None,
    lowercase=False,
    ngram_range=(1, 1),
    min_df=30,
)
# Fit the vocabulary on the training documents only, then reuse it for the test set
X_train = vect.fit_transform(X_train_prep, y_train)
X_test = vect.transform(X_test_prep)

In [88]:
#print(vect.get_feature_names())

Adding hand-picked features:

In [89]:
# k='all' keeps every tf-idf feature; lower k to actually prune by chi2 score
chi2_selector = SelectKBest(score_func=chi2, k='all')
chi2_selector.fit(X_train, y_train)
X_kbest_train = chi2_selector.transform(X_train)
X_kbest_test = chi2_selector.transform(X_test)

Normalizing and adding features:

In [67]:
# One scaler object is refit for each feature group in turn. Every test
# transform must therefore directly follow the fit on the matching training
# data; after this cell the scaler holds the post-count fit only.
max_abs_scaler = MaxAbsScaler()
# Average sentence length
X_sentence_len_train_scaled = max_abs_scaler.fit_transform(X_sentence_len_train)
X_sentence_len_test_scaled = max_abs_scaler.transform(X_sentence_len_test)

# Empath category scores
X_sem_feat_train_scaled = max_abs_scaler.fit_transform(X_sem_feat_train)
X_sem_feat_test_scaled = max_abs_scaler.transform(X_sem_feat_test)

# Post counts
X_post_cnt_train_scaled = max_abs_scaler.fit_transform(X_post_cnt_train)
X_post_cnt_test_scaled = max_abs_scaler.transform(X_post_cnt_test)

In [91]:
# Column order must be identical for train and test:
# tf-idf terms | time-of-day buckets | sentence length | Empath scores | post count
train_feature_blocks = [
    X_kbest_train,
    X_times_train,
    X_sentence_len_train_scaled,
    X_sem_feat_train_scaled,
    X_post_cnt_train_scaled,
]
test_feature_blocks = [
    X_kbest_test,
    X_times_test,
    X_sentence_len_test_scaled,
    X_sem_feat_test_scaled,
    X_post_cnt_test_scaled,
]

X_train_2 = hstack(train_feature_blocks)
X_test_2 = hstack(test_feature_blocks)

Building and evaluating models:

In [93]:
model = LinearSVC(class_weight='balanced', C=1)

# (heading, metric function) for every reported score
cv_metrics = (
    ("\tAverage precision score:", precision_score),
    ("\tAverage recall score:", recall_score),
    ("\tAverage  F1 score:", f1_score),
)
# (section heading, extra scorer arguments); labels=[1] restricts the
# macro average to the positive class
cv_sections = (
    ("Cross validation for positive label scores:", {'average': 'macro', 'labels': [1]}),
    ("Cross validation for all label scores:", {'average': 'macro'}),
)

for section_no, (section_title, scorer_kwargs) in enumerate(cv_sections):
    if section_no:
        # Blank line between the sections
        print()
    print(section_title)
    for metric_title, metric_fn in cv_metrics:
        print(metric_title)
        fold_scores = cross_val_score(
            model, X_train_2, y_train, cv=10,
            scoring=make_scorer(metric_fn, **scorer_kwargs),
        )
        print('\t\t', np.mean(fold_scores))

Cross validation for positive label scores:
	Average precision score:
		 0.586688866689
	Average recall score:
		 0.722222222222
	Average  F1 score:
		 0.642222222222

Cross validation for all label scores:
	Average precision score:
		 0.763394227069
	Average recall score:
		 0.80641598916
	Average  F1 score:
		 0.778222103306


In [82]:
print("Building for evaluation: SVM classifier")

# Train on the combined training features; fit() returns the estimator itself
model = LinearSVC(class_weight='balanced').fit(X_train_2, y_train)

print("Evaluation model fit")
print("Classification Report:\n")

# Score the held-out test users
y_pred = model.predict(X_test_2)
svm_report = classification_report(y_test, y_pred, digits=4)
print(svm_report)

Building for evaluation: SVM classifier
Evaluation model fit
Classification Report:

             precision    recall  f1-score   support

          0     0.9699    0.9148    0.9415       352
          1     0.5946    0.8148    0.6875        54

avg / total     0.9200    0.9015    0.9077       406



In [None]:
print("Building for evaluation: LogisticRegression classifier")

# Train on the combined training features; fit() returns the estimator itself
model = LogisticRegression(class_weight='balanced').fit(X_train_2, y_train)

print("Evaluation model fit")
print("Classification Report:\n")

# Score the held-out test users
y_pred = model.predict(X_test_2)
lr_report = classification_report(y_test, y_pred, digits=4)
print(lr_report)

In [None]:
print("Building for evaluation: Voting classifier (SVM + LR)")

model1 = LogisticRegression(class_weight='balanced')
# probability=True is required so the SVC can take part in soft voting
model2 = SVC(class_weight='balanced', kernel='linear', probability=True)
# The estimator names now match the estimators (they were swapped before).
# The weights are unchanged: logistic regression counts twice, the SVM once.
model = VotingClassifier(estimators=[('lr', model1), ('svm', model2)], voting='soft', weights=[2, 1])
model.fit(X_train_2, y_train)

print("Evaluation model fit")
print("Classification Report:\n")

y_pred = model.predict(X_test_2)
print(classification_report(y_test, y_pred, digits=4))

Building the complete model on the whole dataset:

In [None]:
# Stack the train and test rows (and their labels) into one dataset
X = vstack([X_train_2, X_test_2])
y = [*y_train, *y_test]

model_complete = LinearSVC(class_weight='balanced').fit(X, y)

print("Complete model fit.")

Most informative features:

In [None]:
# Rank the coefficients of the complete model against the tf-idf vocabulary
informative_features_report = show_most_informative_features(vect, model_complete)
print(informative_features_report)

Some baseline classifier testing:

In [None]:
# Raw counts over unigrams and bigrams for the Naive Bayes baselines; the
# documents are already token lists, so tokenization is switched off.
vect2 = CountVectorizer(
    tokenizer=identity,
    preprocessor=None,
    lowercase=False,
    ngram_range=(1, 2),
    min_df=20,
)
X_train_3 = vect2.fit_transform(X_train_prep, y_train)
X_test_3 = vect2.transform(X_test_prep)

In [None]:
print("Building for evaluation: BernoulliNB classifier")

# Baseline trained on the count features; fit() returns the estimator itself
model2 = BernoulliNB().fit(X_train_3, y_train)

print("Evaluation model fit")
print("Classification Report:\n")

y_pred = model2.predict(X_test_3)
bernoulli_report = classification_report(y_test, y_pred)
print(bernoulli_report)

In [None]:
print("Building for evaluation: MultinomialNB classifier")

# Baseline trained on the count features; fit() returns the estimator itself
model3 = MultinomialNB().fit(X_train_3, y_train)

print("Evaluation model fit")
print("Classification Report:\n")

y_pred = model3.predict(X_test_3)
multinomial_report = classification_report(y_test, y_pred)
print(multinomial_report)

In [None]:
print("Building for evaluation: DecisionTree classifier")

# Baseline trained on the combined features; fit() returns the estimator itself
model4 = DecisionTreeClassifier(class_weight='balanced').fit(X_train_2, y_train)

print("Evaluation model fit")
print("Classification Report:\n")

y_pred = model4.predict(X_test_2)
tree_report = classification_report(y_test, y_pred, digits=4)
print(tree_report)

In [None]:
print("Building for evaluation: RandomForest classifier")

model4 = RandomForestClassifier(class_weight='balanced')
# Labels are y_train; the previously used name y_train_2 is never defined
model4.fit(X_train_2, y_train)

print("Evaluation model fit")
print("Classification Report:\n")

# Predict on the same feature layout the model was trained on (X_*_2)
y_pred = model4.predict(X_test_2)
print(classification_report(y_test, y_pred))

Empath testing:

In [None]:
lexicon = Empath()
# Candidate category lists for the Empath experiments below. Every entry
# needs its own comma: adjacent string literals are silently concatenated
# ('contentment' 'cold' would become 'contentmentcold').
relevant_lexical_categories1 = ['help', 'medical_emergency', 'hate', 'health', 'suffering',
                               'kill', 'fear', 'death', 'violence', 'love',
                               'anonymity', 'injury', 'appearance', 'sadness',
                               'emotional', 'ugliness', 'shame', 'torment',
                               'pain', 'negative_emotion', 'positive_emotion', 'friends',
                               'alcohol', 'nervousness', 'optimism', 'body', 'contentment',
                               'cold', 'school', 'communication', 'work', 'sleep', 'play',
                               'trust', 'social_media', 'sexual'
                              ]

relevant_lexical_categories = ['negative_emotion', 'speaking', 'positive_emotion', 'communication',
                               'friends', 'children', 'optimism', 'violence', 'pain', 'family',
                               'trust', 'love', 'party', 'business', 'home', 'shame', 'listen',
                               'giving', 'body', 'suffering', 'work', 'nervousness', 'strength',
                               'hearing', 'health', 'traveling', 'wedding', 'childish', 'hate',
                               'social_media', 'sadness', 'school'
                              ]

# Texts of the positive test users
x_senti1 = []
y_senti1 = []
read_entries(X=x_senti1, y=y_senti1, path_list=test_pos_entry_path_list, default_label=1)

# Texts of the negative test users
x_senti2 = []
y_senti2 = []
read_entries(X=x_senti2, y=y_senti2, path_list=test_neg_entry_path_list, default_label=0)

In [None]:
# Average normalized Empath score per category over the positive test users
avg_dict1 = {}
len1 = len(x_senti1)
for i in x_senti1:
    # Empath's analyze() returns None instead of a dict when normalize=True
    # and the text has no tokens; such a text contributes nothing to the sums.
    d = lexicon.analyze(i, normalize=True) or {}
    # Accumulate in place instead of rebuilding the whole dict per text
    for k, v in d.items():
        avg_dict1[k] = v / len1 + avg_dict1.get(k, 0)

# Highest average score first
for k, v in sorted(avg_dict1.items(), key=lambda x: x[1], reverse=True):
    print(k, v)

In [None]:
# Average normalized Empath score per category over the negative test users
avg_dict2 = {}
len2 = len(x_senti2)
for i in x_senti2:
    # Empath's analyze() returns None instead of a dict when normalize=True
    # and the text has no tokens; such a text contributes nothing to the sums.
    d = lexicon.analyze(i, normalize=True) or {}
    # Accumulate in place instead of rebuilding the whole dict per text
    for k, v in d.items():
        avg_dict2[k] = v / len2 + avg_dict2.get(k, 0)

# Highest average score first
for k, v in sorted(avg_dict2.items(), key=lambda x: x[1], reverse=True):
    print(k, v)

In [None]:
# Category scores of one negative test user, listed in category-name order
d = lexicon.analyze(x_senti2[2], normalize=True)
result = []
for w in sorted(d):
    print(w, d[w])
    result.append(d[w])

print()
print(result)

In [None]:
# Quick check of how sent_tokenize splits a two-sentence text
sen = 'It is not exactly a big deal, but a huge sigh of relief to get confirmation that the movie is on the right track. I love life.'
sen_split = sent_tokenize(sen)
print(sen_split)

In [None]:
# Positions of the test users misclassified by the most recent y_pred
user_i = [i for i in range(len(y_test)) if y_test[i] != y_pred[i]]

# Re-read the user ids in the directory order used to build the test set
# (positive directory first, then negative)
path_list = test_pos_entry_path_list + test_neg_entry_path_list
entry_lists = [os.scandir(path) for path in path_list]

users = [
    etree.parse(entry.path).getroot()[0].text
    for list_of_entries in entry_lists
    for entry in list_of_entries
]

for i in user_i:
    print(users[i])

In [None]:
# Empath analyzes raw text: X_test is the tf-idf matrix, the raw texts of the
# test users live in X_test_raw
print(lexicon.analyze(X_test_raw[0]))

In [None]:
# Smoke test of the stemming pipeline on a single hand-written sentence
example = ['I love beinggg retarded']
preprocess_method = 'stem'
preprocessor = NLTKPreprocessor()
example2 = preprocessor.transform(example, method=preprocess_method)

print(example2)