In [1]:
import string
import os
import regex as re
import numpy as np
from itertools import combinations, chain
import pickle
from datetime import datetime

from lxml import etree
from operator import itemgetter

from nltk.corpus import stopwords as sw
from nltk.corpus import wordnet as wn
from nltk import wordpunct_tokenize
from nltk import WordNetLemmatizer
from nltk import SnowballStemmer
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import pos_tag

from sklearn.preprocessing import MaxAbsScaler, StandardScaler, MinMaxScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, f1_score, precision_score, recall_score

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import VotingClassifier

from scipy.sparse import vstack, hstack, coo_matrix

from empath import Empath
from textblob import TextBlob



Pickle up all data:

In [None]:
X_times_train = pickle.load(open("./dumps/X_times_train.p", "rb" ))
X_sentence_len_train = pickle.load(open("./dumps/X_sentence_len_train.p", "rb" ))
X_post_cnt_train = pickle.load(open("./dumps/X_post_cnt_train.p", "rb" ))
X_sentiment_train = pickle.load(open("./dumps/X_sentiment_train.p", "rb" ))
X_subjectivity_train = pickle.load(open("./dumps/X_subjectivity_train.p", "rb" ))

X_times_test = pickle.load(open("./dumps/X_times_test.p", "rb" ))
X_sentence_len_test = pickle.load(open("./dumps/X_sentence_len_test.p", "rb" ))
X_post_cnt_test = pickle.load(open("./dumps/X_post_cnt_test.p", "rb" ))
X_sentiment_test = pickle.load(open("./dumps/X_sentiment_test.p", "rb" ))
X_subjectivity_test = pickle.load(open("./dumps/X_subjectivity_test.p", "rb" ))

X_pos_tags_train = pickle.load(open( "X_pos_tags_train.p", "rb" ))
X_pos_tags_test = pickle.load(open( "X_pos_tags_test.p", "rb" ))

X_lexicon_sizes_train = pickle.load(open( "X_lexicon_sizes_train.p", "rb" ))
X_lexicon_sizes_test = pickle.load(open( "X_lexicon_sizes_test.p", "rb" ))

Class for Corpus preprocessing:

In [2]:
sw_diff = {'i', 'me', 'my', 'mine', 'myself', 'we', 'us', 'our', 'ours', 'ourselves', 'he', 'him', 'his', 'himself', 
           'she', 'her', 'hers', 'herself', 'you', 'your', 'yours', 'yourselves', 'they', 'them', 'their', 'theirs', 
           'themselves', 'absolutely', 'all', 'always', 'complete', 'completely', 'constant', 'constantly','definitely', 
           'entire', 'ever', 'every', 'everyone', 'everything', 'full', 'must', 'never', 'nothing', 'totally', 'whole',
           'just', 'only', 'noone', 'none', 'no', 'nobody', 'each', 'everybody'}

class NLTKPreprocessor(BaseEstimator, TransformerMixin):

    def __init__(self, stopwords=None, punct=None,
                 lower=True, strip=True):
        self.lower      = lower
        self.strip      = strip
        self.stopwords  = stopwords or set(sw.words('english'))
        self.stopwords.difference_update(sw_diff)
        self.punct      = punct or set(string.punctuation)
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = SnowballStemmer(language='english')

    def fit(self, X, y=None):
        return self

    def inverse_transform(self, X):
        return [" ".join(doc) for doc in X]

    def transform(self, X, method='lem'):
        return [
            list(self.tokenize(doc, method)) for doc in X
        ]   

    def tokenize(self, document, method='lem'):
        if(method == 'lem'):
            # Break the document into sentences
            for sent in sent_tokenize(document):
                # Break the sentence into part of speech tagged tokens
                for token, tag in pos_tag(wordpunct_tokenize(sent)):
                    # Apply preprocessing to the token
                    token = self.process_token(token)
                    if not self.is_valid_token(token):
                        continue
                        
                    # Lemmatize the token and yield
                    lemma = self.lemmatize(token, tag)
                    yield lemma
                    
        elif(method == 'stem'):
            # Break the document into tokens
            for token in wordpunct_tokenize(document):
                # Apply preprocessing to the token
                token = self.process_token(token)
                if not self.is_valid_token(token):
                    continue
                
                stem = self.stem(token)
                yield stem
        else:
            raise ValueError('Unknown method type.')

    def lemmatize(self, token, tag):
        tag = {
            'N': wn.NOUN,
            'V': wn.VERB,
            'R': wn.ADV,
            'J': wn.ADJ
        }.get(tag[0], wn.NOUN)

        return self.lemmatizer.lemmatize(token, tag)
    
    def stem(self, token):
        return self.stemmer.stem(token)
    
    def process_token(self, token):
        token = token.lower() if self.lower else token
        token = token.strip() if self.strip else tcharoken
        token = token.strip('_') if self.strip else token
        token = token.strip('*') if self.strip else token
        return token
    
    def is_valid_token(self, token):
        # If stopword, token is invalid
        if token in self.stopwords:
            return False

        # If punctuation, token is invalid
        if all(char in self.punct for char in token):
            return False
        
        return True

This part of code loads data corpus from multiple files into lists X (texts) and y(labels) with one entry per user:

In [3]:
def read_entries(X, y, path_list, label_dict=None, default_label=0):
    entry_lists = []
    for path in path_list:
        entry_lists.append(os.scandir(path))
    
    IMAGE_STR = 'data:image'
    
    for list_of_entries in entry_lists:
        for entry in list_of_entries:
            root = etree.parse(entry.path).getroot()
            user_id = root[0].text
        
            user_text = ''
            for post in root.findall('.//TITLE') + root.findall('.//TEXT'):
                post = post.text.strip().strip()
                if post != '':
                    if IMAGE_STR in post:
                        continue
                    post = re.sub(r"http\S+", " ", post)
                    post = re.sub(r"\d+", " ", post)
                    post = re.sub(u"\xa0", " ", post)
                    post = re.sub(u"\\p{P}+", " ", post)
                    user_text += ' ' + post.lower()
            
            X.append(user_text)
            label = int(label_dict[user_id]) if label_dict else default_label
            y.append(label)

Utility methods for extracting features:

In [4]:
def get_avg_sentence_length(sentences):
    sum = 0
    for sentence in sentences:
        sentence = sentence.replace(' ', '')
        sum += len(sentence)
    return sum / len(sentences) if sentences else 0

In [5]:
def get_sentiment_and_subjectivity(sentences):
    sum_sentiment = 0
    sum_subjectivity = 0
    if len(sentences) > 0:
        for sentence in sentences:
            tb = TextBlob(sentence)
            sum_sentiment += tb.sentiment.polarity
            sum_subjectivity += tb.sentiment.subjectivity
        sum_sentiment = sum_sentiment / float(len(sentences))
        sum_subjectivity = sum_subjectivity / float(len(sentences))
        return (sum_sentiment, sum_subjectivity)
    else:
        return (0.0, 0.0)
        

In [6]:
def read_features(X_times, X_sentence_lengths, X_post_cnt, X_sentiment,
                  X_subjectivity, X_post_lengths, X_post_freq, path_list):
    entry_lists = []
    for path in path_list:
        entry_lists.append(os.scandir(path))
        
    IMAGE_STR = 'data:image'
    datetime_pattern = '%Y-%m-%d %H:%M:%S'
    date_end = None
    date_start = None
    
    for list_of_entries in entry_lists:
        for entry in list_of_entries:
            root = etree.parse(entry.path).getroot()
            user_id = root[0].text
            
            sentences = []
            post_lengths = []
            post_cnt = 0
            for post in root.findall('.//TEXT'):
                post_cnt += 1
                post = post.text.strip()
                if post != '':
                    sentences.extend(sent_tokenize(post))
                    post_lengths.append(len(post))
                else:
                    post_lengths.append(0)
            
            avg_sentiment, avg_subjectivity = get_sentiment_and_subjectivity(sentences)
            avg_sentence_length = get_avg_sentence_length(sentences)
            avg_post_length = np.mean(post_lengths)
            
            sum_hours = 0
            for date in root.findall('.//DATE'):
                date = date.text.strip()
                if date != '':                    
                    if not date_end:
                        date_end = datetime.strptime(date, datetime_pattern)
                    date_start = datetime.strptime(date, datetime_pattern)
                    
                    m = re.match(r'\d{4}-\d{2}-\d{2} (\d{2}).*', date)
                    hour = int(m.group(1))
                    sum_hours += hour
            
            post_span_minutes = (date_end - date_start).total_seconds()/60
            post_freq = post_span_minutes / post_cnt
            
            time = [0] * 8
            avg_hour = sum_hours / post_cnt
            index = int(avg_hour // 3)
            time[index] = 1
            
            X_post_cnt.append([post_cnt])
            X_sentence_lengths.append([avg_sentence_length])
            X_times.append(time)
            X_sentiment.append([avg_sentiment])
            X_subjectivity.append([avg_subjectivity])
            X_post_lengths.append([avg_post_length])
            X_post_freq.append([post_freq])

Reading input files:

In [7]:
cwd = os.getcwd()
TRAIN_PATH = os.path.join(cwd, "reddit-training-ready-to-share")
TEST_PATH = os.path.join(cwd, "reddit-test-data-ready-to-share")

TRAIN_POSITIVE_PATH = os.path.join(TRAIN_PATH, "positive_examples_anonymous")
TRAIN_NEGATIVE_PATH = os.path.join(TRAIN_PATH, "negative_examples_anonymous")

TEST_POSITIVE_PATH = os.path.join(TEST_PATH, "positive_examples_anonymous")
TEST_NEGATIVE_PATH = os.path.join(TEST_PATH, "negative_examples_anonymous")

TRAIN_LABELS_PATH = os.path.join(cwd, 'risk_golden_truth.txt')

IMAGE_STR = 'data:image'

train_labels_file = open(TRAIN_LABELS_PATH, 'r')
train_label_dict = {}
for line in train_labels_file:
    xml_file, label = line.split(' ')
    train_label_dict[xml_file] = label
train_labels_file.close()

In [8]:
X_train_raw = []
y_train = []
X_test_raw = []
y_test = []

train_entry_path_list = [TRAIN_POSITIVE_PATH, TRAIN_NEGATIVE_PATH]
test_pos_entry_path_list = [TEST_POSITIVE_PATH]
test_neg_entry_path_list = [TEST_NEGATIVE_PATH]

read_entries(X=X_train_raw, y=y_train, path_list=train_entry_path_list, label_dict=train_label_dict)
read_entries(X=X_test_raw, y=y_test, path_list=test_pos_entry_path_list, default_label=1)
read_entries(X=X_test_raw, y=y_test, path_list=test_neg_entry_path_list, default_label=0)

In [9]:
def get_pronoun_and_absolutizm_features(X_raw):
    
    fp_pronouns = {'i', 'me', 'my', 'mine', 'myself', 'we', 'us', 'our', 'ours', 'ourselves'}
    
    tp_pronouns = {'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
                   'you', 'your', 'yours', 'yourselves', 'they', 'them', 'their', 'theirs', 'themselves'}
    
    absolutisms = {'absolutely', 'all', 'always', 'complete', 'completely', 'constant', 'constantly','definitely', 
                   'entire', 'ever', 'every', 'everyone', 'everything', 'full', 'must', 'never', 'nothing', 
                   'totally', 'whole', 'just', 'only', 'noone', 'none', 'no', 'nobody', 'each', 'everybody'}


    fp_freq = []
    tp_freq = []
    absolutisms_freq = []
    
    for entry in X_raw:
        sum_fp = 0
        sum_tp = 0
        sum_abs = 0
        tokens = word_tokenize(entry)
        for word in tokens:
            if word in fp_pronouns:
                sum_fp += 1
            elif word in tp_pronouns:
                sum_tp += 1
            elif word in absolutisms:
                sum_abs += 1
        sum_fp = sum_fp / float(len(tokens))
        sum_tp = sum_tp / float(len(tokens))
        sum_abs = sum_abs / float(len(tokens))
        fp_freq.append([sum_fp])
        tp_freq.append([sum_tp])
        absolutisms_freq.append([sum_abs])
    
    return (fp_freq, tp_freq, absolutisms_freq)        

Extracting features:

In [10]:
X_times_train = []
X_times_test = []

X_sentence_len_train = []
X_sentence_len_test = []

X_post_cnt_train = []
X_post_cnt_test = []

X_sentiment_train = []
X_sentiment_test = []

X_subjectivity_train = []
X_subjectivity_test = []

X_post_lengths_train = []
X_post_lengths_test = []

X_post_freq_train = []
X_post_freq_test = []

read_features(X_times=X_times_train, X_sentence_lengths=X_sentence_len_train, X_post_cnt=X_post_cnt_train,
              X_sentiment=X_sentiment_train, X_subjectivity=X_subjectivity_train, X_post_lengths=X_post_lengths_train,
              X_post_freq=X_post_freq_train, path_list=train_entry_path_list)
read_features(X_times=X_times_test, X_sentence_lengths=X_sentence_len_test, X_post_cnt=X_post_cnt_test,
              X_sentiment=X_sentiment_test, X_subjectivity=X_subjectivity_test, X_post_lengths=X_post_lengths_test,
              X_post_freq=X_post_freq_test, path_list=test_pos_entry_path_list)
read_features(X_times=X_times_test, X_sentence_lengths=X_sentence_len_test, X_post_cnt=X_post_cnt_test,
              X_sentiment=X_sentiment_test, X_subjectivity=X_subjectivity_test, X_post_lengths=X_post_lengths_test,
              X_post_freq=X_post_freq_test, path_list=test_neg_entry_path_list)

In [11]:
X_fp_pronouns_train, X_tp_pronouns_train, X_absolutisms_train = get_pronoun_and_absolutizm_features(X_train_raw)
X_fp_pronouns_test, X_tp_pronouns_test, X_absolutisms_test = get_pronoun_and_absolutizm_features(X_test_raw)

In [12]:
def get_semantic_features(X):
    lexicon = Empath()
    
    relevant_lexical_categories = ['negative_emotion', 'positive_emotion', 'communication',
                                    'violence', 'business', 'nervousness', 'body', 'pain',
                                    'internet', 'work', 'shame', 'poor'
                              ]
    
    relevant_lexical_categories2 = ['negative_emotion', 'positive_emotion',
                                   'nervousness', 'love', 'shame', 'pain'
                              ]
    
    feature_mat = []
    for text in X:
        d = lexicon.analyze(text, categories=relevant_lexical_categories2, normalize=True)
        feature_mat.append([d[key] for key in sorted(d.keys(), reverse=False)])
    return feature_mat

In [13]:
X_sem_feat_train = get_semantic_features(X_train_raw)
X_sem_feat_test = get_semantic_features(X_test_raw)

In [None]:
pickle.dump(X_times_train, open("./dumps/X_times_train.p", "wb" ))
pickle.dump(X_sentence_len_train, open("./dumps/X_sentence_len_train.p", "wb" ))
pickle.dump(X_post_cnt_train, open("./dumps/X_post_cnt_train.p", "wb" ))
pickle.dump(X_sentiment_train, open("./dumps/X_sentiment_train.p", "wb" ))
pickle.dump(X_subjectivity_train, open("./dumps/X_subjectivity_train.p", "wb" ))
pickle.dump(X_post_lengths_train, open("./dumps/X_post_lengths_train.p", "wb" ))
pickle.dump(X_post_freq_train, open("./dumps/X_post_freq_train.p", "wb" ))
pickle.dump(X_fp_pronouns_train, open("./dumps/X_fp_pronouns_train.p", "wb" ))
pickle.dump(X_tp_pronouns_train, open("./dumps/X_tp_pronouns_train.p", "wb" ))
pickle.dump(X_absolutisms_train, open("./dumps/X_absolutisms_train.p", "wb" ))

pickle.dump(X_times_test, open("./dumps/X_times_test.p", "wb" ))
pickle.dump(X_sentence_len_test, open("./dumps/X_sentence_len_test.p", "wb" ))
pickle.dump(X_post_cnt_test, open("./dumps/X_post_cnt_test.p", "wb" ))
pickle.dump(X_sentiment_test, open("./dumps/X_sentiment_test.p", "wb" ))
pickle.dump(X_subjectivity_test, open("./dumps/X_subjectivity_test.p", "wb" ))
pickle.dump(X_post_lengths_test, open("./dumps/X_post_lengths_test.p", "wb" ))
pickle.dump(X_post_freq_test, open("./dumps/X_post_freq_test.p", "wb" ))
pickle.dump(X_fp_pronouns_test, open("./dumps/X_fp_pronouns_test.p", "wb" ))
pickle.dump(X_tp_pronouns_test, open("./dumps/X_tp_pronouns_test.p", "wb" ))
pickle.dump(X_absolutisms_test, open("./dumps/X_absolutisms_test.p", "wb" ))

pickle.dump(X_sem_feat_train, open("./dumps/X_sem_feat_train.p", "wb" ))
pickle.dump(X_sem_feat_test, open("./dumps/X_sem_feat_test.p", "wb" ))

We use X list as input to NLTKPreprocessor class which outputs list of preprocessed, tokenized texts:

In [14]:
preprocessor = NLTKPreprocessor()
preprocess_method = 'stem'
X_train_prep = preprocessor.transform(X_train_raw, method=preprocess_method)
X_test_prep = preprocessor.transform(X_test_raw, method=preprocess_method)

In [15]:
def get_pos_tags(X):
    pos_tag_mat = []
    for tokens in X:
        tag_dict = { 'CC': 0, 'DT': 0, 'IN': 0, 'JJ': 0, 'JJR': 0, 'JJS': 0,
                    'NN': 0, 'NNP':0, 'NNS': 0, 'PRP': 0, 'PRP$': 0, 'RB': 0,
                    'RBR': 0, 'RBS': 0, 'RP': 0, 'VB': 0, 'VBD': 0, 'VBG': 0,
                    'VBN': 0, 'VBP': 0, 'VBZ': 0}
        
        text_len = len(tokens)
        tags = pos_tag(tokens)
        
        for word, tag in tags:
            if tag in tag_dict.keys():
                tag_dict[tag] += 1/text_len
        
        tag_freq = [tag_dict[key] for key in sorted(tag_dict.keys(), reverse=False)]
        pos_tag_mat.append(tag_freq)
    
    return pos_tag_mat

In [16]:
X_pos_tags_train = get_pos_tags(X_train_prep)
X_pos_tags_test = get_pos_tags(X_test_prep)

In [None]:
pickle.dump(X_pos_tags_train, open("./dumps/X_pos_tags_train.p", "wb" ))
pickle.dump(X_pos_tags_test, open("./dumps/X_pos_tags_test.p", "wb" ))

In [17]:
def get_lexicon_sizes(X):
    unique_cnt_mat = []
    for tokens in X:
        unique_cnt_mat.append([len(set(tokens))])
    
    return unique_cnt_mat

In [18]:
X_lexicon_sizes_train = get_lexicon_sizes(X_train_prep)
X_lexicon_sizes_test = get_lexicon_sizes(X_test_prep)

In [None]:
pickle.dump(X_lexicon_sizes_train, open("./dumps/X_lexicon_sizes_train.p", "wb" ))
pickle.dump(X_lexicon_sizes_test, open("./dumps/X_lexicon_sizes_test.p", "wb" ))

We use tf-idf vectorizer for vector representation of the documents:

In [21]:
def identity(arg):
    """
    Simple identity function works as a passthrough.
    """
    return arg

In [22]:
vect = TfidfVectorizer(tokenizer=identity, preprocessor=None, lowercase=False, ngram_range=(1, 1), min_df=30)
X_train = vect.fit_transform(X_train_prep, y_train)
X_test = vect.transform(X_test_prep)

In [40]:
#print(vect.get_feature_names())
#print(X_train.getnnz())

762629


Normalizing and adding features:

In [23]:
max_abs_scaler = MaxAbsScaler()
X_sentence_len_train_scaled = max_abs_scaler.fit_transform(X_sentence_len_train)
X_sentence_len_test_scaled = max_abs_scaler.transform(X_sentence_len_test)

X_sem_feat_train_scaled = max_abs_scaler.fit_transform(X_sem_feat_train)
X_sem_feat_test_scaled = max_abs_scaler.transform(X_sem_feat_test)

X_post_cnt_train_scaled = max_abs_scaler.fit_transform(X_post_cnt_train)
X_post_cnt_test_scaled = max_abs_scaler.transform(X_post_cnt_test)

X_sentiment_train_scaled = max_abs_scaler.fit_transform(X_sentiment_train)
X_sentiment_test_scaled = max_abs_scaler.transform(X_sentiment_test)

X_subjectivity_train_scaled = max_abs_scaler.fit_transform(X_subjectivity_train)
X_subjectivity_test_scaled = max_abs_scaler.transform(X_subjectivity_test)

X_fp_pronouns_train_scaled = max_abs_scaler.fit_transform(X_fp_pronouns_train)
X_fp_pronouns_test_scaled = max_abs_scaler.transform(X_fp_pronouns_test)

X_tp_pronouns_train_scaled = max_abs_scaler.fit_transform(X_tp_pronouns_train)
X_tp_pronouns_test_scaled = max_abs_scaler.transform(X_tp_pronouns_test)

X_absolutisms_train_scaled = max_abs_scaler.fit_transform(X_absolutisms_train)
X_absolutisms_test_scaled = max_abs_scaler.transform(X_absolutisms_test)

X_pos_tags_train_scaled = max_abs_scaler.fit_transform(X_pos_tags_train)
X_pos_tags_test_scaled = max_abs_scaler.fit_transform(X_pos_tags_test)

X_lexicon_sizes_train_scaled = max_abs_scaler.fit_transform(X_lexicon_sizes_train)
X_lexicon_sizes_test_scaled = max_abs_scaler.fit_transform(X_lexicon_sizes_test)

X_post_lengths_train_scaled = max_abs_scaler.fit_transform(X_post_lengths_train)
X_post_lengths_test_scaled = max_abs_scaler.fit_transform(X_post_lengths_test)

X_post_freq_train_scaled = max_abs_scaler.fit_transform(X_post_freq_train)
X_post_freq_test_scaled = max_abs_scaler.fit_transform(X_post_freq_test)

Building and evaluating models:

In [24]:
#helper function to find allsubsets of a set (needed to find all subsets of set of new feature for CV)
allsubsets = lambda n: list(chain(*[combinations(range(n), ni) for ni in range(n+1)]))

# r - return only subsets of r size (reduction of search space) + empty set; if None, return all subsets
def get_subsets(n, r=None):
    if r==None:
        return allsubsets(n)
    else:
        combs = list(combinations(range(n), r))
        combs.append(())
        return combs

In [43]:
print(len(get_subsets(10, 8)))

46


In [56]:
#Function for k-fold cross-validation of a model
#Performs grid search on C parameter, subsets of new features and k-best token features by chi2 stat
#For model_name parameter use 'SVM' - SVM or 'LR' - LogisticRegression

#EXAMPLE: see example below baseline for CV of SVM model
def cross_validate_model(model_name, X_train, y_train, C_list, k_list, new_features, k_folds, subset_size=None,
                        regularization='l2'):
    feature_subsets = get_subsets(len(new_features), subset_size)
    best_C = 0
    best_k = 0
    best_feature_set = {}
    best_score = -1
    scorer = make_scorer(f1_score, average='macro', labels=[1])
    
    for k_features in k_list:
        
        chi2_selector = SelectKBest(chi2, k=k_features)
        X_kbest_train = chi2_selector.fit_transform(X_train, y_train)
        
        for c in C_list:
            
            if model_name == 'SVM':
                model = LinearSVC(class_weight='balanced', C=c, penalty=regularization) 
            elif model_name == 'LR':
                model = LogisticRegression(class_weight='balanced', C=c, penalty=regularization)
            
            for subset in feature_subsets:
                X_new = X_kbest_train
                for i in subset:
                    X_new = hstack([X_new, new_features[i]])
                scores = cross_val_score(model, X_new, y_train, cv=k_folds, scoring=scorer)
                mean = scores.mean()
                if mean > best_score:
                    best_score = mean
                    best_C = c
                    best_k = k_features
                    best_feature_set = subset
         
    return (best_score, best_k, best_C, best_feature_set)

In [26]:
print("Building for evaluation: BASELINE")

model = LinearSVC(class_weight='balanced')
model.fit(X_train, y_train)

print("Classification Report:\n")
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))

Building for evaluation: BASELINE
Classification Report:

             precision    recall  f1-score   support

          0     0.9654    0.8722    0.9164       352
          1     0.4886    0.7963    0.6056        54

avg / total     0.9020    0.8621    0.8751       406



In [57]:
#Cross validation of SVM
#C_range = 10. ** np.arange(-3, 8)
#k_range = np.arange(10, 3000, 300)
C_range = [0.01, 0.1, 1, 10, 100]
k_range = [100, 1500, 3000, 'all']
new_features = [X_times_train, X_sem_feat_train_scaled,
                X_sentiment_train_scaled, X_subjectivity_train_scaled, X_fp_pronouns_train_scaled,
                X_tp_pronouns_train_scaled, X_absolutisms_train_scaled, X_pos_tags_train_scaled,
                X_lexicon_sizes_train_scaled, X_post_freq_train_scaled]

#best_feature_set vraća kao tuple indexa u listi new_features (gore)
best_score, best_k, best_C, best_feature_set = cross_validate_model('SVM', X_train, y_train, C_range,
                                                                   k_range, new_features, 4, 9)

print("Best score: " + str(best_score))
print("Best C: " + str(best_C))
print("Best k: " + str(best_k))
print("Best new feature subset: " + str(best_feature_set))

Best score: 0.7027198542290081
Best C: 100
Best k: 1500
Best new feature subset: (0, 1, 2, 3, 4, 5, 6, 8, 9)


In [55]:
new_features_test = [X_times_test, X_sem_feat_test_scaled,
                    X_sentiment_test_scaled, X_subjectivity_test_scaled, X_fp_pronouns_test_scaled,
                    X_tp_pronouns_test_scaled, X_absolutisms_test_scaled, X_pos_tags_test_scaled,
                    X_lexicon_sizes_test_scaled, X_post_freq_test_scaled]

In [54]:
print("Building for evaluation: SVM classifier")

model = LinearSVC(class_weight='balanced', C=100)

chi2_selector = SelectKBest(chi2, k=1500)
X_kbest_train = chi2_selector.fit_transform(X_train, y_train)
X_kbest_test = chi2_selector.transform(X_test)

X_final_train = X_kbest_train
for i in best_feature_set:
    X_final_train = hstack([X_final_train, new_features[i]])
    
X_final_test = X_kbest_test
for i in best_feature_set:
    X_final_test = hstack([X_final_test, new_features_test[i]])

model.fit(X_final_train, y_train)

print("Evaluation model fit")
print("Classification Report:\n")

y_pred = model.predict(X_final_test)
print(classification_report(y_test, y_pred, digits=4))

Building for evaluation: SVM classifier
Evaluation model fit
Classification Report:

             precision    recall  f1-score   support

          0     0.9300    0.9062    0.9180       352
          1     0.4762    0.5556    0.5128        54

avg / total     0.8697    0.8596    0.8641       406



In [None]:
print("Building for evaluation: LogisticRegression classifier")

model = LogisticRegression(class_weight='balanced')
model.fit(X_train_2, y_train)

print("Evaluation model fit")
print("Classification Report:\n")

y_pred = model.predict(X_test_2)
print(classification_report(y_test, y_pred, digits=4))

In [None]:
print("Building for evaluation: Voting classifier (SVM + LR)")

model1 = LogisticRegression(class_weight='balanced')
model2 = SVC(class_weight='balanced', kernel='linear', probability=True)
model = VotingClassifier(estimators=[('svm', model1), ('lr', model2)], voting='soft', weights=[2, 1])
model.fit(X_train_2, y_train)

print("Evaluation model fit")
print("Classification Report:\n")

y_pred = model.predict(X_test_2)
print(classification_report(y_test, y_pred, digits=4))

Building the complete model on whole dataset:

In [None]:
X = vstack((X_train_2, X_test_2))
y = y_train + y_test

model_complete = LinearSVC(class_weight='balanced')
model_complete.fit(X, y)

print("Complete model fit.")

Most informative features:

In [None]:
print(show_most_informative_features(vect, model_complete))

Some baseline classifier testing:

In [None]:
vect2 = CountVectorizer(tokenizer=identity, preprocessor=None, lowercase=False, ngram_range=(1, 2), min_df=20)
X_train_3 = vect2.fit_transform(X_train_prep, y_train)
X_test_3 = vect2.transform(X_test_prep)

In [None]:
print("Building for evaluation: BernoulliNB classifier")

model2 = BernoulliNB()
model2.fit(X_train_3, y_train)

print("Evaluation model fit")
print("Classification Report:\n")

y_pred = model2.predict(X_test_3)
print(classification_report(y_test, y_pred))

In [None]:
print("Building for evaluation: MultinomialNB classifier")

model3 = MultinomialNB()
model3.fit(X_train_3, y_train)

print("Evaluation model fit")
print("Classification Report:\n")

y_pred = model3.predict(X_test_3)
print(classification_report(y_test, y_pred))

In [None]:
print("Building for evaluation: DecisionTree classifier")

model4 = DecisionTreeClassifier(class_weight='balanced')
model4.fit(X_train_2, y_train)

print("Evaluation model fit")
print("Classification Report:\n")

y_pred = model4.predict(X_test_2)
print(classification_report(y_test, y_pred, digits=4))

In [None]:
print("Building for evaluation: RandomForest classifier")

model4 = RandomForestClassifier(class_weight='balanced')
model4.fit(X_train_2, y_train_2)

print("Evaluation model fit")
print("Classification Report:\n")

y_pred = model4.predict(X_test)
print(classification_report(y_test, y_pred))

Empath testing:

In [None]:
lexicon = Empath()
relevant_lexical_categories1 = ['help', 'medical_emergency', 'hate', 'health', 'suffering', 
                               'kill', 'fear', 'death', 'violence', 'love',
                               'anonymity', 'injury', 'appearance', 'sadness',
                               'emotional', 'ugliness', 'shame', 'torment',
                               'pain', 'negative_emotion', 'positive_emotion', 'friends',
                               'alcohol', 'nervousness', 'optimism', 'body', 'contentment'
                               'cold', 'school', 'communication', 'work', 'sleep', 'play'
                               'trust', 'social_media', 'sexual'
                              ]

relevant_lexical_categories = ['negative_emotion', 'speaking', 'positive_emotion', 'communication',
                               'friends', 'children', 'optimism', 'violence', 'pain', 'family',
                               'trust', 'love', 'party', 'business', 'home', 'shame', 'listen',
                               'giving', 'body', 'suffering', 'work', 'nervousness', 'strength',
                               'hearing', 'health', 'traveling', 'wedding', 'childish', 'hate',
                               'social_media', 'sadness', 'school'
                              ]

x_senti1 = []
y_senti1 = []
read_entries(X=x_senti1, y=y_senti1, path_list=test_pos_entry_path_list, default_label=1)

x_senti2 = []
y_senti2 = []
read_entries(X=x_senti2, y=y_senti2, path_list=test_neg_entry_path_list, default_label=0)

In [None]:
avg_dict1 = {}
len1 = len(x_senti1)
for i in x_senti1:
    d = lexicon.analyze(i, normalize=True)
    avg_dict1 = { k: d.get(k, 0)/len1 + avg_dict1.get(k, 0) for k in set(d) | set(avg_dict1) }
    #d = {k: v for k, v in d.items() if v > 0}
    
for k, v in sorted(avg_dict1.items(), key=lambda x: x[1], reverse=True):
    print(k, v)

In [None]:
avg_dict2 = {}
len2 = len(x_senti2)
for i in x_senti2:
    d = lexicon.analyze(i, normalize=True)
    avg_dict2 = { k: d.get(k, 0)/len2 + avg_dict2.get(k, 0) for k in set(d) | set(avg_dict2) }
    #d = {k: v for k, v in d.items() if v > 0}
    
for k, v in sorted(avg_dict2.items(), key=lambda x: x[1], reverse=True):
    print(k, v)

In [None]:
#print(list(lexicon.analyze(x_senti2[2], normalize=True).values()))
d = lexicon.analyze(x_senti2[2], normalize=True)
for w in sorted(d.keys(), reverse=False):
    print(w, d[w])

result = [d[key] for key in sorted(d.keys(), reverse=False)]
print()
print(result)

In [None]:
sen = 'It is not exactly a big deal, but a huge sigh of relief to get confirmation that the movie is on the right track. I love life.'
print(sent_tokenize(sen))

In [None]:
user_i = []
for i in range(len(y_test)):
    if(y_test[i] != y_pred[i]):
        user_i.append(i)

entry_lists = []
path_list = test_pos_entry_path_list + test_neg_entry_path_list
for path in path_list:
    entry_lists.append(os.scandir(path))
    
users = []

for list_of_entries in entry_lists:
    for entry in list_of_entries:
        root = etree.parse(entry.path).getroot()
        user_id = root[0].text
        users.append(user_id)

for i in user_i:
    print(users[i])

In [None]:
print(lexicon.analyze(X_test[0]))

In [None]:
example = ['I love beinggg retarded']
preprocessor = NLTKPreprocessor()
preprocess_method = 'stem'
example2 = preprocessor.transform(example, method=preprocess_method)

print(example2)