In [1]:
from mord import *
from numpy import average
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# Tweet object

In [2]:
class Tweet(object):
    def __init__(self, message, res):
        self.message = message
        self.res = res

    def __str__(self):
        return str(self.message) + " " + str(self.res)


def get_tweet(str_tweet, res_acc=1):
    num, message, common_class, res = str_tweet.split('\t')
    return Tweet(message, float(res[0:res_acc]))


def get_tweets(str_tweets, res_acc=1):
    return [get_tweet(line, res_acc) for line in str_tweets.split('\n') if len(line) > 0]


# Features

In [3]:
def get_XY_char_ngrams(tweets):
    vectorizer = CountVectorizer(analyzer='char_wb', max_features=500, ngram_range=(3, 8))
    X = vectorizer.fit_transform([t.message for t in tweets]).toarray()
    Y = [t.res for t in tweets]
    return X, Y

def get_XY_word_ngrams(tweets):
    vectorizer = CountVectorizer(max_features=500)
    X = vectorizer.fit_transform([t.message for t in tweets]).toarray()
    Y = [t.res for t in tweets]
    return X, Y


In [4]:
from math import ceil

from features.nrc_lexicon import get_lexicon


def count_caps(tweet):
    caps = 0
    for word in tweet.message.split():
        caps += (len(word) > 2) & (word.isupper())
    return caps


def count_symbol(tweet, symbol):
    return tweet.message.count(symbol)


def starts_with_vowel(tweet):
    return tweet.message[0] in 'AaIiEeUuOo'


def count_intensity(tweet, emotion):
    lex = get_lexicon()
    intensity = 0.0
    for word in tweet.message.split():
        intensity += lex.get(emotion + '---' + word, 0.0)
    return ceil(intensity)


### Get tweets

In [5]:
EMOTION = 'anger'
FILENAME = '/home/vanyadeg/Desktop/diploma/data/EIoc/EI-oc-En-' + EMOTION + '-train.txt'

file = open(FILENAME, 'r')
tweets = get_tweets(file.read())

### Remove hashtags

In [6]:
def remove_hashtags(tweets):
    for tweet in tweets:
        tweet.message.replace('#', '')

remove_hashtags(tweets)

### Get features(word n-grams)

In [7]:
def add_features(X, tweets, emotion):
    X_list = X.tolist()
    for x, tweet in zip(X_list, tweets):
        x.append(count_caps(tweet))
        x.append(count_symbol(tweet, '!'))
        x.append(count_intensity(tweet, emotion))
    return np.array(X_list)


X, Y = get_XY_word_ngrams(tweets)
X = add_features(X, tweets, EMOTION)
X = VarianceThreshold().fit_transform(X)
Y = [int(y) for y in Y]

## Ordinal Classifier

In [8]:
def get_side(y, border):
    return int(y > border)


class OrdinalClassifier(object):
    def __init__(self, clf_supplier):
        self.clf_supplier = clf_supplier
        
        
    def fit(self, X, Y):
        X, Y = list(X), list(Y)
        self.classifiers = [self.get_classifier(X, Y, b) for b in [0, 1, 2]]
        
    
    def predict(self, X):
        X = list(X)
        return [self.predict_one(x) for x in X]
    
    
    def predict_one(self, x):
        results = [clf.predict_proba([x]).tolist()[0] for clf in self.classifiers]
        probs = [1 - results[0][1]]
        for idx in range(len(results) - 1):
            probs.append(results[idx][1] - results[idx + 1][1])
        probs.append(results[-1][1])
        answer = probs.index(max(probs))
        return answer
    
    
    def get_classifier(self, X, Y, border):
        border_y = [get_side(y, border) for y in Y]
        clf = self.clf_supplier()
        clf.fit(X, border_y)
        return clf
        
    
    def __str__(self):
        return 'Ordinal Classifier(' + str(self.clf_supplier()) +')'

    
def filter_index(X, index):
    return [X[i] for i in index]



def test_classifier(clf, train_X, train_Y, test_X, test_Y, metrics):
    clf.fit(np.array(train_X), np.array(train_Y))
    predicted = clf.predict(np.array(test_X))
    acc = accuracy_score(np.array(test_Y), predicted)
    f1 = f1_score(np.array(test_Y), predicted, average='macro')
    metrics.append((acc, f1))

## Classifiers

In [9]:
def get_classifiers(): 
    classifiers = []
    for clf_supplier in [lambda: MultinomialNB(alpha=1.0)
                     , lambda: AdaBoostClassifier()
                     , lambda: DecisionTreeClassifier()]:
        classifiers.append(clf_supplier())
        classifiers.append(OrdinalClassifier(clf_supplier))
    for clf in [LogisticSE(max_iter=10 ** 6), LogisticIT(max_iter=10 ** 6), LogisticAT(max_iter=10 ** 6)]:
        classifiers.append(clf)
    return classifiers

In [10]:
def fit_predict(X, Y):
    kf = KFold(n_splits=10, shuffle=True)
    for clf in get_classifiers():
        metrics = []
        for train_index, test_index in kf.split(X):
            train_X = filter_index(X, train_index)
            train_Y = filter_index(Y, train_index)
            test_X = filter_index(X, test_index)
            test_Y = filter_index(Y, test_index)
            test_classifier(clf, train_X, train_Y, test_X, test_Y, metrics)
        accuracies = [x[0] for x in metrics]
        f1_scores = [x[1] for x in metrics]
        print ("Average accuracy:" + str(clf) + ": " + str(average(accuracies)))
        print ("Average F1-score:" + str(clf) + ": " + str(average(f1_scores)))
        
fit_predict(X, Y)

Average accuracy:MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True): 0.465056759546
Average F1-score:MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True): 0.447756164061
Average accuracy:Ordinal Classifier(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)): 0.467997936017
Average F1-score:Ordinal Classifier(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)): 0.455379722972
Average accuracy:AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None): 0.430925352597
Average F1-score:AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None): 0.38255546006


  'precision', 'predicted', average, warn_for)


Average accuracy:Ordinal Classifier(AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)): 0.40385620915
Average F1-score:Ordinal Classifier(AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)): 0.268101618359
Average accuracy:DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'): 0.402091503268
Average F1-score:DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2

### Get features(char n-grams)

In [11]:
X, Y = get_XY_char_ngrams(tweets)
X = add_features(X, tweets, EMOTION)
X = VarianceThreshold().fit_transform(X)
Y = [int(y) for y in Y]

In [12]:
fit_predict(X, Y)

Average accuracy:MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True): 0.378582731338
Average F1-score:MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True): 0.36756271347
Average accuracy:Ordinal Classifier(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)): 0.389748882009
Average F1-score:Ordinal Classifier(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)): 0.37344277674
Average accuracy:AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None): 0.411517027864
Average F1-score:AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None): 0.384787282027
Average accuracy:Ordinal Classifier(AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)): 0.382707258342
Average F1-score:Ordinal Classifier(AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
