In [1]:
from mord import *
from numpy import average
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import KFold
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

# Tweet object

In [2]:
class Tweet(object):
    def __init__(self, message, res):
        self.message = message
        self.res = res

    def __str__(self):
        return str(self.message) + " " + str(self.res)


def get_tweet(str_tweet, res_acc=1):
    num, message, common_class, res = str_tweet.split('\t')
    return Tweet(message, float(res[0:res_acc]))


def get_tweets(str_tweets, res_acc=1):
    return [get_tweet(line, res_acc) for line in str_tweets.split('\n') if len(line) > 0]


# Features

In [3]:
def get_XY_char_ngrams(tweets, max_features=800):
    vectorizer = CountVectorizer(analyzer='char_wb', max_features=max_features, ngram_range=(3, 8))
    X = vectorizer.fit_transform([t.message for t in tweets]).toarray()
    Y = [int(t.res) for t in tweets]
    return X, Y

def get_XY_word_ngrams(tweets, max_features=800):
    vectorizer = CountVectorizer(max_features=max_features)
    X = vectorizer.fit_transform([t.message for t in tweets]).toarray()
    Y = [int(t.res) for t in tweets]
    return X, Y


In [4]:
from math import ceil

from features.nrc_lexicon import get_lexicon


def count_caps(tweet):
    caps = 0
    for word in tweet.message.split():
        caps += (len(word) > 2) & (word.isupper())
    return caps


def count_symbol(tweet, symbol):
    return tweet.message.count(symbol)


def starts_with_vowel(tweet):
    return tweet.message[0] in 'AaIiEeUuOo'


def count_intensity(tweet, emotion):
    lex = get_lexicon()
    intensity = 0.0
    for word in tweet.message.split():
        intensity += lex.get(emotion + '---' + word, 0.0)
    return ceil(intensity)


### Get tweets

In [5]:
EMOTION = 'anger'
FILENAME = '/home/vanyadeg/Desktop/diploma/data/EIoc/EI-oc-En-' + EMOTION + '-train.txt'

file = open(FILENAME, 'r')
tweets = get_tweets(file.read())

### Get additional tweets

In [6]:
ADD_FILENAME = '/home/vanyadeg/dev/ifmo-software-design-hw/SELab2/new_tweets.txt'

file = open(ADD_FILENAME, 'r')
add_tweets = get_tweets(file.read())

### Remove hashtags

In [7]:
def remove_hashtags(tweets):
    for tweet in tweets:
        tweet.message.replace('#', '')

remove_hashtags(tweets)
remove_hashtags(add_tweets)

In [8]:
def add_features(X, tweets, emotion):
    X_list = X.tolist()
    for x, tweet in zip(X_list, tweets):
        x.append(count_caps(tweet))
        x.append(count_symbol(tweet, '!'))
        x.append(count_intensity(tweet, emotion))
    return np.array(X_list)

## Ordinal Classifier

In [9]:
def get_side(y, border):
    return int(y > border)


class OrdinalClassifier(object):
    def __init__(self, clf_supplier):
        self.clf_supplier = clf_supplier
        
        
    def fit(self, X, Y):
        X, Y = list(X), list(Y)
        self.classifiers = [self.get_classifier(X, Y, b) for b in [0, 1, 2]]
        
    
    def predict(self, X):
        X = list(X)
        return [self.predict_one(x) for x in X]
    
    
    def predict_one(self, x):
        results = [clf.predict_proba([x]).tolist()[0] for clf in self.classifiers]
        probs = [1 - results[0][1]]
        for idx in range(len(results) - 1):
            probs.append(results[idx][1] - results[idx + 1][1])
        probs.append(results[-1][1])
        answer = probs.index(max(probs))
        return answer
    
    
    def get_classifier(self, X, Y, border):
        border_y = np.array([get_side(y, border) for y in Y])
        clf = self.clf_supplier()
        clf.fit(X, border_y)
        return clf
        
    
    def __str__(self):
        return 'Ordinal Classifier(' + str(self.clf_supplier()) +')'

    
def filter_index(X, index):
    return [X[i] for i in index]



def test_classifier(clf_supplier, train_X, train_Y, test_X, test_Y, addX, addY, metrics):
    clf = clf_supplier()
    trX = np.concatenate((np.array(train_X), np.array(addX)))
    trY = np.concatenate((np.array(train_Y), np.array(addY)))
    clf.fit(trX, trY)
    predicted = clf.predict(np.array(test_X))
    acc = accuracy_score(np.array(test_Y), predicted)
    f1 = f1_score(np.array(test_Y), predicted, average='macro')
    metrics.append((acc, f1))

## Classifiers

In [None]:
def get_clf_suppliers(): 
    classifiers = []
    regular_classifiers = [lambda: MultinomialNB(alpha=10**(-1))
                     , lambda: AdaBoostClassifier()
                     , lambda: DecisionTreeClassifier()]
    classifiers += regular_classifiers
    classifiers += [lambda suppl=clf_supplier: OrdinalClassifier(suppl) for clf_supplier in regular_classifiers]
    ordinal_classifiers = [lambda: LogisticSE(max_iter=10 ** 6), 
                lambda: LogisticIT(max_iter=10 ** 6), 
                lambda: LogisticAT(max_iter=10 ** 6)]
    classifiers += ordinal_classifiers
    return classifiers

In [None]:
def fit_predict(X, Y, addX, addY):
    f1_clf = {}
    kf = KFold(n_splits=10, shuffle=True)
    suppliers = get_clf_suppliers()
    for idx, suppl in enumerate(suppliers):
        metrics = []
        for train_index, test_index in kf.split(X):
            train_X = filter_index(X, train_index)
            train_Y = filter_index(Y, train_index)
            test_X = filter_index(X, test_index)
            test_Y = filter_index(Y, test_index)
            test_classifier(suppl, train_X, train_Y, test_X, test_Y, addX, addY, metrics)
        accuracies = [x[0] for x in metrics]
        f1_scores = [x[1] for x in metrics]
        f1_clf[str(suppl())] = average(f1_scores)
        print "{}/{}".format(idx + 1, len(suppliers))
    #for k, v in f1_clf.iteritems():
    #    print k, v
    best_clf = max(f1_clf, key=f1_clf.get)
    return (best_clf, f1_clf[best_clf])

RESULT_FORMAT = "Max F: {}, func: {}, best classifier: {}"

for max_f in range(500, 1000, 100):
    for feature_getter in [get_XY_char_ngrams, get_XY_word_ngrams]:
        X, Y = feature_getter(tweets, max_features=max_f)
        addX, addY = feature_getter(add_tweets, max_features=max_f)
        for x in [X, addX]:
            x = add_features(x, tweets, EMOTION)
        print RESULT_FORMAT.format(max_f, feature_getter.func_name, fit_predict(X, Y, addX, addY))

['MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)', "AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,\n          learning_rate=1.0, n_estimators=50, random_state=None)", "DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n            max_features=None, max_leaf_nodes=None,\n            min_impurity_decrease=0.0, min_impurity_split=None,\n            min_samples_leaf=1, min_samples_split=2,\n            min_weight_fraction_leaf=0.0, presort=False, random_state=None,\n            splitter='best')", 'Ordinal Classifier(MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True))', "Ordinal Classifier(AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,\n          learning_rate=1.0, n_estimators=50, random_state=None))", "Ordinal Classifier(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,\n            max_features=None, max_leaf_nodes=None,\n            min_impurity_decrease=0.0, min_impurity_split=None,\n  

  'precision', 'predicted', average, warn_for)


5/9
6/9
7/9
8/9
9/9
LogisticAT(alpha=1.0, max_iter=1000000, verbose=0) 0.284587356463
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None) 0.379934269044
LogisticSE(alpha=1.0, max_iter=1000000, verbose=0) 0.293484803793
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best') 0.370733170416
Ordinal Classifier(MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)) 0.224453047872
MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True) 0.256561416518
Ordinal Classifier(AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)) 0.246829992094
Ordinal Class