In [14]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

import numpy as np
from os import walk
from os import listdir
from os.path import isfile, join

import gensim

import random

In [2]:
def get_afinn(filename):
    """Load a tab-separated sentiment lexicon into a ``{term: score}`` dict.

    Every non-blank line is split on tab characters; the first field is used
    as the term and the second field is converted to ``int`` as its score.

    Parameters
    ----------
    filename : str
        Path of the lexicon file.

    Returns
    -------
    dict
        Maps each term to its integer score.  When a term occurs on several
        lines the last occurrence wins.

    Raises
    ------
    IndexError
        If a non-blank line contains no tab character.
    ValueError
        If the second field of a line is not an integer.
    """
    afinn = {}
    with open(filename) as w:
        for line in w:
            # Blank lines (typically a trailing empty line at the end of the
            # file) carry no entry; skip them instead of failing on the
            # missing second field.
            if not line.strip():
                continue
            fields = line.split('\t')
            afinn[fields[0]] = int(fields[1])
    return afinn

def line_sentiment(line, afinn):
    """Return the summed lexicon score of the whitespace-separated words in ``line``.

    The line is lower-cased before it is split; words missing from ``afinn``
    contribute 0.

    Parameters
    ----------
    line : str
        Text to score.
    afinn : dict
        Mapping from word to numeric score.

    Returns
    -------
    Sum of the scores of all words in ``line`` (0 for an empty line).
    """
    return sum(afinn.get(token, 0) for token in line.lower().split())

def get_data(path, path2):
    """Load the per-directory metadata columns and attach the full review texts.

    ``path`` is walked recursively and every directory found below it
    contributes one listing of its files.  Files at the same position of each
    (sorted) listing are treated as the same column: the column is named
    after the file in the first directory with its last dot-suffix removed,
    and the lines of the corresponding files are concatenated directory after
    directory.

    ``path2`` is walked for directories named ``txt.parag``.  Every file in
    such a directory is read into one string (its lines joined by a single
    space) and keyed by the file name up to its first dot.  These strings are
    then looked up with the values of the ``'id'`` column and stored, in the
    same order, under ``data['full']``.

    Directory and file listings are sorted so that the column alignment
    across directories and the row order of the result do not depend on the
    order in which the operating system happens to return directory entries.

    Parameters
    ----------
    path : str
        Root directory containing one sub-directory per group of metadata files.
    path2 : str
        Root directory below which the ``txt.parag`` review directories live.

    Returns
    -------
    dict
        Column name -> list of strings, plus the ``'full'`` column.  An id
        without a matching review file yields ``None`` in ``'full'``.

    Raises
    ------
    IndexError
        If ``path`` contains no sub-directory, or a directory holds fewer
        files than the first one.
    TypeError
        If no ``'id'`` column was found.
    """
    def _files_in(directory):
        # (full path, bare name) for every regular file, in sorted order.
        return [(join(directory, f), f)
                for f in sorted(listdir(directory))
                if isfile(join(directory, f))]

    def _read_lines(filename):
        # Lines of the file without their trailing newline.
        with open(filename) as f:
            return [line.split('\n')[0] for line in f]

    listings = []
    for root, dirs, files in walk(path):
        dirs.sort()  # in-place sort also fixes the order in which walk() descends
        for name in dirs:
            listings.append(_files_in(join(root, name)))

    data = {}
    for i in range(len(listings[0])):
        label = listings[0][i][1].rsplit('.', 1)[0]
        data[label] = []
        for listing in listings:
            data[label] += _read_lines(listing[i][0])

    reviews = {}
    for root, dirs, files in walk(path2):
        dirs.sort()
        for name in dirs:
            if name == 'txt.parag':
                for filename, basename in _files_in(join(root, name)):
                    reviews[basename.split('.')[0]] = ' '.join(_read_lines(filename))

    data['full'] = [reviews.get(review_id) for review_id in data.get('id')]
    return data

def split(data, count, ratio=0.8):
    """Randomly partition the first ``count`` rows of ``data`` into test and train sets.

    Row indices ``0 .. count-1`` are shuffled with the global ``random``
    generator; the first ``int(count * ratio)`` shuffled indices form the
    training partition and the remaining ones the test partition.  The same
    indices are applied to every column, so rows stay aligned across columns.

    Parameters
    ----------
    data : dict
        Column name -> indexable sequence with at least ``count`` entries.
    count : int
        Number of rows to distribute.
    ratio : float, optional
        Share of rows assigned to the training partition; expected to lie in
        ``[0, 1]``.

    Returns
    -------
    (test, train) : tuple of dict
        Note the order: the test partition comes first.  Both dicts contain
        every key of ``data``; a partition that receives no rows maps each
        key to an empty list.
    """
    indices = list(range(count))
    random.shuffle(indices)
    train_size = int(count * ratio)

    train_rows = indices[:train_size]
    # Slice from train_size instead of using a negative offset: with an empty
    # test share, indices[-0:] would select every row rather than none.
    test_rows = indices[train_size:]

    train = {key: [data[key][i] for i in train_rows] for key in data}
    test = {key: [data[key][i] for i in test_rows] for key in data}
    return test, train

In [3]:
# Load the lexicon and keep its terms and scores as two index-aligned
# sequences: afinn_weights[i] is the score of afinn_keys[i].
afinn = get_afinn("AFINN-111.txt")
afinn_keys = list(afinn)
afinn_weights = np.array([afinn[term] for term in afinn_keys])

In [4]:
# Read the metadata columns and the full review texts from the local data
# folders; get_data() also adds a 'full' column holding the review text.
data = get_data('scaledata//scaledata//', 'scaledata//scale_whole_review//')
# Columns the author lists for `data` (presumably one per metadata file type):
#  id, label.3class, label.4class, rating, subj

In [5]:
# Seed the global random generators before shuffling so that the train/test
# partition produced by split() (which uses random.shuffle) is the same on
# every run.  NumPy's global generator is seeded too, for library code that
# draws from it when no explicit random state is given.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# split() returns the test partition first.
test, train = split(data, len(data['id']))

In [6]:
def preprocess(data, label):
    """Normalise the documents and convert the labels to integers.

    Each document is lower-cased and tokenised with ``word_tokenize``; tokens
    found in NLTK's English stop-word list are dropped, the remaining tokens
    are lemmatised with ``WordNetLemmatizer`` and re-joined with single
    spaces.

    Parameters
    ----------
    data : iterable of str
        Raw documents.
    label : iterable
        One label per document; each is passed through ``int``.

    Returns
    -------
    (all_words, x, y) : tuple
        ``all_words`` is the flat list of every whitespace-separated word of
        the cleaned documents, ``x`` the list of cleaned documents and ``y``
        the list of integer labels.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))

    x = []
    for document in data:
        kept = [lemmatizer.lemmatize(token)
                for token in word_tokenize(document.lower())
                if token not in stop_words]
        x.append(' '.join(kept))

    y = [int(value) for value in label]
    all_words = [word for document in x for word in document.split()]
    return all_words, x, y

def featurecount(data, vocabulary=None):
    """Build a boolean presence feature dict for one document.

    Parameters
    ----------
    data : str
        Document text; it is split on whitespace.
    vocabulary : iterable of str, optional
        Terms to test for.  When omitted, the module-level ``features`` list
        is used, which must have been defined before the call.

    Returns
    -------
    dict
        Maps every vocabulary term to ``True`` if it occurs as a word of
        ``data`` and to ``False`` otherwise.

    Raises
    ------
    NameError
        If ``vocabulary`` is omitted and no global ``features`` exists yet.
    """
    if vocabulary is None:
        # Backward-compatible default: fall back to the notebook-level list.
        vocabulary = features
    tokens = set(data.split())
    return {term: term in tokens for term in vocabulary}

In [7]:
# Clean the review texts of both partitions and use the 'label.3class' column
# (converted to int by preprocess) as the classification target.
all_words_train, x_train, y_train = preprocess(train['full'], train['label.3class'])
all_words_test, x_test, y_test = preprocess(test['full'], test['label.3class'])

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Count only the lexicon terms: with a fixed vocabulary, column i of the
# resulting matrices corresponds to afinn_keys[i].
CV = CountVectorizer(vocabulary=afinn_keys, analyzer='word')

# Weight every column by the score of its term with a single broadcast
# element-wise product.  This replaces a Python loop that assigned into the
# sparse matrices one column at a time, which is slow for sparse formats.
afinn_weight_row = afinn_weights.reshape(1, -1)
x_train_vec_cv = CV.fit_transform(x_train).multiply(afinn_weight_row).tocsr()
x_test_vec_cv = CV.transform(x_test).multiply(afinn_weight_row).tocsr()

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC

print('AFINN - True, TFIDF - False')

# Convert the sparse feature matrices to dense arrays once instead of once
# per fit/score call.
x_train_dense_cv = x_train_vec_cv.toarray()
x_test_dense_cv = x_test_vec_cv.toarray()

forest = RandomForestClassifier()
svc = SVC()
lsvc = LinearSVC()
lr = LogisticRegression()

# Fit and score the models in a fixed order; each line printed is the
# model's accuracy on the held-out test partition.
for title, model in (
    ('Random forest classifier:', forest),
    ('Support Vector classifier:', svc),
    ('Linear Support Vector classifier:', lsvc),
    ('Logistic Regression classifier:', lr),
):
    model.fit(x_train_dense_cv, y_train)
    print(title, model.score(x_test_dense_cv, y_test))

AFINN - True, TFIDF - False
Random forest classifier: 0.501996007984
Support Vector classifier: 0.581836327345
Linear Support Vector classifier: 0.532934131737
Logistic Regression classifier: 0.559880239521


In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): RandomForestRegressor is not referenced anywhere in the
# visible part of this notebook.
from sklearn.ensemble import RandomForestRegressor

# TF-IDF features over word tokens.  With float thresholds scikit-learn
# interprets min_df/max_df as document-frequency proportions, so terms in
# fewer than 5% or more than 90% of the training documents are discarded.
TV = TfidfVectorizer(analyzer='word', min_df=0.05, max_df=0.9)
# Learn vocabulary and IDF weights from the training texts only, then apply
# the fitted transformer to the test texts.
x_train_vec_tv = TV.fit_transform(x_train)
x_test_vec_tv = TV.transform(x_test)

In [32]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC

print('AFINN - False, TFIDF - True')

# Convert the sparse feature matrices to dense arrays once instead of once
# per fit/score call.
x_train_dense_tv = x_train_vec_tv.toarray()
x_test_dense_tv = x_test_vec_tv.toarray()

forest = RandomForestClassifier()
svc = SVC()
lsvc = LinearSVC()
lr = LogisticRegression()

# Fit and score the models in a fixed order; each line printed is the
# model's accuracy on the held-out test partition.
for title, model in (
    ('Random forest classifier:', forest),
    ('C-Support Vector classifier:', svc),
    ('Linear Support Vector classifier:', lsvc),
    ('Logistic Regression classifier:', lr),
):
    model.fit(x_train_dense_tv, y_train)
    print(title, model.score(x_test_dense_tv, y_test))

AFINN - False, TFIDF - True
Random forest classifier: 0.482035928144
C-Support Vector classifier: 0.37624750499
Linear Support Vector classifier: 0.631736526946
Logistic Regression classifier: 0.62375249501


In [33]:
from sklearn.feature_extraction.text import CountVectorizer

# Plain term counts with a vocabulary learned from the training texts, using
# the same document-frequency thresholds as the TF-IDF features.
# NOTE(review): this rebinds `CV`, which previously held the vectorizer with
# the fixed AFINN vocabulary.
CV = CountVectorizer(analyzer='word', min_df=0.05, max_df=0.9)
x_train_vec_cv2 = CV.fit_transform(x_train)
x_test_vec_cv2 = CV.transform(x_test)

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC

print('AFINN - False, TFIDF - False')

# Convert the sparse feature matrices to dense arrays once instead of once
# per fit/score call.
x_train_dense_cv2 = x_train_vec_cv2.toarray()
x_test_dense_cv2 = x_test_vec_cv2.toarray()

forest = RandomForestClassifier()
svc = SVC()
lsvc = LinearSVC()
lr = LogisticRegression()

# Fit and score the models in a fixed order; each line printed is the
# model's accuracy on the held-out test partition.
for title, model in (
    ('Random forest classifier:', forest),
    ('Support Vector classifier:', svc),
    ('Linear Support Vector classifier:', lsvc),
    ('Logistic Regression classifier:', lr),
):
    model.fit(x_train_dense_cv2, y_train)
    print(title, model.score(x_test_dense_cv2, y_test))

AFINN - False, TFIDF - False
Random forest classifier: 0.494011976048
Support Vector classifier: 0.634730538922
Linear Support Vector classifier: 0.588822355289
Logistic Regression classifier: 0.597804391218


In [12]:
# Vocabulary for the boolean presence features: the (at most) 5000 most
# frequent words of the cleaned training texts, most frequent first.
frequent_words = nltk.FreqDist(all_words_train)
features = [word for word, _ in frequent_words.most_common(5000)]

In [30]:
from nltk import NaiveBayesClassifier
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB

# NLTK classifiers consume (feature dict, label) pairs, one per document.
train_set = [(featurecount(document), target) for document, target in zip(x_train, y_train)]
test_set = [(featurecount(document), target) for document, target in zip(x_test, y_test)]

# NLTK's own Naive Bayes implementation.
NBC = NaiveBayesClassifier.train(train_set)
print('NaiveBayesClassifier score:', nltk.classify.accuracy(NBC, test_set))

# scikit-learn's multinomial Naive Bayes behind NLTK's classifier wrapper.
MNB = SklearnClassifier(MultinomialNB())
MNB.train(train_set)
print('MultinomialNB score:', nltk.classify.accuracy(MNB, test_set))

NaiveBayesClassifier score: 0.6127744510978044
MultinomialNB score: 0.607784431138


In [52]:
def voting(x_train, y_train, x_test, y_test, all_words_train, afinn_keys, afinn_weights):
    """Train seven classifiers and return each one's predictions for ``x_test``.

    Three feature representations are built from the cleaned texts:

    * TF-IDF weights (``min_df=0.05``, ``max_df=0.8``),
    * counts of the lexicon terms, each column multiplied by its weight,
    * boolean presence of the 5000 most frequent training words.

    Parameters
    ----------
    x_train, x_test : list of str
        Cleaned training / test documents.
    y_train : list of int
        Training labels, aligned with ``x_train``.
    y_test : list of int
        Not used for training or prediction; kept so existing calls continue
        to work.
    all_words_train : list of str
        Flat list of all words of the training documents.
    afinn_keys : list of str
        Lexicon terms used as the fixed count vocabulary.
    afinn_weights : array-like
        Weight of each lexicon term, aligned with ``afinn_keys``.

    Returns
    -------
    list
        Seven prediction sequences over ``x_test``, in this order: LinearSVC
        on weighted counts, LinearSVC on TF-IDF, LogisticRegression on
        TF-IDF, LogisticRegression on weighted counts, SVC on weighted
        counts, MultinomialNB on presence features, NLTK NaiveBayes on
        presence features.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import LinearSVC
    from sklearn.svm import SVC
    from nltk import NaiveBayesClassifier
    from nltk.classify.scikitlearn import SklearnClassifier
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.feature_extraction.text import CountVectorizer

    # Sklearn classifiers
    lr_tv = LogisticRegression()
    lr_cv = LogisticRegression()
    svc = SVC()
    lsvc_cv = LinearSVC()
    lsvc_tv = LinearSVC()

    # Vectorization.  Every matrix is converted to a dense array exactly once.
    TV = TfidfVectorizer(analyzer='word', min_df=0.05, max_df=0.8)
    x_train_tv = TV.fit_transform(x_train).toarray()
    x_test_tv = TV.transform(x_test).toarray()

    CV = CountVectorizer(vocabulary=afinn_keys, analyzer='word')
    # One broadcast product scales column i by afinn_weights[i]; this replaces
    # a per-column assignment loop on the sparse matrices.
    weight_row = np.asarray(afinn_weights).reshape(1, -1)
    x_train_cv = CV.fit_transform(x_train).multiply(weight_row).toarray()
    x_test_cv = CV.transform(x_test).multiply(weight_row).toarray()

    # Training (order kept fixed so any shared random stream is consumed identically)
    lsvc_cv.fit(x_train_cv, y_train)
    lsvc_tv.fit(x_train_tv, y_train)
    lr_tv.fit(x_train_tv, y_train)
    lr_cv.fit(x_train_cv, y_train)
    svc.fit(x_train_cv, y_train)

    # Naive Bayes training
    frequent_words = nltk.FreqDist(all_words_train)
    features = list(sorted(frequent_words, key=frequent_words.get, reverse=True))[:5000]

    def _presence(document):
        # Boolean presence features over the vocabulary computed just above.
        # Built locally so the result never depends on a notebook-level
        # `features` variable left over from another cell.
        tokens = set(document.split())
        return {term: term in tokens for term in features}

    train_set = [(_presence(document), target) for document, target in zip(x_train, y_train)]
    test_featuresets = [_presence(document) for document in x_test]

    NBC = NaiveBayesClassifier.train(train_set)
    MNB = SklearnClassifier(MultinomialNB())
    MNB.train(train_set)

    # Vote generating
    lst = []
    lst.append(lsvc_cv.predict(x_test_cv))
    lst.append(lsvc_tv.predict(x_test_tv))
    lst.append(lr_tv.predict(x_test_tv))
    lst.append(lr_cv.predict(x_test_cv))
    lst.append(svc.predict(x_test_cv))
    lst.append([MNB.classify(featureset) for featureset in test_featuresets])
    lst.append([NBC.classify(featureset) for featureset in test_featuresets])
    return lst

In [53]:
# `final` holds one sequence of test-set predictions per classifier trained
# inside voting().
final = voting(x_train, y_train, x_test, y_test, all_words_train, afinn_keys, afinn_weights)

In [119]:
def most_common(lst):
    """Return the element that occurs most often in ``lst``.

    Ties are resolved in favour of the element that appears first in
    ``lst``, so the result is deterministic.  Counting is done in a single
    pass.

    Parameters
    ----------
    lst : iterable of hashable
        Values to count.

    Returns
    -------
    The most frequent element.

    Raises
    ------
    ValueError
        If ``lst`` is empty.
    """
    from collections import Counter

    counts = Counter(lst)
    # Counter keeps first-seen order and max() returns the first maximal key,
    # which gives the documented tie-breaking rule.
    return max(counts, key=counts.get)

def vote_count(final):
    """Combine per-classifier predictions into one label per sample.

    For every sample index of the first prediction sequence, the labels
    predicted by all classifiers are averaged and the mean is rounded to the
    nearest integer.  A small offset (0.00001) is subtracted before rounding,
    so a mean lying exactly on a half rounds down.

    Parameters
    ----------
    final : sequence of sequences
        One sequence of numeric predictions per classifier; each must be at
        least as long as the first one.

    Returns
    -------
    list of int
        The combined label for every sample.
    """
    voters = len(final)
    combined = []
    for sample in range(len(final[0])):
        ballots = [predictions[sample] for predictions in final]
        mean_vote = sum(ballots) / voters
        combined.append(int(round(mean_vote - 0.00001)))
    return combined

def voting_score(y_test, final):
    """Return the accuracy of the combined vote against ``y_test``.

    NOTE(review): a function with the same name is defined again in the next
    cell (In [120]) and returns an (accuracy, F1) pair; once that cell has
    run, it replaces this definition.
    """
    from sklearn.metrics import accuracy_score

    return accuracy_score(y_test, vote_count(final))

In [120]:
def voting_score(y_test, final):
    """Score the combined vote against the true test labels.

    Parameters
    ----------
    y_test : sequence of int
        True labels.
    final : sequence of sequences
        Per-classifier predictions, combined with ``vote_count``.

    Returns
    -------
    (accuracy, f1) : tuple
        Accuracy and weighted-average F1 score of the combined predictions.
    """
    from sklearn.metrics import accuracy_score, f1_score

    predicted = vote_count(final)
    accuracy = accuracy_score(y_test, predicted)
    weighted_f1 = f1_score(y_test, predicted, average='weighted')
    return accuracy, weighted_f1

In [121]:
# Accuracy and weighted F1 of the combined vote on the test partition
# (uses the two-value voting_score defined in the cell above).
accu, f1 = voting_score(y_test, final)

In [122]:
# Accuracy of the combined vote, five decimal places.
print('{:.5f}'.format(accu))

0.65569


In [123]:
# Weighted F1 score of the combined vote, five decimal places.
print('{:.5f}'.format(f1))

0.65508


In [96]:
# Combined label per test sample, kept for the class-distribution check below.
pom = vote_count(final)

In [128]:
# Number of samples per class (0, 1, 2): first for the combined predictions,
# then for the true test labels.
print(pom.count(0), pom.count(1), pom.count(2))
print(y_test.count(0), y_test.count(1), y_test.count(2))
print()
print('Slight bias against negative sentiment can be seen. This could be expected, due to an imbalance in the data set as a whole.')
print()
print('Also, due to the nature of the voting system implemented, neutral values have a large bias.')
print('We could probably get more meaningful results by implementing a confidence measure,')
print('which would allow us to asses towards which sentiment the neutral documents lean.')

158 505 339
247 377 378

Slight bias against negative sentiment can be seen. This could be expected, due to an imbalance in the data set as a whole.

Also, due to the nature of the voting system implemented, neutral values have a large bias.
We could probably get more meaningful results by implementing a confidence measure,
which would allow us to asses towards which sentiment the neutral documents lean.
