In [1]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from nltk.corpus import stopwords as sw
from sklearn.metrics import accuracy_score
from sklearn.cross_validation import cross_val_score
from pandas import DataFrame

stopwords = sw.words('german')

# gensim's LineSentence generator replaces umlauts with u, a or o, so the
# umlaut-free variant of every stop word is added to the stop word list too.
#
# The loop runs over a snapshot of the list: appending to a list while
# iterating over it makes the loop visit the freshly appended entries as well.
# A companion set keeps the "already known?" check O(1) instead of rescanning
# the growing list for every word. The resulting list (content and order) is
# the same as with the naive loop.
known_stopwords = set(stopwords)
for stopword in list(stopwords):
    ascii_variant = (stopword
                     .replace(u'ü', 'u')
                     .replace(u'ö', 'o')
                     .replace(u'ä', 'a'))
    if ascii_variant not in known_stopwords:
        known_stopwords.add(ascii_variant)
        stopwords.append(ascii_variant)



In [2]:
# Article categories; each category has its own training and validation file.
category_names = [
    'Sonstiges', 'Lifestyle', 'Wirtschaft',
    'Finanzen', 'Lokal', 'Politik',
    'Sport', 'Technologie', 'Kultur',
]

num_models = len(category_names)

# The split corpora as (category, file path) pairs, in category_names order.
train_paths = list(zip(
    category_names,
    map("data/corpus{}.training.txt".format, category_names),
))
validation_paths = list(zip(
    category_names,
    map("data/corpus{}.validation.txt".format, category_names),
))

In [3]:
def load_sets(paths, stop_words=None):
    """Load tokenised articles and their labels from a list of corpus files.

    Every line of every file is treated as one article: it is split on
    whitespace, stop words are dropped, and the remaining tokens form one
    sample. Lines that contain no tokens after filtering are skipped.

    Parameters
    ----------
    paths : iterable of (label, path) tuples
        ``label`` is stored as the target of every article read from ``path``.
    stop_words : iterable of str, optional
        Tokens to discard. Defaults to the module-level ``stopwords`` list.

    Returns
    -------
    (X, y) : tuple of lists
        ``X`` holds one token list per article and ``y`` the matching label,
        so ``len(X) == len(y)``.

    Side effects
    ------------
    Prints the number of loaded articles.
    """
    if stop_words is None:
        stop_words = stopwords
    # Build the lookup once: a set gives O(1) membership tests, whereas
    # testing every token against a list rescans the list each time.
    excluded = set(stop_words)

    X, y = [], []

    for name, path in paths:
        with open(path) as cur_file:
            for line in cur_file:
                tokens = [token for token in line.split() if token not in excluded]
                if tokens:  # skip articles that are empty after filtering
                    X.append(tokens)
                    y.append(name)
    print("loaded {} articles".format(len(X)))
    return X, y

In [4]:
# Load the training split: train_X holds one stop-word-filtered token list per
# article, train_y the category name of the file each article was read from.
train_X, train_y = load_sets(train_paths)



loaded 26689 articles


In [5]:
# Fixed seed so the random forests (and therefore their scores) are
# reproducible across "Restart & Run All".
RANDOM_SEED = 42


def _identity_analyzer(tokens):
    """Return the document unchanged; the articles are already token lists.

    A named function is used instead of a lambda so that the fitted pipelines
    can be pickled.
    """
    return tokens


def _count_pipeline(step_name, classifier):
    """Build a pipeline: raw token counts followed by ``classifier``."""
    return Pipeline([
        ("count_vectorizer", CountVectorizer(analyzer=_identity_analyzer)),
        (step_name, classifier),
    ])


def _tfidf_pipeline(step_name, classifier):
    """Build a pipeline: tf-idf weighted token counts followed by ``classifier``."""
    return Pipeline([
        ("tfidf_vectorizer", TfidfVectorizer(analyzer=_identity_analyzer)),
        (step_name, classifier),
    ])


# Naive Bayes baselines.
# NOTE(review): BernoulliNB binarizes its input (binarize=0.0 by default), so
# the tf-idf weights are presumably reduced to presence/absence and
# bern_nb_tfidf should behave like bern_nb -- confirm against the scores.
mult_nb = _count_pipeline("multinomial nb", MultinomialNB())
bern_nb = _count_pipeline("bernoulli nb", BernoulliNB())
mult_nb_tfidf = _tfidf_pipeline("multinomial nb", MultinomialNB())
bern_nb_tfidf = _tfidf_pipeline("bernoulli nb", BernoulliNB())

# SVM - which is supposed to be more or less state of the art
# http://www.cs.cornell.edu/people/tj/publications/joachims_98a.pdf
svc = _count_pipeline("linear svc", SVC(kernel="linear"))
svc_tfidf = _tfidf_pipeline("linear svc", SVC(kernel="linear"))

# Random forest classifiers, since they are also used while validating the
# word2doc classification strategy. The step name keeps its original spelling
# ("random forrest") so that lookups by step name keep working.
rf = _count_pipeline(
    "random forrest",
    RandomForestClassifier(n_estimators=200, random_state=RANDOM_SEED),
)
rf_tfidf = _tfidf_pipeline(
    "random forrest",
    RandomForestClassifier(n_estimators=200, random_state=RANDOM_SEED),
)

In [6]:
# Registry of (label, pipeline) pairs, grouped by classifier family.
# The order here is the order in which the models are trained and reported.
all_models = [
    ("mult_nb", mult_nb), ("mult_nb_tfidf", mult_nb_tfidf),
    ("bern_nb", bern_nb), ("bern_nb_tfidf", bern_nb_tfidf),
    ("svc", svc), ("svc_tfidf", svc_tfidf),
    ("rf", rf), ("rf_tfidf", rf_tfidf),
]

# Fit every pipeline on the training split, announcing each one first.
for label, pipeline in all_models:
    print("now training {}".format(label))
    pipeline.fit(train_X, train_y)
  
# Cross validation is deliberately not used: the other models are validated
# only on the single train/validation split, so scoring on that same split
# keeps the results comparable.
# The cross validation code, for reference:
#scores = sorted([[name, cross_val_score(model, X, y, cv=5).mean()] 
#                 for name, model in all_models], 
#                key=lambda (_, x): -x)

now training mult_nb
now training mult_nb_tfidf
now training bern_nb
now training bern_nb_tfidf
now training svc
now training svc_tfidf
now training rf
now training rf_tfidf


In [7]:
# Score every trained pipeline on the held-out validation split.
validate_X, validate_y = load_sets(validation_paths)

# Collects one (label, accuracy) pair per model, in all_models order.
scores = []
for label, pipeline in all_models:
    print("predicting using {}".format(label))
    predicted_y = pipeline.predict(validate_X)
    # normalize=True: fraction of correctly classified articles, not a count
    scores.append((label, accuracy_score(validate_y, predicted_y, normalize=True)))



loaded 2959 articles
predicting using mult_nb
predicting using mult_nb_tfidf
predicting using bern_nb
predicting using bern_nb_tfidf
predicting using svc
predicting using svc_tfidf
predicting using rf
predicting using rf_tfidf


In [8]:
# Tabulate the collected (model, accuracy) pairs, one row per model,
# using the default integer index.
result = DataFrame(data=scores, index=None, columns=("model", "score"))
print(result)


           model     score
0        mult_nb  0.732004
1  mult_nb_tfidf  0.492396
2        bern_nb  0.625887
3  bern_nb_tfidf  0.625887
4            svc  0.743156
5      svc_tfidf  0.784049
6             rf  0.720514
7       rf_tfidf  0.718824
