In [1]:
import csv
import collections

from argparse import ArgumentParser
from sklearn.model_selection import train_test_split
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os

In [2]:
# Read the full dataset: every CSV row is expected to carry the label in
# column 0 and the article text in column 1.
labels = []
texts = []

# newline='' is what the csv module asks for, so that line breaks inside
# quoted fields are handled by the reader itself
with open("../articles.csv", "r", encoding='utf-8', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=';', quotechar='\'')
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; there is nothing to index
            continue
        labels.append(row[0])
        texts.append(row[1])

tr_Df = pd.DataFrame({"Text": texts, "Label": labels})

features_path = "features"
os.makedirs(features_path, exist_ok=True)
# header=False is explicit on purpose: the classification cells read this file
# back with header=None, so a header line would be parsed as one more sample.
tr_Df.Label.to_csv(features_path+os.sep+"y_labels.out", header=False)

# split dataset
#X_train, X_test, y_train, y_test = train_test_split( tr_Df['Text'], tr_Df['Label'], test_size=SPLIT, random_state=42,stratify=tr_Df['Label'])

# write train and test datasets
#train=pd.DataFrame(pd.concat([X_train,y_train],axis=1))
#test=pd.DataFrame(pd.concat([X_test,y_test],axis=1))


In [4]:
#train

Unnamed: 0,Text,Label
9321,Die Energiewirtschaft hat ihre Strategie bis 2...,Wirtschaft
5703,"Östereich siegte in Podgorica zuerst gegen 12,...",Sport
8630,Sagis Vertreter Phillip Burns und Barry Gilber...,Wirtschaft
44,Keine offizielle Bestätigung über Verhandlungs...,Etat
537,"Roland Düringer in Autorevue TV, vom Leiden de...",Etat
...,...,...
2184,Auch Einreise- und Vermögenssperren gegen Luka...,International
5236,In der chinesischen Hauptstadt fahren Österrei...,Panorama
220,"Strache will weltoffen sein, atmete man in dem...",Etat
5186,25-Jähriger wollte TV-Sender mit Sprengstoffgü...,Panorama


## Fasttext

In [4]:
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_multiple_whitespaces

#### Download fasttext model 

From: https://fasttext.cc/docs/en/aligned-vectors.html

Download the German aligned vectors (file name: `wiki.de.align.vec`)
and place the file in the `Notebooks/models/` folder.



In [5]:
# Load the downloaded aligned German vectors (word2vec format) into memory.
embeddings = KeyedVectors.load_word2vec_format(
    './models/wiki.de.align.vec',
)

In [6]:
# Tokens that tokenize() drops: the German definite-article forms.
stopwords = [
    "der",
    "die",
    "das",
    "den",
    "des",
    "dem",
]

In [7]:
def tokenize(text):
    """Tokenizes the provided text
    Args:
        text (str): The text to be tokenized
    Returns:
        list(str): The tokens produced by gensim's ``preprocess_string`` after
            lowercasing, punctuation stripping and whitespace collapsing,
            with every token listed in ``stopwords`` removed. Repeated
            tokens are kept; no counts are computed.
    """

    # make everything lowercase and strip punctuation
    CUSTOM_FILTERS = [lambda x: x.lower(), strip_punctuation, strip_multiple_whitespaces]
    tokens = preprocess_string(text, CUSTOM_FILTERS)

    # drop the stopwords and hand back the remaining tokens
    return [w for w in tokens if w not in stopwords]


In [8]:
def text_embedding(text):
    """Create the text embedding as the mean of its token vectors
    Args:
        text (str): The text to be embedded; may be None
    Returns:
        numpy.ndarray: float32 vector of length ``embeddings.vector_size``.
            It is all zeros when ``text`` is None or when none of its tokens
            has a vector in ``embeddings``.
    """

    # prepare the embedding placeholder
    embedding = np.zeros(embeddings.vector_size, dtype=np.float32)

    if text is None:
        return embedding

    # sum the vectors of all known tokens and remember how many were found
    count = 0
    for token in tokenize(text):
        # `in` on the KeyedVectors object is the version-independent vocabulary
        # check (the `.vocab` attribute no longer exists in gensim 4)
        if token in embeddings:
            embedding += embeddings[token]
            count += 1

    if count == 0:
        # nothing matched: keep the zero vector, as an ndarray like every
        # other return path so callers always get the same type
        return embedding

    # average the embedding
    return embedding / count

### Calculate Fasttext embeddings

In [9]:
# Embed every text in tr_Df (in index order) and stack the resulting vectors
# along a new first axis, giving one row per text.
sentences_embeddings = np.stack(
    [text_embedding(tr_Df.Text.loc[row_label]) for row_label in tr_Df.index.values],
    axis=0,
)


In [29]:

# Persist the embedding matrix as comma-separated text so the classification
# cell can reload it without recomputing the embeddings.
# makedirs(exist_ok=True) replaces the separate exists()/mkdir() pair.
os.makedirs(features_path, exist_ok=True)
np.savetxt(features_path+os.sep+"Fasttext_embeddings.out", sentences_embeddings, delimiter=",")

In [11]:
# Labels column of the in-memory dataset.
y_labels = tr_Df["Label"]

### Classification using FastText features

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Stratified k-fold evaluation of several classifiers on the saved fastText
# embeddings. For every classifier the summed confusion matrix and the mean
# macro-F1 / accuracy over the folds are written to `path`.
TEST_SPLIT = .1
N_FOLDS=10
path="results_with_scaling_default"
os.makedirs(path, exist_ok=True)
settings="fasttext"

# reload labels and features from the files written by the earlier cells
y_labels=pd.read_csv(features_path+os.sep+"y_labels.out", index_col=0, header=None)
sentences_embeddings = np.genfromtxt(features_path+os.sep+"Fasttext_embeddings.out", delimiter=',')

# 1-D label array: the estimators and StratifiedKFold expect a flat target
y_values=y_labels.iloc[:,0].values
if len(y_values) != len(sentences_embeddings):
    raise ValueError(
        "label/feature count mismatch: {} labels vs {} embeddings".format(
            len(y_values), len(sentences_embeddings)))

labels=y_labels.iloc[:,0].unique()
kf=StratifiedKFold(n_splits=N_FOLDS)

# classifier name -> factory; a fresh estimator is built for every fold
classifier_factories = {
    "logreg": lambda: LogisticRegression(random_state=1),
    "rf": lambda: RandomForestClassifier(random_state=1),
    "gsvm": lambda: SVC(kernel='rbf', random_state=123456),
    "knn": lambda: KNeighborsClassifier(),
    "svm": lambda: SVC(kernel='linear', random_state=123456),
}

for c in ["knn", "gsvm"]:#,"logreg"]:
    if c not in classifier_factories:
        raise ValueError("unknown classifier: {}".format(c))
    kf=StratifiedKFold(n_splits=N_FOLDS)
    # square matrix sized by the labels actually present in the data
    conf_mat=np.zeros((len(labels), len(labels)))
    f1s=0
    accs=0
    for tr_ind, te_ind in kf.split(sentences_embeddings, y_values):
        y_tr_fold=y_values[tr_ind]
        y_te_fold=y_values[te_ind]

        # the scaler is fitted on the training part of the fold only
        scaler = MinMaxScaler()
        X_tr_fold=scaler.fit_transform(sentences_embeddings[tr_ind])
        X_te_fold=scaler.transform(sentences_embeddings[te_ind])

        clf=classifier_factories[c]()
        clf.fit(X_tr_fold,y_tr_fold)
        y_te_pred=clf.predict(X_te_fold)

        # scikit-learn metric signature is (y_true, y_pred)
        f1=f1_score(y_te_fold, y_te_pred, average='macro')
        print(f1)
        f1s+=f1
        accs+=accuracy_score(y_te_fold, y_te_pred)
        conf_mat+=confusion_matrix(y_te_fold,y_te_pred, labels=labels)

    np.savetxt(path+os.sep+"conf_mat_folds_"+settings+"_"+c+".out", conf_mat, delimiter=",")
    # averages are taken over the configured number of folds
    with open(path+os.sep+"f1s_folds_"+settings+"_"+c+".out",'w') as file:
        file.write('f1, {}\n'.format(f1s/N_FOLDS))
        file.write('acc, {}\n'.format(accs/N_FOLDS))

## Bag-of-words representation

In [5]:
import re
def micro_tokenize(txt):
    """Split text at whitespace into word tokens.

    Each whitespace-separated chunk has leading/trailing punctuation and
    quote characters stripped; the chunk is kept only if it still contains
    at least one ASCII or German letter.

    Args:
        txt (str): The text to split
    Returns:
        list(str): The kept tokens, in order of appearance
    """
    # strip surrounding punctuation from every whitespace-separated chunk
    stripped = (w.strip('.,!?:;"-+()„“”»«…\'`’*') for w in txt.split())
    # words need to contain at least one "regular" character
    # (the lowercase "ä" is part of the class alongside ö, ü and ß)
    return [w for w in stripped if re.search(r'[a-zäöüßA-ZÄÖÜ]', w)]

# Patterns used by normalize(); compiled once instead of on every call.
_URL_SCHEME_RE = re.compile(r'(?:ftp|http)s?://[\w\d:#@%/;$()~_?+=\,.&#!|-]+')
_URL_WWW_RE = re.compile(r'\bwww\.[a-zA-Z0-9-]{2,63}\.[\w\d:#@%/;$()~_?+=\,.&#!|-]+')
_URL_TLD_RE = re.compile(r'\b[a-zA-Z0-9.]+\.(?:com|org|net|io)')
# A run of one and the same symbol. re.escape keeps "[", "]" and "-" literal
# inside the character class (an unescaped "[" there raises a FutureWarning).
_REPEATED_SYMBOL_RE = re.compile('([%s])\\1+' % re.escape(',.!?:;#-_=+*/$@%<>&()[]'))
_PUNCTUATION_RE = re.compile(r'([.,!?:;/()\'"„“”»«`’…$%*])')
_WHITESPACE_RE = re.compile(r'\s+')


def normalize(txt):
    """Normalize raw text before tokenization.

    Steps, in order: lowercase; replace URLs with the literal token ``URL``;
    collapse runs of a repeated symbol to a single symbol; surround
    punctuation with spaces; trim and collapse whitespace.

    Args:
        txt (str): The raw text
    Returns:
        str: The normalized text
    """
    txt = txt.lower()

    # replace URLs (applied after lowercasing, so the inserted token stays "URL")
    for url_re in (_URL_SCHEME_RE, _URL_WWW_RE, _URL_TLD_RE):
        txt = url_re.sub('URL', txt)

    # remove repeated symbols
    txt = _REPEATED_SYMBOL_RE.sub(r'\1', txt)

    # separate punctuation
    txt = _PUNCTUATION_RE.sub(r' \1 ', txt)

    # remove leading, trailing and repeated whitespace
    return _WHITESPACE_RE.sub(' ', txt.strip())

### Classification using BoW

In [None]:
# Stratified k-fold evaluation of several classifiers on binary bag-of-words
# features. For every classifier the summed confusion matrix and the mean
# macro-F1 / accuracy over the folds are written to `path`.
TEST_SPLIT = .1
N_FOLDS=10
path="results_with_scaling_default"
os.makedirs(path, exist_ok=True)
settings="counts"

# labels come from the file written by the loading cell, texts from tr_Df
y_labels=pd.read_csv(features_path+os.sep+"y_labels.out", index_col=0, header=None)
sentences_embeddings=tr_Df.Text

# 1-D label array: the estimators and StratifiedKFold expect a flat target
y_values=y_labels.iloc[:,0].values
if len(y_values) != len(sentences_embeddings):
    raise ValueError(
        "label/text count mismatch: {} labels vs {} texts".format(
            len(y_values), len(sentences_embeddings)))

labels=y_labels.iloc[:,0].unique()
kf=StratifiedKFold(n_splits=N_FOLDS)

# classifier name -> factory; a fresh estimator is built for every fold
classifier_factories = {
    "logreg": lambda: LogisticRegression(random_state=1),
    "rf": lambda: RandomForestClassifier(random_state=1),
    "svm": lambda: SVC(kernel='linear', random_state=123456),
    "gsvm": lambda: SVC(kernel='rbf', random_state=123456),
    "knn": lambda: KNeighborsClassifier(),
    "nb": lambda: MultinomialNB(),
}

for c in ["gsvm", "knn","svm","logreg", "nb","rf"]:
    if c not in classifier_factories:
        raise ValueError("unknown classifier: {}".format(c))
    kf=StratifiedKFold(n_splits=N_FOLDS)
    # square matrix sized by the labels actually present in the data
    conf_mat=np.zeros((len(labels), len(labels)))
    f1s=0
    accs=0
    for tr_ind, te_ind in kf.split(sentences_embeddings, y_values):
        # the fold indices are positions, so select rows with .iloc
        X_tr_fold=sentences_embeddings.iloc[tr_ind]
        X_te_fold=sentences_embeddings.iloc[te_ind]
        y_tr_fold=y_values[tr_ind]
        y_te_fold=y_values[te_ind]

        # the vocabulary is learned from the training part of the fold only
        fe = CountVectorizer(
            preprocessor=normalize,
            tokenizer=micro_tokenize,
            binary=True,
        )
        X_tr_fold = fe.fit_transform(X_tr_fold)
        X_te_fold = fe.transform(X_te_fold)

        clf=classifier_factories[c]()
        clf.fit(X_tr_fold,y_tr_fold)
        y_te_pred=clf.predict(X_te_fold)

        # scikit-learn metric signature is (y_true, y_pred)
        f1=f1_score(y_te_fold, y_te_pred, average='macro')
        print(f1)
        f1s+=f1
        accs+=accuracy_score(y_te_fold, y_te_pred)
        conf_mat+=confusion_matrix(y_te_fold,y_te_pred, labels=labels)

    np.savetxt(path+os.sep+"conf_mat_folds_"+settings+"_"+c+".out", conf_mat, delimiter=",")
    # averages are taken over the configured number of folds
    with open(path+os.sep+"f1s_folds_"+settings+"_"+c+".out",'w') as file:
        file.write('f1, {}\n'.format(f1s/N_FOLDS))
        file.write('acc, {}\n'.format(accs/N_FOLDS))