In [1]:
import os, nltk, codecs
from nltk import word_tokenize
from time import time
import sys


# to use my own preprocessing in the vectorizer later
def my_dummy(doc):
    return doc

In [2]:
#Load Data to pandas frame vor better visualization
import pandas as pd
completeData = pd.read_csv('OMTGeschichten1_bearb.txt', sep = ";",
                            names = ["#","Answers", "Class", "Subclass"
                                    ]
                           )
shortnedData = pd.read_csv('OMT_shortned.txt', sep = ";",
                            names = ["#","Answers", "Class", "Subclass"
                                    ]
                           )

print('before cleaning...\n')
print(completeData.info(), "\n")
print(completeData["Class"].value_counts(), "\n")
#delete eventually empty entries
completeData = completeData.dropna(axis = 0)
shortnedData = shortnedData.dropna(axis = 0)

#delete eventually wrong labels ('\N' seems to accure from time to time)
completeData = completeData.drop(completeData[completeData.Class == '\\N'].index).reset_index()
shortnedData = shortnedData.drop(shortnedData[shortnedData.Class == '\\N'].index).reset_index()
print('after cleaning...\n')
print(completeData.info())
print(completeData["Class"].value_counts(), "\n")
print(completeData.head)

before cleaning...

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209713 entries, 0 to 209712
Data columns (total 4 columns):
#           209713 non-null object
Answers     209713 non-null object
Class       209713 non-null object
Subclass    209713 non-null object
dtypes: object(4)
memory usage: 6.4+ MB
None 

M     85879
L     40960
F     37558
A     35385
0      9918
\N       13
Name: Class, dtype: int64 

after cleaning...

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209700 entries, 0 to 209699
Data columns (total 5 columns):
index       209700 non-null int64
#           209700 non-null object
Answers     209700 non-null object
Class       209700 non-null object
Subclass    209700 non-null object
dtypes: int64(1), object(4)
memory usage: 8.0+ MB
None
M    85879
L    40960
F    37558
A    35385
0     9918
Name: Class, dtype: int64 

<bound method NDFrame.head of          index                             #  \
0            0      185124138106620081063215   
1            1  

In [4]:
from sklearn.model_selection import train_test_split

# extract the needed infos from data_complete
data = completeData.filter(["Answers", "Class"],axis = 1)
dataShortned = shortnedData.filter(["Answers", "Class"],axis = 1)

#Motive F is basicly a sub-motive from M and therefore not further used as indipendent motive
data["Class"] = data["Class"].replace(["F"],["M"])
dataShortned["class"] = dataShortned["Class"].replace(["F"], ["M"])
#split it into X ( = input) and y ( the labels)
X, y = data["Answers"], data["Class"]
XShortned, yShortned = dataShortned["Answers"], dataShortned["Class"]

#split data into train and test set, stratified and a 90:10 ratio
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size = 0.1, stratify = y)
X_train_shortned, X_test_shortned, y_train_shortned, y_test_shortned = train_test_split(XShortned, yShortned , test_size = 0.2, random_state = 42,)

print('Infos from X_train... \n\n')
print(X_train.describe())
print('\n\n\n')
print('Infos from X_test... \n\n')
print(X_test.describe())

Infos from X_train... 


count                                      188730
unique                                     188524
top       keine ahnung.keine ahnung.keine ahnung.
freq                                           10
Name: Answers, dtype: object




Infos from X_test... 


count                               20970
unique                              20968
top       ruhe.unter druck.andere person.
freq                                    2
Name: Answers, dtype: object


In [5]:
#preprocessing
import re
from nltk.corpus import stopwords
import numpy as np

# The pattern I may don't want in my corpus 
pattern = re.compile("\-|\!|\:|\.|\d|\,|\:|\(|\)|\?|\"|\\|\+|\%|\/")
def remove_patterns_from_string(strg):
    return re.sub(pattern, " ", strg)

# transforms pattern like xyz.Abc to xyz. Abc (--> helps tokenizer to detect sentences)
def insert_space_between_special_chars(strg):
    return re.sub('([a-zA-Z])([^A-Za-z0-9]+)([a-zA-Z])', r'\1\2 \3', strg)

# german stopwords (words who accure very often in texts and don't contribute anything to the text like articles, conjunctions)
stopWords = set(stopwords.words("german"))

#new spellings
stopWords.add('dass')

#tokenizes a list of strings
#return: list of tokens
def tokenize_text(text):
    tokens = nltk.word_tokenize(text, language = 'german')
    return tokens

#retrns: list of strings which do not occure in stopWords
def remove_stopwords(text):
    return [w for w in text if not w in stopWords]

# in : row for row from vector
# out: preprocessed version of the row
def preprocess_text(corpus):
    normalized_corpus = []
    for text in corpus:
#to change xyz.abc to xyz. abc
        text = re.sub('([a-zA-Z])(\.)([a-zA-Z])', r'\1\2 \3', text)
        #text = text.lower()
        #text = remove_patterns_from_string(text)
        text = tokenize_text(text)
        #text = remove_stopwords(text)
        normalized_corpus.append(text)
    return normalized_corpus

#preprocess only X_train. X_test gets its own pipeline in the end
X_train_preprocessed = preprocess_text(X_train)
X_train_shortned_preprocessed = preprocess_text(X_train_shortned)
print(X_train_preprocessed[:5])
print(X_train_shortned_preprocessed[:3])


[['Der', 'Trainer', 'der', 'Fußballmannschaft', 'erklärt', 'den', 'Spielern', 'die', 'Taktik', 'für', 'das', 'nächste', 'Spiel', ',', 'ihm', 'ist', 'wichtig', ',', 'dass', 'alle', 'zuhören', '.', 'die', 'Person', 'fühlt', 'sich', 'als', 'Fußballlehrer', 'und', 'als', 'derjenige', ',', 'der', 'den', 'Jungs', 'etwas', 'beibringen', 'möchte', '.', 'er', 'wahrt', 'die', 'Distanz', ',', 'macht', 'den', 'Unterschied', 'zu', 'den', 'Jungs', ',', 'fühlt', 'sich', 'aber', 'für', 'sie', 'verantwortlich', '.'], ['Sie', 'möchte', 'ihre', 'persönliche', 'Meinung', 'gegnüber', 'den', 'Anhängern', 'einer', 'bestimmten', 'politischen', 'Partei', 'zum', 'Ausdruck', 'bringen', 'und', 'begibt', 'sich', 'deshalb', 'zu', 'einem', 'Parteitag', 'dieser', 'Partei', '.', 'Sie', 'fühlt', 'sich', 'gekränkt', 'von', 'dem', 'Parteiprogramm', 'der', 'Partei', 'und', 'ist', 'selbstbewusst', 'bei', 'der', 'Sache', ',', 'wenn', 'es', 'darum', 'geht', ',', 'die', 'eigene', 'Meinung', 'zu', 'äußern', '.', 'Die', 'Partei

In [6]:
from sklearn.metrics import classification_report
import pandas as pd

# minimal classification performance: all answers set to M ( ZeroN - Classifier)
m_labels = []
for i in range(len(y_test)):
    m_labels.append("M")
machtmotiv = pd.Series(m_labels)
print(classification_report(y_test, machtmotiv))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       992
           A       0.00      0.00      0.00      3538
           L       0.00      0.00      0.00      4096
           M       0.59      1.00      0.74     12344

   micro avg       0.59      0.59      0.59     20970
   macro avg       0.15      0.25      0.19     20970
weighted avg       0.35      0.59      0.44     20970



  'precision', 'predicted', average, warn_for)


In [40]:
# first classification on complete data; 
#using params, which seems best based on sample-sized classification with 1000 examples
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.base import clone

#for every fold, split first into new stratified train/dev set
    # vectorizer && classifier must implement the conditions of sklearns pipeline 
    #vectorizer must implement fit transform
    #classifier must implement fit
    
def kfold_f1_classification(vectorizer, classifier, trainingData, trainingLabel, folds=3):
    t0 = time()
    print('starting classification...')
    for j in range(folds):
        XTrain, XDev, yTrain, yDev = [],[],[],[]
        pipeline = []
        #split into 90:10 train test sets
        XTrain, XDev, yTrain, yDev = train_test_split(trainingData, trainingLabel, test_size= 0.1, stratify = y_train)
        pipeline = Pipeline([
            ('vect1', clone(vectorizer)),
                            
            ('clf', clone(classifier)),    
        ])
        pipeline.fit(XTrain, yTrain)
        predictions = pipeline.predict(XDev)
        print("classification ", j)
        print(classification_report(yDev, predictions))
        print()
        print()
    return("done in %0.3fs" % (time()-t0))


def kfold_f1_classification_pipe(pipe, classifier, trainingData, trainingLabel, folds=3):
    t0 = time()
    print('starting classification...')
    for j in range(folds):
        XTrain, XDev, yTrain, yDev = [],[],[],[]
        pipeline = []
        
        XTrain, XDev, yTrain, yDev = train_test_split(trainingData, trainingLabel, test_size= 0.1, stratify = y_train)
        pipeline = Pipeline([
            ('pipe', pipe),
                            
            ('clf', clone(classifier)),    
        ])
        pipeline.fit(XTrain, yTrain)
        predictions = pipeline.predict(XDev)
        print("classification ", j)
        print(classification_report(yDev, predictions))
        print()
        print()
    return("done in %0.3fs" % (time()-t0))




Here starts the first experiment trying to evaluate the OMT only by its answers with ML. 
Features are only the word occurances

In [23]:
kfold_f1_classification(vectorizer =CountVectorizer(ngram_range=(1,1),
                                                    preprocessor = my_dummy,
                                                    tokenizer =my_dummy, 
                                                    token_pattern=None,
                                                    max_features = 10000),
                        classifier =LinearSVC(C = 1, max_iter = 2000),
                        trainingData= X_train_preprocessed,
                        trainingLabel= y_train)
        



starting classification...




classification  0
              precision    recall  f1-score   support

           0       0.49      0.22      0.31       893
           A       0.80      0.75      0.78      3185
           L       0.81      0.76      0.78      3686
           M       0.84      0.91      0.88     11109

   micro avg       0.82      0.82      0.82     18873
   macro avg       0.74      0.66      0.68     18873
weighted avg       0.81      0.82      0.81     18873







classification  1
              precision    recall  f1-score   support

           0       0.48      0.21      0.30       893
           A       0.79      0.75      0.77      3185
           L       0.81      0.76      0.78      3686
           M       0.84      0.91      0.87     11109

   micro avg       0.82      0.82      0.82     18873
   macro avg       0.73      0.66      0.68     18873
weighted avg       0.81      0.82      0.81     18873







classification  2
              precision    recall  f1-score   support

           0       0.51      0.23      0.32       893
           A       0.81      0.75      0.78      3185
           L       0.80      0.76      0.78      3686
           M       0.84      0.91      0.87     11109

   micro avg       0.82      0.82      0.82     18873
   macro avg       0.74      0.66      0.69     18873
weighted avg       0.81      0.82      0.81     18873





'done in 542.204s'

In [44]:
# Only use LDA with different number of topics and SVM classifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn. decomposition import LatentDirichletAllocation
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report


print("start classification with LDA - Features...")
#define pipeline combining feature extractor with classifier
pipelineLdaKnn = Pipeline([
    ('pipeLDA', Pipeline([
                                  ('vect_lda', CountVectorizer(preprocessor = my_dummy,
                                                               tokenizer =my_dummy, token_pattern=None)),
                                   ('lda', LatentDirichletAllocation(learning_method = 'batch'))
                              ])),
                            
    ('svm', LinearSVC(max_iter=1000)),    
])

parametersLdaKnn = {
    'pipeLDA__vect_lda__max_df': (0.9,),
    'pipeLDA__vect_lda__min_df': (10,),
    'pipeLDA__vect_lda__ngram_range':((1,1),),
    'pipeLDA__vect_lda__max_features': (3000,),
    'pipeLDA__lda__n_components': (10, 50,100,),
    }

if __name__ == "__main__":
    
    #find best params for this feature extraction methods
    gridSearchLdaKnn = GridSearchCV(pipelineLdaKnn, parametersLdaKnn, cv=3,
                           n_jobs = -1, verbose = 1)
    print("performing gridsearch...")
    print("pipeline:", [name for name, _ in pipelineLdaKnn.steps])
    print("parameters:")
    print(parametersLdaKnn)
    time_svc = time()
    
    XTrainHelp, XDevHelp, yTrainHelp, yDevHelp = train_test_split(X_train_preprocessed, y_train, test_size= 0.1, stratify = y_train)
    gridSearchLdaKnn.fit(XTrainHelp, yTrainHelp)
    print("done in %0.3fs" % (time()-time_svc))
    print()

    print("Best score: %0.3f" % gridSearchLdaKnn.best_score_)
    print("best parameters set:")
    best_parameters = gridSearchLdaKnn.best_estimator_.get_params()
    XDevHelpPredicted = gridSearchLdaKnn.predict(XDevHelp)
    
    print(classification_report(yDevHelp, XDevHelpPredicted))
    
    for param_name in sorted(parametersLdaKnn.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))


start classification with KNN and LDA - Features...
performing gridsearch...
pipeline: ['pipeLDA', 'svm']
parameters:
{'pipeLDA__vect_lda__max_df': (0.9,), 'pipeLDA__vect_lda__min_df': (10,), 'pipeLDA__vect_lda__ngram_range': ((1, 1),), 'pipeLDA__vect_lda__max_features': (3000,), 'pipeLDA__lda__n_components': (10, 50, 100)}
Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed: 17.1min remaining: 21.3min
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed: 24.1min finished


done in 2032.057s

Best score: 0.765
best parameters set:
              precision    recall  f1-score   support

           0       0.82      0.03      0.05       893
           A       0.75      0.59      0.66      3185
           L       0.81      0.66      0.72      3686
           M       0.77      0.93      0.84     11109

   micro avg       0.78      0.78      0.78     18873
   macro avg       0.79      0.55      0.57     18873
weighted avg       0.78      0.78      0.75     18873

	pipeLDA__lda__n_components: 100
	pipeLDA__vect_lda__max_df: 0.9
	pipeLDA__vect_lda__max_features: 3000
	pipeLDA__vect_lda__min_df: 10
	pipeLDA__vect_lda__ngram_range: (1, 1)


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn. decomposition import LatentDirichletAllocation
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report


print("start classification with SVM, BOW and LDA - Features...")
#define pipeline combining feature extractor with classifier
pipelineBowLDA = Pipeline([
    ('union', FeatureUnion(transformer_list = [
                                        ('vect3', CountVectorizer(preprocessor = my_dummy,
                                                                  tokenizer =my_dummy, 
                                                                  token_pattern=None,
                                                                  max_features = 5000)),           
                                  
                                        ('pipe2', Pipeline([
                                              
                                            ('vect_lda', CountVectorizer(
                                                                preprocessor = my_dummy,
                                                                tokenizer =my_dummy, 
                                                                token_pattern=None,
                                                                max_features = 5000
                                                                        )),
                                            ('lda1', LatentDirichletAllocation(
                                                                               learning_method = 'batch',
                                                                               n_components = 250
                                                                               
                                                                      ))
                              ]))
                            ])),
                            
    ('svm', LinearSVC(max_iter=2000)),    
])

parametersLdaKnn = {}

if __name__ == "__main__":
    
    #find best params for this feature extraction methods
    gridSearchBowLdaSVM = GridSearchCV(pipelineBowLDA, parametersLdaKnn, cv=2,
                           n_jobs = -1, verbose = 1)
    print("performing gridsearch...")
    print("pipeline:", [name for name, _ in pipelineBowLDA.steps])
    print("parameters:")
    print(parametersLdaKnn)
    time_svc = time()
    
    XTrainHelp, XDevHelp, yTrainHelp, yDevHelp = train_test_split(X_train_preprocessed, y_train, test_size= 0.1, stratify = y_train)
    gridSearchBowLdaSVM.fit(XTrainHelp, yTrainHelp)
    print("done in %0.3fs" % (time()-time_svc))
    print()

    print("Best score: %0.3f" % gridSearchBowLdaSVM.best_score_)
    print("best parameters set:")
    best_parameters = gridSearchBowLdaSVM.best_estimator_.get_params()
    XDevHelpPredicted = gridSearchBowLdaSVM.predict(XDevHelp)
    
    print(classification_report(yDevHelp, XDevHelpPredicted))
    
    for param_name in sorted(parametersLdaKnn.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))





start classification with SVM, BOW and LDA - Features...
performing gridsearch...
pipeline: ['union', 'svm']
parameters:
{}
Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 13.5min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed: 13.5min finished


done in 2395.970s

Best score: 0.815
best parameters set:
              precision    recall  f1-score   support

           0       0.61      0.23      0.34       893
           A       0.78      0.74      0.76      3185
           L       0.81      0.75      0.78      3686
           M       0.84      0.91      0.87     11109

   micro avg       0.82      0.82      0.82     18873
   macro avg       0.76      0.66      0.69     18873
weighted avg       0.81      0.82      0.81     18873

