In [115]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [116]:
meta_train = load_files(r'D:\ML_TextMining\data_inspire\inspire-train', shuffle=True, encoding='utf-8', random_state=42)
meta_test = load_files(r'D:\ML_TextMining\data_inspire\inspire-test', shuffle=True, encoding='utf-8', random_state=42)

X_train = meta_train.data
y_train = meta_train.target
X_test = meta_test.data
y_test = meta_test.target

In [117]:
df = pd.read_csv("german_stopwords_plain.txt")
stopwords = df.loc[:,'#german_stopwords_plain.txt'].to_list()

In [118]:
count_vect = CountVectorizer(stop_words=stopwords)
X_train_counts = count_vect.fit_transform(X_train)

print('CountVectorizer Shape:', X_train_counts.shape)

CountVectorizer Shape: (72, 1677)


In [119]:
clf = MultinomialNB().fit(X_train_counts, y_train)

In [120]:
X_test_counts = count_vect.transform(X_test)
predicted = clf.predict(X_test_counts)

print('Test-Label True:', y_test)
print('Test-Label Pred:', predicted)
print('Test Score:', clf.score(X_test_counts, y_test))
print('Train Score:', clf.score(X_train_counts, y_train))

print('\nZuordnung Test-Dokumente:')
category = meta_test.target_names
for doc, category in zip(X_test, predicted):
    print('%r => %s' % (doc, meta_test.target_names[category]), '\n')

#probability test-documents for class:
df_proba = pd.DataFrame(clf.predict_proba(X_test_counts))
df_proba.to_csv(r"D:\ML_TextMining\probability_nb_document.csv", sep=';', index=True)

print('\nKlassifikations Report Testdaten:')
print(metrics.classification_report(y_test, predicted))
print('\nKlassifikations Report Trainingsdaten:')
print(metrics.classification_report(y_train, clf.predict(X_train_counts)))

Test-Label True: [ 8 12  6 23 12 12 12 12]
Test-Label Pred: [ 8 23 17 23 12 12  7 23]
Test Score: 0.5
Train Score: 0.9166666666666666

Zuordnung Test-Dokumente:
'Keywords:\r\nGeologie, Geologische Einheiten, Geologische Übersichtskarte 1:250.000, Karte, Vectordaten\r\n\r\nTypes:\r\ngeologische Karte 250000\r\n\r\nDesc:\r\nGeologische Karte 1:250 000:\r\nDie Geologische Karte 1:250 000 liegt digital vor. Sie wird laufend aktualisiert, als Geodatendienst veröffentlicht und gegebenenfalls in digitaler Form auch auf CD abgegeben.\r\n\r\nAnsprechpartner:\r\nBehörde für Umwelt und Energie (BUE)\r\nWasser, Abwasser und Geologie (W)\r\nGeologisches Landesamt\r\nGrundsatz, Geowissenschaftliche Landesaufnahme' => ge 

'Keywords:\r\nSpielplatz, Kinderspielplatz, Spielen, Point, Vectordaten\r\n\r\nTypes:\r\nspielplatz\r\n\r\nDesc:\r\nStandorte von Spielplätzen in Hamburg.\r\n\r\nAnsprechpartner:\r\n' => us-gov 

'Keywords:\r\nWasserkraftwerk, Wasserkraft, Energieerzeugung, Turbine, Point, Vectorda

In [121]:
#Exkurs Wahrscheinlichkeiten am Beispiel Test-Dokument 'Behoerden' der Klasse us-gov:
dfc = pd.DataFrame(clf.feature_count_, columns=count_vect.get_feature_names())
dfc.loc[:,'Summe'] = dfc.sum(axis=1)
dfc.loc[24] = dfc.loc[dfc.index.difference([23])].sum(axis=0)
dfd = dfc.loc[[23,24],['behörde','verwaltung','point','vectordaten','Summe']]

dfd.loc[:,'p_behörde'] = dfd.loc[:,'behörde'] / dfd.loc[:,'Summe']
dfd.loc[:,'p_verwaltung'] = dfd.loc[:,'verwaltung'] / dfd.loc[:,'Summe']
dfd.loc[:,'p_point'] = dfd.loc[:,'point'] / dfd.loc[:,'Summe']
dfd.loc[:,'p_vectordaten'] = dfd.loc[:,'vectordaten'] / dfd.loc[:,'Summe']

dfd.loc[25,'p_behörde'] = dfd.loc[23,'p_behörde'] / dfd.loc[24,'p_behörde']
dfd.loc[25,'p_verwaltung'] = dfd.loc[23,'p_verwaltung'] / dfd.loc[24,'p_verwaltung']
dfd.loc[25,'p_point'] = dfd.loc[23,'p_point'] / dfd.loc[24,'p_point']
dfd.loc[25,'p_vectordaten'] = dfd.loc[23,'p_vectordaten'] / dfd.loc[24,'p_vectordaten']

r = 7/165 * dfd.loc[25,'p_behörde'] * dfd.loc[25,'p_verwaltung'] * dfd.loc[25,'p_point'] * dfd.loc[25,'p_vectordaten']

print('\nWahrscheinlichkeitsverhaeltnis us-gov : rest [Zeile 25]\n', dfd)
print('\nChance us-gov pos:', r)
print('proba us-gov pos:', r/(r+1))


Wahrscheinlichkeitsverhaeltnis us-gov : rest [Zeile 25]
     behörde  verwaltung  point  vectordaten   Summe  p_behörde  p_verwaltung  \
23      7.0         1.0    7.0          7.0   226.0   0.030973      0.004425   
24     44.0         6.0   10.0         55.0  4613.0   0.009538      0.001301   
25      NaN         NaN    NaN          NaN     NaN   3.247285      3.401917   

      p_point  p_vectordaten  
23   0.030973       0.030973  
24   0.002168       0.011923  
25  14.288053       2.597828  

Chance us-gov pos: 17.395690064397407
proba us-gov pos: 0.9456394407331652


In [122]:
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words=stopwords)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)

print('True:', y_test)
print('Pred:', predicted)
print('Score Test:', text_clf.score(X_test, y_test))
print('Score Train', text_clf.score(X_train, y_train))

print('\nKlassifikations Report Testdaten:')
print(metrics.classification_report(y_test, predicted))
print('\nKlassifikations Report Trainingsdaten:')
print(metrics.classification_report(y_train, text_clf.predict(X_train)))

True: [ 8 12  6 23 12 12 12 12]
Pred: [ 8 23 17 12 12 12  7 12]
Score Test: 0.5
Score Train 0.5416666666666666

Klassifikations Report Testdaten:
              precision    recall  f1-score   support

           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         0
           8       1.00      1.00      1.00         1
          12       0.75      0.60      0.67         5
          17       0.00      0.00      0.00         0
          23       0.00      0.00      0.00         1

    accuracy                           0.50         8
   macro avg       0.29      0.27      0.28         8
weighted avg       0.59      0.50      0.54         8


Klassifikations Report Trainingsdaten:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       1.00      0.60      0.75         5
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         1


In [123]:
text_svm = Pipeline([
    ('vect', CountVectorizer(stop_words=stopwords)),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])

text_svm.fit(X_train, y_train)
predicted = text_svm.predict(X_test)

print('True:', y_test)
print('Pred:', predicted)
print('Test:', text_svm.score(X_test, y_test))
print('Train', text_svm.score(X_train, y_train))

print('\nKlassifikations Report Testdaten:')
print(metrics.classification_report(y_test, predicted))
print('\nKlassifikations Report Trainingsdaten:')
print(metrics.classification_report(y_train, text_svm.predict(X_train)))

True: [ 8 12  6 23 12 12 12 12]
Pred: [ 8 23  6 23 12  4  7 23]
Test: 0.5
Train 0.9166666666666666

Klassifikations Report Testdaten:
              precision    recall  f1-score   support

           4       0.00      0.00      0.00         0
           6       1.00      1.00      1.00         1
           7       0.00      0.00      0.00         0
           8       1.00      1.00      1.00         1
          12       1.00      0.20      0.33         5
          23       0.33      1.00      0.50         1

    accuracy                           0.50         8
   macro avg       0.56      0.53      0.47         8
weighted avg       0.92      0.50      0.52         8


Klassifikations Report Trainingsdaten:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         5
           2       0.50      0.50      0.50         2
           3       0.00      0.00      0.00         1
           4