In [1]:
import json, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.neighbors import  NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split

# Import data

In [2]:
with open('Daten/Unternehmen_prepocessed.json', 'r', encoding='utf8') as data:
    desc = json.load(data)

## Create dataset

In [3]:
all_mixed = []
for comp in desc:
    #print(comp)
    entry = {'txt': comp['explicit_lemmatization'], 'class': comp['sector']}
    all_mixed.append(entry)
len(all_mixed)

9822

# Necessary classes and functions

In [4]:
def classify(txt, v, vector):
    dummy = np.zeros(len(v.get_feature_names_out()))
    tmp_vectorizer = CountVectorizer(tokenizer = v.build_tokenizer())
    tmp_vector=tmp_vectorizer.fit_transform([txt])
    l = 0
    for f in tmp_vectorizer.vocabulary_:
        if f not in v.vocabulary_:
            continue

        index = v.vocabulary_[f]
        dummy[index] = tmp_vector[0, tmp_vectorizer.vocabulary_[f]] * v.idf_[index]
        l += dummy[index] ** 2
    l = np.sqrt(l)
    for i in range(vector.shape[1]):
        dummy[i] /= l
    #print(dummy)
    return dummy

In [5]:
class data_instance:
    def __init__(self, dataset):
        self.X_data = [entry['txt'] for entry in dataset]
        self.y_data = [entry['class'] for entry in dataset]    
        
    def split_data(self, random_state, test_size):
        self.X_train, self.X_test, self.y_train, self.y_test = \
        train_test_split(self.X_data, self.y_data, test_size=test_size, random_state=random_state)
        #self.X_test = [self.X_test]
        
    def vectorize(self, df_value, max_df):
        if max_df:
            self.vectorizer = TfidfVectorizer(max_df = df_value)
            self.vector = self.vectorizer.fit_transform(self.X_train)
        else:
            self.vectorizer = TfidfVectorizer(min_df = df_value)
            self.vector = self.vectorizer.fit_transform(self.X_train)
        
    def neighbors(self, n_neighbors, init=False):
        self.neigh = KNeighborsClassifier(n_neighbors = n_neighbors).fit(self.vector, self.y_train)
        if init:
            return self.neigh
        else:
            return self.evaluate(self.neigh)
        
    def randomForest(self, n_estimators, init=False):
        self.forest = RandomForestClassifier(n_estimators = n_estimators).fit(self.vector, self.y_train)     
        if init:
            return self.forest
        else:
            return self.evaluate(self.forest)
        
    def centroid(self, init=False):
        self.centr = NearestCentroid().fit(self.vector, self.y_train)
        if init:
            return self.centr
        else:
            return self.evaluate(self.centr)
        
    def evaluate(self, classifier):
        n_correct = 0
        for i in range(len(self.X_test)):
            if self.y_test[i] == classifier.predict([classify(self.X_test[i], self.vectorizer, self.vector)])[0]:
                n_correct += 1
            if i % 1500 == 0:
                print(i)
        score = n_correct / len(self.X_test)
        #print("calculated score =", score)
        return score

# Initialisierung Thesaurus

In [10]:
def ask_thesaurus(classification_method, text):
    if classification_method == 'N-Neighbor':
        n_neighbors = int(input('Anzahl der Neighbor eingeben: '))
        predictor = thesaurus.neighbors(n_neighbors, init=True)
        #print(predictor)
    if classification_method == 'Random Forest':
        n_estimator = int(input('Anzahl der Estimator eingeben: '))
        predictor = thesaurus.randomForest(n_estimator, init=True)
    else:
        predictor = thesaurus.centroid(init=True)
    sector = predictor.predict([classify(text, thesaurus.vectorizer, thesaurus.vector)])[0]
    print(f"The text entered is classified as a text from the {sector} sector.")

In [11]:
thesaurus = data_instance(all_mixed)
thesaurus.split_data(123, 0.33)
thesaurus.vectorize(1.0, True)

# Eingabe der Suchanfragen

Der Thesaurs kann über drei unterschiedliche Klassifizierungsmethoden genutzt werden. Um das bestmögliche Ergebnis zu erhalten, zeigt das Notebook '04 Klassifizierung' die Ergebnisse der unterschiedlichen Methoden. 

In [14]:
ask_thesaurus(input('Klassifizierungsmethode (N-Neighbor, Random Forest oder Centroid) eingeben '), input("Bitte hier zu klassifizierenden Text eingeben: "))

Klassifizierungsmethode (N-Neighbor, Random Forest oder Centroid) eingeben N-Neighbor
Bitte hier zu klassifizierenden Text eingeben: hat leather rtl
Anzahl der Neighbor eingeben: 2
The text entered is classified as a text from the Consumer Cyclical sector.
