In [1]:
import spacy
import json
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from IPython.display import display

## Open Data

In [2]:
# Load the raw company records from the JSON dump. Later cells iterate over
# `desc` and read the 'sector', 'name' and 'description' keys of each entry.
with open('Daten/data_complete.json', 'r', encoding='utf-8') as data:
    desc = json.load(data)

## Check for some Metrics

Inspect a few basic metrics of the dataset in order to remove entries from sectors that have too few examples.

### Distribution of sectors

In [6]:
# Tally how many company records carry each sector label.
sectoren = {}
for comp in desc:
    sector = comp['sector']
    # A sector seen for the first time starts from zero.
    sectoren[sector] = sectoren.get(sector, 0) + 1
print(sectoren)

{'Financial Services': 2503, 'Healthcare': 1452, 'Communication Services': 403, 'Consumer Defensive': 410, 'Basic Materials': 594, 'Industrials': 1307, 'Consumer Cyclical': 1023, 'Energy': 338, 'Real Estate': 489, 'Technology': 1139, 'Utilities': 164, '': 23, 'Services': 3, 'Industrial Goods': 4, 'Financial': 3, 'Consumer Goods': 1}


In [7]:
# Total number of company records currently held in `desc`.
len(desc)

9856

#### Bereinigen von zu kleinen Datenmengen

In [8]:
# Sector labels that occur in fewer than 100 records; the following cell
# removes the corresponding entries from the dataset.
to_del = [sector for sector, count in sectoren.items() if count < 100]
to_del

['', 'Services', 'Industrial Goods', 'Financial', 'Consumer Goods']

In [9]:
# Drop every record whose sector is listed in `to_del`.
# The filter runs in a single pass and writes the kept records back into the
# same list object. Deleting entries while enumerating the list would skip the
# element that follows each deletion and therefore require repeated passes.
sectors_to_drop = set(to_del)
desc[:] = [comp for comp in desc if comp['sector'] not in sectors_to_drop]

# Sanity check: number of records that still carry a dropped sector (expected 0).
c = sum(comp['sector'] in sectors_to_drop for comp in desc)
c

0

In [10]:
# Number of company records left after the small sectors were removed.
len(desc)

9822

### Check 

In [11]:
# Recount the records per sector on the filtered data and show how many
# distinct sectors remain.
sectoren = {}
for comp in desc:
    sector = comp['sector']
    sectoren[sector] = sectoren.get(sector, 0) + 1
print(sectoren, '\n', len(sectoren))

{'Financial Services': 2503, 'Healthcare': 1452, 'Communication Services': 403, 'Consumer Defensive': 410, 'Basic Materials': 594, 'Industrials': 1307, 'Consumer Cyclical': 1023, 'Energy': 338, 'Real Estate': 489, 'Technology': 1139, 'Utilities': 164} 
 11


## Lemmatization and stop word removal

Lemmatize the words to obtain a more uniform dataset, and remove stop words using the stop-word list that ships with the spaCy library.

In [12]:
# Load spaCy's small English pipeline; the cells below use it for
# tokenization, part-of-speech tags and lemmas.
nlp = spacy.load("en_core_web_sm")
# Stop-word collection taken from the pipeline's language defaults.
all_stopwords = nlp.Defaults.stop_words

In [13]:
# Number of entries in the stop-word collection.
len(all_stopwords)

326

In [14]:
#all_stopwords

Lemmatization of all words. Also, all stop words as well as numbers and punctuation are removed. This list is attached to the respective company in the JSON file.

### Rough lemmatization and data cleaning

In [15]:
# Build the "rough" variant of every description: keep the lemma of each token
# unless the token is a number, punctuation, or its lower-cased lemma is a
# stop word. The company name is not removed in this variant.
for comp in desc:
    kept_lemmas = [
        token.lemma_
        for token in nlp(comp['description'])
        if token.pos_ not in ('NUM', 'PUNCT')
        and token.lemma_.lower() not in all_stopwords
    ]
    # Store the result on the record itself as a space-separated string.
    comp['rough_lemmatization'] = ' '.join(kept_lemmas)

### Explicit lemmatization and data cleaning

#### Find abbreviations

In [16]:
abbreviations = set()
for i, comp in enumerate(desc):
    doc = comp['description']
    doc = nlp(doc)
    for token in doc:
        if token.text[-1] == '.' and token.pos_ != 'PUNCT':
            abbreviations.add(token.text)
    #if i % 500 == 0:
    #    print(i)

In [17]:
#abbreviations

Create a list of abbreviations so that they can be removed. Because the descriptions contain many unusual abbreviations, deleting them is supposed to lead to a cleaner dataset and therefore to better generalization when classifying the descriptions.

In [19]:
# Build the "explicit" variant of every description plus an upper-cased copy
# that serves as the text base for the search engine.
for comp in desc:
    text = comp['description']
    # Remove the company's own name from the description. The guard skips
    # empty or missing names: str.find('') is always 0 and replacing '' with ''
    # changes nothing, so an empty name would make the removal loop spin forever.
    name = comp['name']
    if name:
        # Repeat until nothing is left, because removing one occurrence can
        # splice the surrounding text into a new occurrence.
        while name in text:
            text = text.replace(name, '')
    doc = nlp(text)
    clean_desc = []
    for token in doc:
        # Skip numbers, punctuation, coordinating conjunctions, whitespace
        # tokens and anything shorter than three characters.
        if token.pos_ in ('NUM', 'PUNCT', 'CCONJ', 'SPACE') or len(token) < 3:
            continue
        # Skip stop words (matched on the lower-cased surface form).
        if token.text.lower() in all_stopwords:
            continue
        # Skip tokens previously collected as abbreviations.
        if token.text in abbreviations:
            continue
        clean_desc.append(token.lemma_)
    comp['explicit_lemmatization'] = ' '.join(clean_desc)
    comp['search_engine'] = ' '.join(lemma.upper() for lemma in clean_desc)

#### Save new json with additional preprocesses data

In [20]:
# Write the records, including the fields added by the preprocessing cells
# above, to a new JSON file.
with open('Daten/Unternehmen_preprocessed.json', 'w') as wf:
    json.dump(desc, wf)

Unternehmen_rough_lemmatization enthält eine sehr einfache Version von Lemmatisierung. Nachdem einige andere Analysen mit dem Datensatz durchgeführt wurden, mussten jedoch weitere Begriffe bzw. Inhalte gefiltert werden. Dazu gehören durch die Tokenisierung entstandene Fehler, z. B. einzelne Buchstaben aus Krankheitsbezeichnungen wie 'Hepatitis B'. Außerdem wurde die Firmenbezeichnung aus der Beschreibung gefiltert.
Außerdem wird eine extra "Datenbank" erzeugt, um in folgenden Schritten eine Suchmaschine erzeugen zu können.
Aufgrund der oftmals ungenauen Ergebnisse wurde auf eine Verarbeitung mittels eines "Porter Stemmer" verzichtet.

**TODO:** Klären, ob die Ergebnisse der unterschiedlichen Lemmatisierung bzw. Vorverarbeitung abschließend verglichen werden.