In [59]:
import spacy
import json
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from IPython.display import display

## Open Data

In [60]:
with open('Daten/data_complete.json', 'r', encoding='utf-8') as data:
    desc = json.load(data)

In [61]:
for i in desc:
    print(i)
    break

{'name': 'Adara Acquisition Corp.', 'ticker': 'ADRA', 'sector': 'Financial Services', 'description': 'Adara Acquisition Corp. does not have significant operations. The company intends to effect a merger, capital stock exchange, asset acquisition, stock purchase, reorganization, or similar business combination with one or more businesses. It focuses on searching for businesses in the consumer products industry and related sectors, including those consumer industry businesses in the health and wellness, e-commerce, discretionary spending, and information technology sectors and related channels of distribution. The company was incorporated in 2020 and is based in Charlotte, North Carolina.'}


## Check for some Metrics

Check some metrics and data to delete entries of sectors which do not have many examples

### Verteilung der Sektoren

In [62]:
sectoren = {}
for comp in desc:
    #print(comp)
    #print(comp['sector'])
    if comp['sector'] in sectoren:
        sectoren[comp['sector']] += 1
    else:
        sectoren[comp['sector']] = 1
    #break
print(sectoren)

{'Financial Services': 2503, 'Healthcare': 1452, 'Communication Services': 403, 'Consumer Defensive': 410, 'Basic Materials': 594, 'Industrials': 1307, 'Consumer Cyclical': 1023, 'Energy': 338, 'Real Estate': 489, 'Technology': 1139, 'Utilities': 164, '': 23, 'Services': 3, 'Industrial Goods': 4, 'Financial': 3, 'Consumer Goods': 1}


In [63]:
len(desc)

9856

#### Bereinigen von zu kleinen Datenmengen

In [64]:
to_del = []
for val in sectoren.keys():
    if sectoren[val] < 100:
        to_del.append(val)
to_del

['', 'Services', 'Industrial Goods', 'Financial', 'Consumer Goods']

In [65]:
while True:
    c = 0
    for i, comp in enumerate(desc):
        if comp['sector'] in to_del:
            c += 1
            del desc[i]
    print(c)
    if c == 0:
        break
c

32
2
0


0

In [66]:
len(desc)

9822

### Check 

In [68]:
sectoren = {}
for comp in desc:
    #print(comp)
    #print(comp['sector'])
    if comp['sector'] in sectoren:
        sectoren[comp['sector']] += 1
    else:
        sectoren[comp['sector']] = 1
    #break
print(sectoren, '\n', len(sectoren))

{'Financial Services': 2503, 'Healthcare': 1452, 'Communication Services': 403, 'Consumer Defensive': 410, 'Basic Materials': 594, 'Industrials': 1307, 'Consumer Cyclical': 1023, 'Energy': 338, 'Real Estate': 489, 'Technology': 1139, 'Utilities': 164} 
 11


## Lemmatisierung und Stopwörter entfernen

Lemmatize the words to get a more uniform database. Delete stopwords from the database by using the stopword list from the spacy library. 

In [69]:
nlp = spacy.load("en_core_web_sm")
all_stopwords = nlp.Defaults.stop_words

In [70]:
len(all_stopwords)

326

In [71]:
all_stopwords

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

Lemmatisierung alle Wörter. Außerdem werden alle Stopwörter sowie Zahlen und Interpunktionen entfernt. Diese Liste wird dem jeweiligen Unternehmen in der JSON Datei angehängt.

### Rough lemmatization and data cleaning

In [72]:
for i, comp in enumerate(desc):
    doc = comp['description']
    # Löschen des eigenen Firmennamen aus der Beschreibung
    doc = nlp(doc)
    #print(doc)
    clean_desc = []
    # Bestimmte Token werden nicht gespeichert
    for token in doc:
        if token.pos_ == 'NUM' or token.pos_ == 'PUNCT':
            continue
        if token.lemma_.lower() in all_stopwords:
            continue
        #print(token, token.pos_)
        clean_desc.append(token.lemma_)
    #print(clean_desc)
    desc[i]['rough_lemmatization'] = ' '.join(clean_desc)
    if i % 500 == 0:
        print(i)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500


### Explicit lemmatization and data cleaning

#### Find abbreviations in

In [73]:
abbreviations = set()
for i, comp in enumerate(desc):
    doc = comp['description']
    doc = nlp(doc)
    for token in doc:
        if token.text[-1] == '.' and token.pos_ != 'PUNCT':
            abbreviations.add(token.text)
    if i % 500 == 0:
        print(i)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500


In [74]:
abbreviations

{'.',
 '2A.',
 '2K.',
 '5G.',
 'A.',
 'A.C.I.',
 'A.G.',
 'A.H.',
 'A.P.',
 'A.S.',
 'AEL&P.',
 'A\\T.',
 'B.',
 'B.E.',
 'B.H.N.',
 'B.O.S.',
 'B.R.',
 'B.R.A.I.N.',
 'B.V.',
 'BBB\x96.',
 'BasX.',
 'Bros.',
 'C.',
 'C.A.T.',
 'C.C.',
 'C.E.',
 'C.H.',
 'C.T.I.',
 'C.V.',
 'Co.',
 'Corp.',
 'D.',
 'D.C.',
 'D.R.',
 'Dr.',
 'E.',
 'E.E.',
 'E.W.',
 'F.',
 'F.A.M.E.',
 'F.C.C.',
 'F.N.B.',
 'F.S.B.',
 'F.T.',
 'Feb.',
 'G.',
 'G.A.R.P.',
 'G.H.',
 'GmbH.',
 'H.',
 'H.B.',
 'H.D.',
 'H.I.G.',
 'H.I.S.',
 'H.K.',
 'H.U.',
 'I.',
 'I.D.',
 'I.Q.',
 'I.V.',
 'Ia.',
 'Inc.',
 'Ind.',
 'IoT.',
 'IonQ.',
 'J.',
 'J.A.',
 'J.A.B.',
 'J.B.',
 'J.L.',
 'J.P.',
 'J.W.',
 'Jr.',
 'K.',
 'K.K.',
 'KGaA.',
 'L.',
 'L.A.',
 'L.B.',
 'L.L.C.',
 'L.P.',
 'L.S.',
 'Ltd.',
 'M.',
 'M.D.',
 'M.D.C.',
 'M.H.C.',
 'M.P.',
 'Mr.',
 'Mrs.',
 'Mt.',
 'N.A.',
 'N.T.',
 'N.V.',
 'N.Y.',
 'Nov.',
 'O.',
 'O.S.K.',
 'O.W.N.',
 'P.',
 'P.A.M.',
 'P.L.',
 'P.L.C.',
 'P.S.',
 'Q.E.D.',
 'R.',
 'R.E.A.',
 'R.H.D.',
 'R

Creating a list of abbreviations so these can be deleted. Since there are many unusual abbreviations deleting them is supposed to lead to a cleaner dataset and thereofore a better generalization when classifing the description. 

In [75]:
for i, comp in enumerate(desc):
    doc = comp['description']
    # Löschen des eigenen Firmennamen aus der Beschreibung
    while doc.find(comp['name']) != -1:
        doc = doc.replace(comp['name'], '')
    doc = nlp(doc)
    #print(doc)
    clean_desc = []
    # Bestimmte Token werden nicht gespeichert
    for token in doc:
        #print("%-20s | %-20s | %-5s | %s" %
        #  (token.text,  token.lemma_, token.pos_, spacy.explain(token.pos_)))
        if token.pos_ == 'NUM' or token.pos_ == 'PUNCT' or \
            token.pos_ == 'CCONJ' or len(token) < 3 or \
            token.pos_ == 'SPACE':
            continue
        if token.text.lower() in all_stopwords:
            continue
        if token.text in abbreviations:
            continue
        #print(token, token.pos_)
        clean_desc.append(token.lemma_)
    #print(clean_desc)
    desc[i]['explicit_lemmatization'] = ' '.join(clean_desc)
    desc[i]['search_engine'] = ' '.join([token.upper() for token in clean_desc])
    if i % 500 == 0:
        print(i)

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500


#### Speichern neuer JSON

In [76]:
with open('Daten/Unternehmen_preprocessed.json', 'w') as wf:
    json.dump(desc, wf)

Unternehmen_rough_lemmatization enthält eine sehr einfache Version von Lemmatisierung. Nachdem einige anderen Analysen mit dem Datensatz durchgeführt wurden, mussten jedoch weitere Begriffe, bzw. Inhalte gefiltert werden. Dazu gehören durch die Tokenisierung enstandene Fehler. Dazu gehören z. B. einzelne Buchstaben aus Krankheiten wie 'Hepatitis B'. Außerdem wurde die Firmenbezeichnung aus der Beschreibung gefiltert. 
Außerdem wird eine extra "Datenbank" erzeugt, um in folgenden Schritten eine Suchmaschine erzeugen zu können.
Aufgrund der oftmals ungenauen Ergebnisse wurde auf eine Verarbeitung mittels eines "Porter Stemmer" verzichtet.

Die Ergebnisse der unterschiedlichen Lemmatisierung bzw. Vorverarbeitung werden abschließend verglichen?????????

## Wordcloud with different sectors

In [None]:
print(desc[0])

In [None]:
s[i] = s[i].upper()

In [None]:
comp_dir = {}
for comp in desc:
    if comp['sector'] not in comp_dir:
        comp_dir[comp['sector']] = []
    if len(comp_dir) == 11:
        break

In [None]:
for i, comp in enumerate(desc):
    comp_dir[comp['sector']].append(comp['clean_description'].upper())
    if i % 1000 == 0:
        print(f'{i} Firmenberichte verarbeitet')

In [None]:
for key in comp_dir.keys():
    lst = comp_dir[key]
    lst = ' '.join(lst)
    comp_dir[key] = lst

In [None]:
for key in comp_dir.keys():
    print(f'Wordcloud for the company sector {key}')
    wordcloud =  WordCloud(background_color="white", width = 5000,
                       height = 3500, max_words = 50).generate(comp_dir[key])
    plt.figure(figsize=(8, 16))
    plt.imshow(wordcloud)
    plt.show()