In [2]:
import json, re
import spacy

from sklearn.feature_extraction.text import CountVectorizer

# Read data

In [4]:
with open('Daten/Unternehmen_preprocessed.json', 'r', encoding='utf-8') as data:
    desc = json.load(data)

# Create general Information

## Functions

In [9]:
nlp = spacy.load("en_core_web_sm")

In [10]:
def my_tokenizer(txt):
    doc = nlp(txt)
    tokens = []
    for token in doc:
        if token.pos_ == 'PUNCT' or token.pos_ == 'SPACE':
                continue
        tokens.append(token.text)    
    #s = re.findall("[A-Za-zßäöüÄÖÜ]+",txt)
    #for i in range(len(s)):
    #    s[i] = s[i].upper().replace("Ä","AE").replace("Ö","OE").replace("Ü","UE")
    return ' '.join(tokens)

## Metadata

In [11]:
corpus = [item['description'] for item in desc]

In [12]:
docs = []
length_desc = []

In [13]:
for item in corpus:
    txt = my_tokenizer(item)
    docs.append(txt)
    length_desc.append(len(txt.split(' ')))
    break

In [14]:
n_descriptions = len(corpus)

In [15]:
max_id = 0
min_id = 0
max_value = 0
min_value = 1000
sum_value = 0

for i, val in enumerate(length_desc):
    if val > max_value:
        max_value = val
        max_id = i
        #print(i, max_value)
    if val < min_value:
        min_value = val
        min_id = i
    sum_value += val

## Other information

In [20]:
def splitter(txt):
    return txt.split(' ')

In [21]:
len(corpus)

9822

### Words without filter (lemmatization etc.)

In [22]:
cv=CountVectorizer(

)
word_count_vector=cv.fit_transform(docs)

In [23]:
n_ungefiltert = len(cv.get_feature_names_out())

#### Word with rough filter

In [24]:
docs = [ber['rough_lemmatization'] for ber in desc]

In [25]:
cv=CountVectorizer(
    tokenizer=splitter
)
word_count_vector=cv.fit_transform(docs)

In [26]:
n_erster_Filter = len(cv.get_feature_names_out())

### Words with full filter

In [27]:
docs = [ber['explicit_lemmatization'] for ber in desc]

In [28]:
cv=CountVectorizer(
    tokenizer=splitter
    )
word_count_vector=cv.fit_transform(docs)

In [29]:
n_gefiltert = len(cv.get_feature_names_out())

# Final description and information

In [30]:
print(f"Der Korpus besteht aus {n_descriptions} unterschiedlichen Unternehmensbeschreibungen.\n\
Die längste Beschreibung des Unternehmen {desc[max_id]['name']} umfasst dabei {max_value} Wörter.\n\
Die kürzeste Beschreibung des Unternehmen {desc[min_id]['name']} umfasst {min_value} Wörter.\n\
Die Beschreibungen enthalten im Durchschnitt {round(sum_value/n_descriptions, 1)} Wörter.\n\
Dadurch ergibt sich eine Gesamtanzahl von {sum_value} Begriffen (Hierin enthalten sind auch Zahlen (2019) oder Bezeichnungen (Inc.).\n\n\
Ungefiltert enthält der Korpus damit {n_ungefiltert} unterschiedliche Wörter.\n\
Nach der ersten, groben Lemmatisierung enthält der Korpus noch {n_erster_Filter}.\n\
Nach der endgültigen Vorverarbeitung wird die Klassifizierung mit {n_gefiltert} unterschiedlichen Wörtern durchgeführt.")

Der Korpus besteht aus 9822 unterschiedlichen Unternehmensbeschreibungen.
Die längste Beschreibung des Unternehmen Adara Acquisition Corp. umfasst dabei 83 Wörter.
Die kürzeste Beschreibung des Unternehmen Adara Acquisition Corp. umfasst 83 Wörter.
Die Beschreibungen enthalten im Durchschnitt 0.0 Wörter.
Dadurch ergibt sich eine Gesamtanzahl von 83 Begriffen (Hierin enthalten sind auch Zahlen (2019) oder Bezeichnungen (Inc.).

Ungefiltert enthält der Korpus damit 60 unterschiedliche Wörter.
Nach der ersten, groben Lemmatisierung enthält der Korpus noch 43275.
Nach der endgültigen Vorverarbeitung wird die Klassifizierung mit 41168 unterschiedlichen Wörtern durchgeführt.


# Search engine

The search engine only searches the data set of reports that have already been fully processed. Accordingly, search terms must be entered in a normalized form. 

In [31]:
docs = [ber['search_engine'] for ber in desc]

In [32]:
cv=CountVectorizer(
    tokenizer=splitter
    )
word_count_vector = cv.fit_transform(docs)

In [34]:
index = {}
for token in cv.vocabulary_:
    index[token] = set()
for i in range(len(docs)):
    #print(i)
    for token, column in cv.vocabulary_.items():
        #print(token, column)
        if word_count_vector[i, column]:
            #print(i, token)
            index[token].add(i)
    #if i % 10 == 0:
    #    print(i)
#print(index) 

## Saving the search engine database

In [49]:
for key in index.keys():
    index_test[key] = list(index_test[key])    

In [51]:
with open('Daten/Unternehmen_search_engine.json', 'w') as wf:
    json.dump(index, wf)

## Prepare search engine

In [52]:
with open('Daten/Unternehmen_search_engine.json', 'r') as rf:
    db = json.load(rf)

In [57]:
def search(w): # Achtung, geht nur ordentlich wenn genau ein echtes Wort übergeben wird!!
    if len(w) > 1:
        s_type = input("Type 'union' if you search for document which contain either word.\n\
Or type 'intersection' if you search for documents which contain all words: ").lower()
        if s_type == 'union':
            result_list = set()
            for sw in w:
                if sw in db.keys():
                    search_result = set(db[sw])
                else: 
                    search_result = set()
                result_list = result_list.union(search_result)
            result_list = sorted(result_list)   
            print(f"\n{len(result_list)} documents contain: {', '.join(w)}")
            for i in result_list:
                print(f"{i}: Firma: {desc[i]['name']}, Sektor: {desc[i]['sector']}")
        else:
            result_list = []
            print(w)
            for sw in w:
                if sw in db.keys():
                    search_result = set(db[sw])
                else: 
                    search_result = set()
                result_list.append(list(search_result))
            result_list = sorted(set.intersection(*map(set, result_list)))
            print(f"\n{len(result_list)} documents contain all words: {', '.join(w)}")
            for i in result_list:
                print(f"{i}: Corporate: {desc[i]['name']}, Sector: {desc[i]['sector']}")
    else:
        sw = w[0].lower()
        if sw in db:
            print(f"\n{len(db[sw])} documents contain: '{w[0]}'")
            for i in db[sw]:
                print(f"{i}: Corporate: {desc[i]['name']}, Sector: {desc[i]['sector']}")
        else:
            print("Search word can not be found.")

# Entry of the search query:

In [58]:
search(input('Please enter searchword(s) seperated with a space: \n').split(' '))

Please enter searchword(s) seperated with a space: 
rtl

3 documents contain: 'rtl'
3697: Corporate: Necessity Retail REIT Inc, Sector: Real Estate
9652: Corporate: RTL GROUP, Sector: Communication Services
1902: Corporate: Cadence Design Systems Inc, Sector: Technology


In [59]:
search(input('Please enter searchword(s) seperated with a space: \n').split(' '))

Please enter searchword(s) seperated with a space: 
pharma medicine
Type 'union' if you search for document which contain either word.
Or type 'intersection' if you search for documents which contain all words: intersection
['pharma', 'medicine']

22 documents contain all words: pharma, medicine
292: Corporate: ALLIANCE PHARMA PLC, Sector: Healthcare
1249: Corporate: LABORATORIO REIG JOFRE SA, Sector: Healthcare
1295: Corporate: AVEO Pharmaceuticals Inc, Sector: Healthcare
1711: Corporate: Bellicum Pharmaceuticals Inc, Sector: Healthcare
1853: Corporate: BridgeBio Pharma Inc, Sector: Healthcare
2016: Corporate: Compugen Ltd, Sector: Healthcare
2233: Corporate: CryoPort Inc, Sector: Industrials
2423: Corporate: Evoke Pharma Inc, Sector: Healthcare
2426: Corporate: Exelixis Inc, Sector: Healthcare
2578: Corporate: Frequency Therapeutics Inc, Sector: Healthcare
2902: Corporate: Horizon Therapeutics Public Limited Company, Sector: Healthcare
2938: Corporate: Healthcare Triangle Inc, Sector

In [61]:
search(input('Please enter searchword(s) seperated with a space: \n').split(' '))

Please enter searchword(s) seperated with a space: 
biogas coal
Type 'union' if you search for document which contain either word.
Or type 'intersection' if you search for documents which contain all words: union

150 documents contain: biogas, coal
84: Firma: First Trust Energy Income and Growth Fund, Sektor: Financial Services
303: Firma: CERES POWER HOLDINGS PLC, Sektor: Industrials
481: Firma: ANGLO AMERICAN PLC, Sektor: Basic Materials
487: Firma: ANGLO-EASTERN PLANTATIONS, Sektor: Consumer Defensive
505: Firma: ANGLO PACIFIC GROUP PLC, Sektor: Basic Materials
544: Firma: BHP BILLITON PLC, Sektor: Basic Materials
649: Firma: EVRAZ PLC, Sektor: Basic Materials
688: Firma: GLENCORE PLC, Sektor: Basic Materials
689: Firma: CONTOURGLOBAL PLC, Sektor: Utilities
903: Firma: R.E.A. HOLDINGS PLC, Sektor: Consumer Defensive
925: Firma: SOUTH32 LTD, Sektor: Basic Materials
972: Firma: SSE PLC, Sektor: Utilities
990: Firma: TP ICAP PLC, Sektor: Financial Services
1062: Firma: MIQUEL Y COSTAS