In [38]:
import json, re
import spacy

from sklearn.feature_extraction.text import CountVectorizer

# Read data

In [39]:
with open('Daten/Unternehmen_preprocessed.json', 'r', encoding='utf-8') as data:
    desc = json.load(data)

# Create general Information

## Functions

In [40]:
nlp = spacy.load("en_core_web_sm")

In [41]:
def my_tokenizer(txt):
    doc = nlp(txt)
    tokens = []
    for token in doc:
        if token.pos_ == 'PUNCT' or token.pos_ == 'SPACE':
                continue
        tokens.append(token.text)    
    #s = re.findall("[A-Za-zßäöüÄÖÜ]+",txt)
    #for i in range(len(s)):
    #    s[i] = s[i].upper().replace("Ä","AE").replace("Ö","OE").replace("Ü","UE")
    return ' '.join(tokens)

## Metadata

In [58]:
corpus = [item['description'] for item in desc]

In [59]:
docs = []
length_desc = []

In [60]:
for item in corpus:
    txt = my_tokenizer(item)
    docs.append(txt)
    length_desc.append(len(txt.split(' ')))

In [61]:
n_descriptions = len(corpus)

In [62]:
max_id = 0
min_id = 0
max_value = 0
min_value = 1000
sum_value = 0

for i, val in enumerate(length_desc):
    if val > max_value:
        max_value = val
        max_id = i
        #print(i, max_value)
    if val < min_value:
        min_value = val
        min_id = i
    sum_value += val

## Other information

In [63]:
def splitter(txt):
    return txt.split(' ')

In [64]:
len(corpus)

9822

### Words without filter (lemmatization etc.)

In [65]:
cv=CountVectorizer(

)
word_count_vector=cv.fit_transform(docs)

In [66]:
n_ungefiltert = len(cv.get_feature_names_out())

#### Word with rough filter

In [67]:
docs = [ber['rough_lemmatization'] for ber in desc]

In [68]:
cv=CountVectorizer(
    tokenizer=splitter
)
word_count_vector=cv.fit_transform(docs)

In [69]:
n_erster_Filter = len(cv.get_feature_names_out())

### Words with full filter

In [70]:
docs = [ber['explicit_lemmatization'] for ber in desc]

In [71]:
cv=CountVectorizer(
    tokenizer=splitter
    )
word_count_vector=cv.fit_transform(docs)

In [72]:
n_gefiltert = len(cv.get_feature_names_out())

# Final description and information

In [73]:
print(f"The corpus consists of {n_descriptions} different company descriptions.\n\
The longest description of the company {desc[max_id]['name']} includes {max_value} words.\n\
The shortest description of the company {desc[min_id]['name']} includes {min_value} words.\n\
The descriptions contain on average {round(sum_value/n_descriptions, 1)} words.\n\
This gives a total number of {sum_value} Terms (This also includes numbers (2019) or labels (Inc.).\n\
Unfiltered, the corpus thus contains {n_ungefiltert} different words.\n\
After the first, rough lemmatization, the corpus still contains {n_erster_Filter}.\n\
After final preprocessing, classification is performed with {n_gefiltert} distinct words.")

The corpus consists of 9822 different company descriptions.
The longest description of the company Hercules Capital Inc includes 773 words.
The shortest description of the company Psychiatric Solutions, Inc. (MM) includes 9 words.
The descriptions contain on average 139.7 words.
This gives a total number of 1372359 Terms (This also includes numbers (2019) or labels (Inc.).
Unfiltered, the corpus thus contains 47103 different words.
After the first, rough lemmatization, the corpus still contains 43275.
After final preprocessing, classification is performed with 41168 distinct words.


## Distribution of sectors

In [77]:
sectoren = {}
for comp in desc:
    #print(comp)
    #print(comp['sector'])
    if comp['sector'] in sectoren:
        sectoren[comp['sector']] += 1
    else:
        sectoren[comp['sector']] = 1
    #break

for sec in sectoren:
    print(f"The Sector '{sec}' contains {sectoren[sec]} different descriptions.")

The Sector 'Financial Services' contains 2503 different descriptions.
The Sector 'Healthcare' contains 1452 different descriptions.
The Sector 'Communication Services' contains 403 different descriptions.
The Sector 'Consumer Defensive' contains 410 different descriptions.
The Sector 'Basic Materials' contains 594 different descriptions.
The Sector 'Industrials' contains 1307 different descriptions.
The Sector 'Consumer Cyclical' contains 1023 different descriptions.
The Sector 'Energy' contains 338 different descriptions.
The Sector 'Real Estate' contains 489 different descriptions.
The Sector 'Technology' contains 1139 different descriptions.
The Sector 'Utilities' contains 164 different descriptions.


# Search engine

The search engine only searches the data set of reports that have already been fully processed. Accordingly, search terms must be entered in a normalized form. 

In [31]:
docs = [ber['search_engine'] for ber in desc]

In [32]:
cv=CountVectorizer(
    tokenizer=splitter
    )
word_count_vector = cv.fit_transform(docs)

In [34]:
index = {}
for token in cv.vocabulary_:
    index[token] = set()
for i in range(len(docs)):
    #print(i)
    for token, column in cv.vocabulary_.items():
        #print(token, column)
        if word_count_vector[i, column]:
            #print(i, token)
            index[token].add(i)
    #if i % 10 == 0:
    #    print(i)
#print(index) 

## Saving the search engine database

In [49]:
for key in index.keys():
    index_test[key] = list(index_test[key])    

In [51]:
with open('Daten/Unternehmen_search_engine.json', 'w') as wf:
    json.dump(index, wf)

## Prepare search engine

In [78]:
with open('Daten/Unternehmen_search_engine.json', 'r') as rf:
    db = json.load(rf)

In [79]:
def search(w): 
    if len(w) > 1:
        s_type = input("Type 'union' if you search for document which contain either word.\n\
Or type 'intersection' if you search for documents which contain all words: ").lower()
        if s_type == 'union':
            result_list = set()
            for sw in w:
                if sw in db.keys():
                    search_result = set(db[sw])
                else: 
                    search_result = set()
                result_list = result_list.union(search_result)
            result_list = sorted(result_list)   
            print(f"\n{len(result_list)} documents contain: {', '.join(w)}")
            for i in result_list:
                print(f"ID {i}: Firma: {desc[i]['name']}, Sektor: {desc[i]['sector']}")
        else:
            result_list = []
            print(w)
            for sw in w:
                if sw in db.keys():
                    search_result = set(db[sw])
                else: 
                    search_result = set()
                result_list.append(list(search_result))
            result_list = sorted(set.intersection(*map(set, result_list)))
            print(f"\n{len(result_list)} documents contain all words: {', '.join(w)}")
            for i in result_list:
                print(f"ID {i}: Corporate: {desc[i]['name']}, Sector: {desc[i]['sector']}")
    else:
        sw = w[0].lower()
        if sw in db:
            print(f"\n{len(db[sw])} documents contain: '{w[0]}'")
            for i in db[sw]:
                print(f"ID {i}: Corporate: {desc[i]['name']}, Sector: {desc[i]['sector']}")
        else:
            print("Search word can not be found.")

# Entry of the search query:

In [81]:
search(input('Please enter searchword(s) seperated with a space: \n').split(' '))

Please enter searchword(s) seperated with a space: 
rtl

3 documents contain: 'rtl'
ID 3697: Corporate: Necessity Retail REIT Inc, Sector: Real Estate
ID 9652: Corporate: RTL GROUP, Sector: Communication Services
ID 1902: Corporate: Cadence Design Systems Inc, Sector: Technology


In [83]:
search(input('Please enter searchword(s) seperated with a space: \n').split(' '))

Please enter searchword(s) seperated with a space: 
pharma medicine
Type 'union' if you search for document which contain either word.
Or type 'intersection' if you search for documents which contain all words: intersection
['pharma', 'medicine']

22 documents contain all words: pharma, medicine
ID 292: Corporate: ALLIANCE PHARMA PLC, Sector: Healthcare
ID 1249: Corporate: LABORATORIO REIG JOFRE SA, Sector: Healthcare
ID 1295: Corporate: AVEO Pharmaceuticals Inc, Sector: Healthcare
ID 1711: Corporate: Bellicum Pharmaceuticals Inc, Sector: Healthcare
ID 1853: Corporate: BridgeBio Pharma Inc, Sector: Healthcare
ID 2016: Corporate: Compugen Ltd, Sector: Healthcare
ID 2233: Corporate: CryoPort Inc, Sector: Industrials
ID 2423: Corporate: Evoke Pharma Inc, Sector: Healthcare
ID 2426: Corporate: Exelixis Inc, Sector: Healthcare
ID 2578: Corporate: Frequency Therapeutics Inc, Sector: Healthcare
ID 2902: Corporate: Horizon Therapeutics Public Limited Company, Sector: Healthcare
ID 2938: Corpor

In [84]:
search(input('Please enter searchword(s) seperated with a space: \n').split(' '))

Please enter searchword(s) seperated with a space: 
biogas coal
Type 'union' if you search for document which contain either word.
Or type 'intersection' if you search for documents which contain all words: union

150 documents contain: biogas, coal
ID 84: Firma: First Trust Energy Income and Growth Fund, Sektor: Financial Services
ID 303: Firma: CERES POWER HOLDINGS PLC, Sektor: Industrials
ID 481: Firma: ANGLO AMERICAN PLC, Sektor: Basic Materials
ID 487: Firma: ANGLO-EASTERN PLANTATIONS, Sektor: Consumer Defensive
ID 505: Firma: ANGLO PACIFIC GROUP PLC, Sektor: Basic Materials
ID 544: Firma: BHP BILLITON PLC, Sektor: Basic Materials
ID 649: Firma: EVRAZ PLC, Sektor: Basic Materials
ID 688: Firma: GLENCORE PLC, Sektor: Basic Materials
ID 689: Firma: CONTOURGLOBAL PLC, Sektor: Utilities
ID 903: Firma: R.E.A. HOLDINGS PLC, Sektor: Consumer Defensive
ID 925: Firma: SOUTH32 LTD, Sektor: Basic Materials
ID 972: Firma: SSE PLC, Sektor: Utilities
ID 990: Firma: TP ICAP PLC, Sektor: Financia