In [1]:
import os, json
from collections import OrderedDict

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Read the preprocessed company records (a JSON document) into `data`.
with open('Daten/Unternehmen_prepocessed.json', mode='r', encoding='utf-8') as json_handle:
    data = json.load(json_handle)

# Vectorize Text

## Create data

In [9]:
# One lemmatized description string per company record, in the order of `data`.
docs = [
    company['explicit_lemmatization']
    for company in data
]

## Get the words with the highest TfIdf Value

Helper function that splits a lemmatized description into its previously defined tokens.

In [10]:
def splitter(txt):
    """Split ``txt`` on single space characters and return the pieces as a list.

    Consecutive spaces yield empty-string tokens, and an empty input yields
    ``['']``, because the separator is passed to ``str.split`` explicitly.
    """
    tokens = txt.split(' ')
    return tokens

### Create TfIdf Vector

In [11]:
# Build the TF-IDF matrix for all descriptions (one row per entry of `docs`,
# one column per vocabulary term).
# The texts are tokenized by `splitter`. `token_pattern=None` tells scikit-learn
# explicitly that its default regex tokenizer is not used, which avoids the
# "token_pattern will not be used since tokenizer is not None" warning.
tf_vectorizer = TfidfVectorizer(
    tokenizer=splitter,
    token_pattern=None,
)
tf_vector = tf_vectorizer.fit_transform(docs)

## Create TfIdf list for all sectors

Create a dictionary of the TfIdf values for each word. Each entry is indexed by the word, so the dictionary can easily be searched to build a thesaurus.

In [12]:
# For every vocabulary term, store its TF-IDF value in the document where that
# value is highest, together with that document's index ("nr") and sector.
#
# NOTE(review): this cell used `max_values` without any cell defining it, so it
# only worked with leftover kernel state. It is rebuilt here as the row index of
# the column-wise maximum, which is what the indexing `max_values[0, word_id]`
# implies -- confirm against the removed cell.
# np.asarray(...).reshape(1, -1) keeps the (1, n_words) layout regardless of
# whether the sparse argmax returns a matrix or a flat array.
max_values = np.asarray(tf_vector.argmax(axis=0)).reshape(1, -1)

# Hoisted out of the loop: the feature-name array was rebuilt for every word.
feature_names = tf_vectorizer.get_feature_names_out()

idf_dict = {}

for word_id, word in enumerate(feature_names):
    # Plain int so the entry can be JSON-serialized without a later cast.
    doc_id = int(max_values[0, word_id])
    idf_dict[word] = {
        "value": tf_vector[doc_id, word_id],
        "sector": data[doc_id]["sector"],
        "nr": doc_id,
    }
    if word_id % 10000 == 0:
        print(f'{word_id} words done.')

0 words done.
10000 words done.
20000 words done.
30000 words done.
40000 words done.


In [13]:
# Order the word entries from highest to lowest TfIdf value.
ranked_entries = sorted(
    idf_dict.items(),
    key=lambda item: item[1]['value'],
    reverse=True,
)
my_list = OrderedDict(ranked_entries)

In [1]:
#my_list

In [15]:
# Cast every document index to a built-in int (in place).
for word_info in my_list.values():
    word_info['nr'] = int(word_info['nr'])

## Save list as JSON file

In [16]:
# Write the ranked word dictionary to disk as JSON.
with open('Daten/Tf_idf_dict.json', mode='w') as out_handle:
    json.dump(my_list, out_handle)

# Create Thesaurus Search function for a single word

In [17]:
# Reload the saved dictionary; object_pairs_hook=OrderedDict keeps the key
# order stored in the file.
# Note: from here on `data` refers to this word dictionary, no longer to the
# company records loaded at the top of the notebook.
with open('Daten/Tf_idf_dict.json', mode='r') as in_handle:
    data = json.load(in_handle, object_pairs_hook=OrderedDict)

In [20]:
def search_thesaurus(search_word, data=data):
    """Print the TfIdf value and sector stored for ``search_word``.

    ``data`` maps a word to a dict with at least the keys ``'value'`` and
    ``'sector'``. Its default is the object bound to the module-level name
    ``data`` at the moment this function is defined. If the word is not a
    key of ``data``, a "not found" message is printed instead. Nothing is
    returned.
    """
    # Guard clause: unknown words are reported and the function ends here.
    if search_word not in data:
        print(f"{search_word} can not be found in the database.")
        return
    print(f"{search_word} has a TfIdf Value of {data[search_word]['value']} and belongs to the {data[search_word]['sector']} sector.")

## Enter thesaurus search query

Prints the TfIdf value for a single search term and the sector the term belongs to.

In [21]:
# Ask for a term interactively, then look it up.
search_term = input("Please enter your search term:\n")
search_thesaurus(search_term)

Please enter your search term:
rtl
rtl has a TfIdf Value of 0.9593074983770934 and belongs to the Communication Services sector.


# Search for the most important words in each category

In [22]:
# Order the words by sector name first and, within a sector, by descending
# TfIdf value (hence the negated value in the sort key).
sector_then_value = sorted(
    idf_dict.items(),
    key=lambda item: (item[1]['sector'], -item[1]['value']),
)
double_sort = OrderedDict(sector_then_value)

In [24]:
# Collect the highest-scoring words of every sector and lay them out side by
# side: one ('Word', 'Value') column pair per sector.
#
# `double_sort` is ordered by sector and, within a sector, by descending
# TfIdf value, so the first TOP_N entries seen for a sector are its top words.
#
# Fixes over the previous loop:
#   * the last sector was flushed *before* its final word was appended, so that
#     word (or a whole single-word last sector) was silently dropped whenever
#     the last sector held fewer than TOP_N words;
#   * the frame was grown with pd.concat inside the loop;
#   * the name `data` (the loaded word dictionary) was overwritten with a
#     scratch dict.
TOP_N = 10

top_words_by_sector = {}
for word, info in double_sort.items():
    sector_rows = top_words_by_sector.setdefault(info['sector'], [])
    if len(sector_rows) < TOP_N:
        sector_rows.append((word, info['value']))

# Sector names in first-seen order; used afterwards to label the columns.
sector_list = list(top_words_by_sector)

sector_frames = [
    pd.DataFrame(rows, columns=['Word', 'Value'])
    for rows in top_words_by_sector.values()
]
# pd.concat rejects an empty list, so fall back to an empty frame if there are no words.
pd_df = pd.concat(sector_frames, axis=1) if sector_frames else pd.DataFrame()

In [25]:
# Drop repeated sector names while keeping the order of first appearance.
unique_sectors = OrderedDict.fromkeys(sector_list)
sector_list = list(unique_sectors)

In [26]:
# Label the columns with two levels: the sector on top, and 'Word' / 'Value'
# below it for each sector.
column_index = pd.MultiIndex.from_product([sector_list, ['Word', 'Value']])
pd_df.columns = column_index

# Show highest TfIdf Values for each sector

In [27]:
# Lift the column display limit so every sector column is rendered, then show the table.
pd.options.display.max_columns = None
pd_df

Unnamed: 0_level_0,Basic Materials,Basic Materials,Communication Services,Communication Services,Consumer Cyclical,Consumer Cyclical,Consumer Defensive,Consumer Defensive,Energy,Energy,Financial Services,Financial Services,Healthcare,Healthcare,Industrials,Industrials,Real Estate,Real Estate,Technology,Technology,Utilities,Utilities
Unnamed: 0_level_1,Word,Value,Word,Value,Word,Value,Word,Value,Word,Value,Word,Value,Word,Value,Word,Value,Word,Value,Word,Value,Word,Value
0,film,0.919199,rtl,0.959307,rakuten,0.957792,sugar,0.84762,egdon,0.779634,ofs,0.918089,nextgen,0.927108,trex,0.920542,aroundtown,0.832764,sapiens,0.962095,hydrogen,0.712654
1,itaconix,0.843219,itmedia,0.931728,mercari,0.92845,weis,0.838955,coal,0.719983,qatar,0.86235,veeva,0.919316,imi,0.912601,onsen,0.825289,netapp,0.954238,contourglobal,0.702817
2,paper,0.820751,lifull,0.915377,meliá,0.896822,chegg,0.819145,drilling,0.714626,naga,0.861745,cue,0.89629,arlo,0.89128,ichigo,0.814646,orbis,0.953279,excelerate,0.692075
3,pile,0.812007,ooma,0.907455,nordstrom,0.883046,coffee,0.806315,vista,0.708043,invesco,0.849809,graphene,0.883411,consulting,0.881872,katitas,0.801124,blackbaud,0.946865,ameren,0.669005
4,lithium,0.800193,imax,0.866856,hilton,0.878707,tea,0.780439,energean,0.694511,svf,0.83987,nexus,0.882708,container,0.868026,helical,0.791609,ccc,0.943483,clearway,0.66289
5,titanium,0.787481,iij,0.845762,pool,0.87422,lubrajel,0.758586,silverbow,0.668714,maebashi,0.828109,htg,0.792459,tape,0.82394,purplebricks,0.784104,btob,0.90376,saibu,0.656893
6,uranium,0.774399,marchex,0.84048,curves,0.872383,monster,0.753741,silica,0.664157,fund,0.82028,igm,0.783704,byrna,0.822741,stag,0.770515,hennge,0.900128,dominion,0.632377
7,wacker,0.770644,doordash,0.838935,denny,0.870067,tobacco,0.750917,laredo,0.660742,tetragon,0.816771,trxade,0.783503,diamond,0.813099,aimco,0.754892,zendesk,0.90003,pne,0.626896
8,brenntag,0.705959,litalico,0.833379,4sight,0.865943,lawson,0.724174,compression,0.646183,pawn,0.815639,impella,0.777785,jac,0.805602,patrizia,0.752548,fabasoft,0.89186,renewable,0.625533
9,pole,0.703941,google,0.807665,skechers,0.847803,reed,0.716093,pdc,0.643953,burtech,0.8067,aro,0.77751,ceiling,0.797292,bmo,0.74136,zoom,0.891435,gas,0.62355
