In [1]:
import pandas as pd
import numpy as np
from math import log
import matplotlib.pyplot as plt

In [2]:
def get_tf(corpus):
    tf_dict = []
    for doc in corpus:
        doc_count = {}
        doc_len = len(doc)
        for word in doc:
            doc_count[word] = doc_count.get(word, 0) + 1
        tf_dict.append([i/doc_len for i in doc_count.values()])
    return tf_dict

In [3]:
def get_idf(words_set, corpus):
    docs_count = len(corpus)
    words_dict = {}
    idf = {}
    for word in words_set:
        for doc in corpus:
            if word in doc:
                words_dict[word] = words_dict.get(word, 0) + 1
    for word in words_dict.keys():
        idf[word] = log(docs_count/words_dict[word])
    return idf, words_dict

In [4]:
def get_tfidf(idf, tf, corpus, words_dict):
    tfidf = []
    avg_tfidf = {}
    for i in range(len(corpus)-1):
        doc_tfidf = []
        doc = corpus[i]
        tf_doc = tf[i]
        for j in range(len(doc)-1):
            doc_tfidf.append(tf_doc[j]*idf[doc[j]])
            avg_tfidf[doc[j]] = avg_tfidf.get(doc[j], 0) + (tf_doc[j]*idf[doc[j]])
        tfidf.append(doc_tfidf)
    for word, tfidf in avg_tfidf.items():
        avg_tfidf[word] = avg_tfidf[word]/words_dict[word]
    return avg_tfidf

In [5]:
finance = pd.read_csv('Finance.csv')

print(finance.shape)
n_docs = finance.shape[0]

(109, 1)


In [11]:
finance.head()

Unnamed: 0,Keyskills
0,"Credit Underwriting, management, mortgage, mor..."
1,"Financial Modelling, modeling, private equity,..."
2,"finance, sales, back office, customer care, re..."
3,"php, management, compliance, legal compliance,..."
4,"Due diligence, Forecasting, PTP, Analytics, OT..."


In [6]:
corpus = []
for i in range(n_docs):
    corpus.append(list(finance.iloc[i])[0].split(','))
# print(corpus)

words_set = set()
for doc in corpus:
    words = doc
    words_set = words_set.union(set(words))
n_words_set = len(words_set)   #·Number of unique words in the 
print(n_words_set)

386


In [7]:
tf = get_tf(corpus)
idf, words_dict = get_idf(words_set, corpus)
# print(corpus[-1], tf[-1])

tfidf = get_tfidf(idf, tf, corpus, words_dict)

sorted_idf = sorted(idf.items(), key=lambda x: x[1])
# print(*sorted_idf[1:11], sep="\n")
top_words = np.array(sorted_idf[1:11])[:, 0]
print(top_words)

['finance' 'accounting' 'financial analysis' 'receivable' 'Forecasting'
 'sap' 'analysis' 'balance' 'auditing' 'reporting']


In [13]:
finance[['Keyskills']]

Unnamed: 0,Keyskills
0,"Credit Underwriting, management, mortgage, mor..."
1,"Financial Modelling, modeling, private equity,..."
2,"finance, sales, back office, customer care, re..."
3,"php, management, compliance, legal compliance,..."
4,"Due diligence, Forecasting, PTP, Analytics, OT..."
...,...
104,"Financial planning,kpi,planning,microsoft,acco..."
105,"Senior Accountant,tds,gst,financial statements..."
106,"ERP,Excel,Reconciliation,Quick Books,Charted A..."
107,"consumer goods,forecasting,management,financia..."


In [8]:
from wordcloud import WordCloud 
wordcloud = WordCloud(width = 800, height = 800,
                background_color ='white',
                min_font_size = 10).generate(finance.iloc[1])

TypeError: expected string or bytes-like object

In [None]:
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()