In [1]:
import json
from collections import Counter

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the podcast dataset (the path is relative to the notebook's working directory).
df = pd.read_csv("../data/podcast.csv")

In [3]:
# (rows, columns) of the loaded frame
df.shape

(17563, 535)

In [4]:
# Keep an {index label: podcast name} dictionary for later lookup.
# Only the 'name' column is converted; converting the whole frame with df.to_dict()
# would build a dict for every column just to throw all but one away.
podcast_lookup = df['name'].to_dict()

In [5]:
# Fit TF-IDF over the combined text to derive the candidate vocabulary.
# min_df=5 discards terms that occur in fewer than five documents.
tfidf_vectorizer = TfidfVectorizer(min_df=5)
combined_unicode = df['combined'].values.astype('U')  # force a unicode string array
X = tfidf_vectorizer.fit_transform(combined_unicode)

In [6]:
# (documents, terms kept by the TF-IDF vectorizer)
X.shape

(17563, 41015)

In [7]:
# Column indices of the terms whose inverse document frequency is above 5,
# i.e. the rarer terms; everything at or below the threshold is left out.
ind = np.flatnonzero(tfidf_vectorizer.idf_ > 5)

In [8]:
# number of terms that passed the IDF threshold
ind.shape

(38565,)

In [9]:
# Map the selected column indices back to their terms.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# and fall back to the old name only on releases that predate it.
try:
    feature_names = tfidf_vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = tfidf_vectorizer.get_feature_names()
vocab = np.asarray(feature_names)[ind].tolist()

In [10]:
# Number of words in the reduced vocabulary (terms that passed the IDF filter)
len(vocab)

38565

In [11]:
# Coerce every entry to str so the per-row .split() used when filtering cannot fail
# on non-string values.
# NOTE(review): missing values become the literal string "nan" here — confirm that
# token is not leaking into the vocabulary.
df['combined'] = df['combined'].astype(str)

In [12]:
# One string per row of df, in row order
text = df['combined'].tolist()

In [13]:
# should equal the number of rows in df
len(text)

17563

In [14]:
# Strip each document down to the tokens that belong to the selected vocabulary.
# Tokens are obtained by splitting on single spaces and must match a vocabulary
# entry exactly.
# NOTE(review): the vectorizer lowercases and strips punctuation when it builds the
# vocabulary, so this presumably relies on 'combined' being pre-cleaned — confirm.
vocab_dict = Counter(vocab)  # used only for constant-time membership tests
new_text = [
    " ".join(token for token in document.split(" ") if token in vocab_dict)
    for document in text
]

In [15]:
# Sanity check: filtering must leave the number of rows unchanged
len(new_text)

17563

In [16]:
# Overwrite the column with the vocabulary-filtered text; the unfiltered text is no
# longer available in df after this cell runs.
df['combined'] = new_text

In [17]:
# Count how often each TF-IDF-selected word occurs in every (already filtered) document.
# X2 is the resulting document-by-word count matrix.
count_vectorizer = CountVectorizer()
X2 = count_vectorizer.fit_transform(df['combined'])

In [18]:
# Column index -> word for the count matrix, kept as a plain list.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# and fall back to the old name only on releases that predate it.
try:
    words_names = list(count_vectorizer.get_feature_names_out())
except AttributeError:
    words_names = count_vectorizer.get_feature_names()

In [19]:
# Densify the counts so each row can be argsorted afterwards.
# NOTE(review): at the shapes shown in this notebook the dense array is very large —
# watch memory usage.
word_mat = X2.toarray()

In [20]:
# rows = number of podcasts
# cols = words in our vocabulary (one column per word known to count_vectorizer)
word_mat.shape

(17563, 38565)

In [21]:
# For every row, find the column indices of its 100 largest counts, largest first.
ascending_cols = np.argsort(word_mat, axis=-1)  # per-row ordering, smallest count first
descending_cols = ascending_cols[:, ::-1]       # reverse each row: largest count first
top100 = descending_cols[:, :100]

In [22]:
# Build {podcast name: [{"count": int, "word": str}, ...]} holding each podcast's most
# frequent vocabulary words, ordered from most to least frequent.
# NOTE(review): podcasts that share a name overwrite each other here — confirm names
# are unique.
word_cloud = {}

# The loop variable is `row_idx` rather than `ind` so the vocabulary index array defined
# earlier in the notebook is not clobbered when cells are re-run.
# The lookup keys are used as positional row numbers, which assumes the frame still has
# its default 0..n-1 index.
for row_idx, name in podcast_lookup.items():
    word_cloud[name] = [
        {"count": int(word_mat[row_idx, col]), "word": words_names[col]}
        for col in top100[row_idx, :]
        # Rows with fewer than 100 distinct words would otherwise be padded with
        # arbitrary words whose count is 0.
        if word_mat[row_idx, col] > 0
    ]

In [23]:
# Persist the per-podcast word lists as JSON in the current working directory.
# (json is imported with the other imports at the top of the notebook.)
with open('top100_words.json', 'w') as out_file:
    json.dump(word_cloud, out_file)