In [3]:
# Full path to the input text file (e.g. 'd:\\test\\text_sample.txt').
# Cache files will be created in the same folder as the input file.
# NOTE(review): this is a machine-specific absolute Windows path - point it at
# a local copy of the text before running the notebook.
file_path = "d:\\tf_df_test\\alice_in_wonderland_full.txt"

In [4]:
# region imports and logging
from document_parser import parser
from tf_idf_calculator import tfidf_calculator
from data_cache_repository import cache_repository
from document_loader_service import document_loader_service

import logging
from time import gmtime

import altair as alt
import pandas as pd

# Render log timestamps in UTC instead of local time. The millisecond part of
# each timestamp comes from the %(msecs)03d field of the format string below,
# not from the converter.
logging.Formatter.converter = gmtime
logging.basicConfig(
    level=logging.INFO,
    filename='tfidf_app_log.log',
    filemode='w',  # the log file is rewritten on every run
    datefmt='%d/%m/%Y %H:%M:%S',
    format='%(asctime)s.%(msecs)03d | %(name)s | %(message)s',
)
logger = logging.getLogger(__name__)
#endregion

In [5]:
# region initialize calculator and get data
# Project-local collaborators, both constructed with their defaults.
dc = cache_repository()
dp = parser()

# Disabled: example of configuring a custom document separator on the parser.
#sep = '[new chapter]'
#dp.config(document_separator = '[new chapter]')

# The loader service is built from the cache repository and the parser, then
# asked to load the configured text file.
# NOTE(review): the meaning of the second positional argument (False) is not
# visible in this notebook - confirm against document_loader_service.load_file.
ld = document_loader_service(dc, dp)
data = ld.load_file(file_path, False)

calc = tfidf_calculator(data)

# NOTE(review): both arguments are passed as None; what they select is defined
# in tfidf_calculator.build_tf_idf_data - confirm there.
tfidf = calc.build_tf_idf_data(None,None)
# tf_data / df_data are read only after build_tf_idf_data has been called;
# presumably that call populates them - TODO confirm.
tf = calc.tf_data
df = calc.df_data

# top X words from N documents
# take_top is reused by the pie-chart cell further down.
# NOTE(review): doc_ids=[] presumably means "all documents" - confirm against
# tfidf_calculator.flatten_data.
take_top = 10
flatten_tfidf = calc.flatten_data(doc_ids=[], top_w = take_top)
# endregion

In [6]:
# Term-frequency (TF) frame: built from flatten_tfidf with the columns
# id, title, word and tf. The last line previews the first five rows.
pd_tf_data = pd.DataFrame(flatten_tfidf, columns=['id', 'title', 'word', 'tf'])
pd_tf_data.head()

Unnamed: 0,id,title,word,tf
0,0,Document 0,bats,0.001868
1,0,Document 0,key,0.002802
2,0,Document 0,dark,0.001401
3,0,Document 0,poison,0.001401
4,0,Document 0,candle,0.001401


In [7]:
# Document-frequency (DF) frame: built from flatten_tfidf with the columns
# id, title, word and df. The last line previews the first five rows.
pd_df_data = pd.DataFrame(flatten_tfidf, columns=['id', 'title', 'word', 'df'])
pd_df_data.head()

Unnamed: 0,id,title,word,df
0,0,Document 0,bats,0.083333
1,0,Document 0,key,0.25
2,0,Document 0,dark,0.083333
3,0,Document 0,poison,0.083333
4,0,Document 0,candle,0.083333


In [8]:
# TF-IDF frame used by the heatmap cell: built from flatten_tfidf with the
# columns id, title, word and tfidf. The last line previews the first five rows.
pd_tfidf_data = pd.DataFrame(flatten_tfidf, columns=['id', 'title', 'word', 'tfidf'])
pd_tfidf_data.head()

Unnamed: 0,id,title,word,tfidf
0,0,Document 0,bats,0.004643
1,0,Document 0,key,0.003885
2,0,Document 0,dark,0.003482
3,0,Document 0,poison,0.003482
4,0,Document 0,candle,0.003482


In [9]:

# TF-IDF heatmap: one row per document title, one column per within-document
# rank, with the word drawn on top of each cell.
# Layout follows https://altair-viz.github.io/gallery/layered_heatmap_text.html

# Words listed in mark_words get their label drawn in mark_color; every other
# label is drawn in black.
mark_words = []
mark_color = '#fff'

# Shared base chart: both layers use the same x/y encoding and the same
# per-document ranking, so the text lands exactly on its rectangle.
base_heatmap = alt.Chart(
    pd_tfidf_data,
    height=300,
    title="TF-IDF heatmap of words per document",
).encode(
    x=alt.X('rank:N'),
    y=alt.Y('title:O', sort='-x'),
    tooltip=['tfidf', 'word'],
).transform_window(
    # Rank rows inside each document (grouped by id): highest tfidf first,
    # ties ordered by word descending.
    rank="rank()",
    sort=[
        alt.SortField("tfidf", order="descending"),
        alt.SortField("word", order="descending"),
    ],
    groupby=["id"],
)

# Colour layer: rectangles shaded by tfidf value.
# NOTE(review): both an explicit `range` and a named `scheme` are set on the
# same scale; only one of them can take effect. Confirm which one is intended
# and drop the other.
heatmap = base_heatmap.mark_rect(opacity=0.6).encode(
    alt.Color(
        'tfidf:Q',
        scale=alt.Scale(
            range=['#ffcccc', '#ff0000'],
            interpolate='rgb',
            scheme='viridis',
        ),
    )
)

# Text layer: the word itself, exposed as a calculated `label` field.
heatmap_text = base_heatmap.mark_text(baseline='middle').encode(
    alt.Text('label:N'),
    color=alt.condition(
        alt.FieldOneOfPredicate(field='word', oneOf=mark_words),
        alt.value(mark_color),
        alt.value('#000'),
    ),
).transform_calculate(
    label=alt.datum.word
)

(heatmap + heatmap_text).properties(width=800)


In [10]:

# TF pie charts for the top take_top words of the first 2 documents

def _doc_tf_slice(frame, doc_position, limit):
    """Return up to `limit` (word, tf) rows of one document.

    Documents are identified by the 'id' column and numbered by order of
    first appearance in `frame`; `doc_position` is that 0-based number.
    Selecting by id (instead of by fixed row offsets) keeps the documents
    apart even when a document contributes fewer or more than `limit` rows.
    Returns an empty frame when `doc_position` is past the last document.
    """
    doc_ids = frame['id'].drop_duplicates()
    if doc_position >= len(doc_ids):
        return frame[['word', 'tf']].iloc[0:0]
    belongs_to_doc = frame['id'] == doc_ids.iloc[doc_position]
    return frame.loc[belongs_to_doc, ['word', 'tf']].head(limit)


def _tf_pie_chart(doc_data):
    """Build a pie chart with one slice per word, sized by its tf value."""
    return alt.Chart(doc_data).mark_arc().encode(
        theta="tf",
        color="word",
        tooltip=['word', 'tf']
    )


doc0_data = _doc_tf_slice(pd_tf_data, 0, take_top)
doc1_data = _doc_tf_slice(pd_tf_data, 1, take_top)

# Kept under their original names in case other cells refer to them.
words = doc0_data['word']
tfs = doc0_data['tf']

tf_chart0 = _tf_pie_chart(doc0_data)
tf_chart1 = _tf_pie_chart(doc1_data)

tf_chart0 | tf_chart1

In [11]:
# DF bar chart (vertical bars: one bar per word, bar height = df)
# Adapted from https://altair-viz.github.io/gallery/bar_chart_horizontal.html

# Rows in pd_df_data carry a document id next to the word, so the same word can
# occur more than once within the first 100 rows. Keep a single row per word so
# that each bar is drawn from exactly one df value instead of several rows
# landing on the same x position.
# NOTE(review): assumes df is a per-word value that does not vary by document;
# confirm against tfidf_calculator.
df_chart_data = pd_df_data.head(100).drop_duplicates(subset='word')

df_chart = alt.Chart(df_chart_data).mark_bar().encode(
    x="word:N",
    y='df:Q',
    tooltip=['df', 'word']
)

df_chart.properties(width = 800)