In [15]:
# Absolute path of the text file to analyse (e.g. r'd:\test\text_sample.txt').
# Cache files will be created in the same folder as this file.
file_path = r"d:\tf_df_test\alice_in_wonderland_full.txt"

In [16]:
from document_parser import parser
from tf_idf_calculator import tfidf_calculator
from data_cache_repository import cache_repository
from document_loader_service import document_loader_service

import logging
from time import gmtime

import altair as alt
import pandas as pd

#region configure logging
# Format %(asctime)s in UTC (gmtime) instead of local time; the millisecond
# suffix is produced separately by %(msecs)03d in the format string below.
logging.Formatter.converter = gmtime
logging.basicConfig(
    level=logging.INFO,
    filename='tfidf_app_log.log',
    filemode='w',  # the log file is overwritten each time logging is configured
    datefmt='%d/%m/%Y %H:%M:%S',
    format='%(asctime)s.%(msecs)03d | %(name)s | %(message)s',
)
logger = logging.getLogger(__name__)
#endregion

In [17]:
# region initialize calculator and get data
# Collaborators handed to the loader service: a cache repository and a document parser.
dc = cache_repository()
dp = parser()

# Disabled: optional parser configuration with a custom document separator
# (presumably the marker on which the text is split into documents — TODO confirm).
#sep = '[new chapter]'
#dp.config(document_separator = '[new chapter]')

# Load the text at file_path through the loader service.
# NOTE(review): the meaning of the positional `False` is not visible from this
# notebook — confirm it against document_loader_service.load_file.
ld = document_loader_service(dc, dp)
data = ld.load_file(file_path, False)

calc = tfidf_calculator(data)

# NOTE(review): both `None` arguments are opaque here — confirm their meaning
# in tfidf_calculator.build_tf_idf_data.
tfidf = calc.build_tf_idf_data(None,None)
# Read after build_tf_idf_data(); presumably populated by that call — TODO confirm.
tf = calc.tf_data
df = calc.df_data
# endregion 



In [18]:
# Long-format TF-IDF rows that feed the heatmap:
# the top `top_w` words for each of the requested documents.
flatten_tfidf = calc.flatten_data(['chapter 1', 'chapter 2'], top_w=10)
pd_tfidf_data = pd.DataFrame(
    flatten_tfidf,
    columns=['id', 'title', 'word', 'tfidf'],
)
pd_tfidf_data.head()

Unnamed: 0,id,title,word,tfidf
0,0,Document: 0,bats,0.004643
1,0,Document: 0,key,0.003885
2,0,Document: 0,dark,0.003482
3,0,Document: 0,poison,0.003482
4,0,Document: 0,candle,0.003482


In [19]:

# Document-frequency table: one row per word, with the word in the
# "word" column and its DF value in the "df" column.
pd_df_data = pd.DataFrame(dict(
    word=[*df.keys()],
    df=[*df.values()],
))

pd_df_data.head(10)

Unnamed: 0,word,df
0,alice,1.0
1,was,1.0
2,beginning,0.666667
3,to,1.0
4,get,0.916667
5,very,1.0
6,tired,0.333333
7,of,1.0
8,sitting,0.5
9,by,1.0


In [33]:

# TF-IDF heatmap: one row per document title, one cell per word, positioned by
# the word's TF-IDF rank inside its document.
# https://altair-viz.github.io/gallery/layered_heatmap_text.html

# important words whose labels are drawn in mark_color instead of black
mark_words = ['alice', 'book']
mark_color='#77ff33'

# Shared base layer for the rectangles and the text labels.
# The window transform ranks the rows of each document (groupby id) by
# descending tfidf, using descending word as the tie-breaker, so every
# document gets its own rank sequence on the x axis.
base_heatmap = alt.Chart(
        pd_tfidf_data,
        height=200,
        title="TF-IDF heatmap of words per document"
    ).encode(x = alt.X('rank:N'),
             # NOTE(review): sort='-x' orders the document rows by the x channel
             # (rank), not by document id — confirm this is the intended row order.
             y = alt.Y('title:O',sort='-x'),
             tooltip=['tfidf', 'word']
).transform_window(
    rank = "rank()",
    sort = [
        alt.SortField("tfidf", order="descending"),
        alt.SortField("word", order="descending"),
        ],
    groupby = ["id"],
)

# Cell background: light-to-strong red gradient driven by the tfidf value.
# Only an explicit `range` is supplied: `range` and `scheme` are alternative
# ways to define a colour scale's output, so they must not be combined.
heatmap = base_heatmap.mark_rect(opacity=0.6).encode(
    alt.Color(
        'tfidf:Q',
        scale=alt.Scale(
            range=['#ffcccc', '#ff0000'],
            interpolate='rgb',
        ),
    )
)

# Word labels drawn on top of the cells; words listed in mark_words are
# highlighted with mark_color, every other word is black.
heatmap_text = base_heatmap.mark_text(baseline='middle').encode(
    alt.Text('word:N'),
    color = alt.condition(
        alt.FieldOneOfPredicate(field='word', oneOf = mark_words),
        alt.value(mark_color),
        alt.value('#000')
    ),
)

(heatmap + heatmap_text).properties(width=1000)


     id         title         word     tfidf
0     0   Document: 0         bats  0.004643
1     0   Document: 0          key  0.003885
2     0   Document: 0         dark  0.003482
3     0   Document: 0       poison  0.003482
4     0   Document: 0       candle  0.003482
..   ..           ...          ...       ...
115  11  Document: 11        dream  0.004262
116  11  Document: 11  unimportant  0.003546
117  11  Document: 11       slates  0.003410
118  11  Document: 11        queen  0.002968
119  11  Document: 11     jury-box  0.002557

[120 rows x 4 columns]


In [21]:
# Horizontal bar chart of document frequency for the first 100 rows of pd_df_data
# https://altair-viz.github.io/gallery/bar_chart_horizontal.html

df_chart = (
    alt.Chart(pd_df_data.head(100))
    .mark_bar()
    .encode(x="df:Q", y='word:N')
)
df_chart