# TF-IDF demo

In [None]:
import h2o

# Initialise H2O; the cells below build H2OFrames and call tf_idf.
h2o.init()

## Data

Data sources:

* https://github.com/h2oai/h2o-3
* https://en.wikipedia.org/wiki/Ice_hockey
* https://en.wikipedia.org/wiki/Antibody

In [None]:
# Three short passages, one per data source listed above.
documents = [
    'H2O is an in-memory platform for distributed, scalable machine learning. H2O uses familiar interfaces like R, Python, Scala, Java, JSON and the Flow notebook/web interface, and works seamlessly with big data technologies like Hadoop and Spark.',
    'Ice hockey is a contact team sport played on ice, usually in a rink, in which two teams of skaters use their sticks to shoot a vulcanized rubber puck into their opponent\'s net to score goals. The sport is known to be fast-paced and physical.',
    'An antibody (Ab), also known as an immunoglobulin (Ig), is a large, Y-shaped protein produced mainly by plasma cells that is used by the immune system to neutralize pathogens such as pathogenic bacteria and viruses.'
]
# Each document's ID is simply its position in the list: 0, 1, 2, ...
doc_ids = [position for position, _ in enumerate(documents)]

# Two-column frame: numeric DocID alongside the raw document text.
input_frame = h2o.H2OFrame(
    {'DocID': doc_ids, 'Document': documents},
    column_types=['numeric', 'string'],
)
input_frame.head()

## TF-IDF with pre-processing

In [None]:
from h2o.information_retrieval.tf_idf import tf_idf

# Compute TF-IDF on the raw document frame, leaving tf_idf's optional arguments at their defaults.
tf_idf_out = tf_idf(input_frame)
# Preview the first rows of the result.
tf_idf_out.head()

In [None]:
# Number of rows shown at each end of a document's TF-IDF ranking.
VALUES_CNT_TO_SHOW = 3

def tf_idf_output_summary(tf_idf_out):
    """Print the highest and lowest TF-IDF rows of every document.

    For each ID in the notebook-level ``doc_ids`` list, the rows of
    ``tf_idf_out`` whose ``DocID`` equals that ID are sorted by the
    ``TF-IDF`` column; the last and the first ``VALUES_CNT_TO_SHOW`` rows of
    that ordering are then shown with ``display``.

    :param tf_idf_out: frame with at least ``DocID`` and ``TF-IDF`` columns,
        supporting boolean-mask indexing, ``sort(by=...)``, ``head`` and
        ``tail`` (here: the result of ``tf_idf``).
    """
    for doc_id in doc_ids:
        doc_rows = tf_idf_out[tf_idf_out['DocID'] == doc_id]
        sorted_doc_tf_idfs = doc_rows.sort(by='TF-IDF')
        # doc_id is an int, so it is formatted into the message; concatenating
        # it to a str with `+` raises TypeError.
        print('The highest TF-IDF values for document {}:'.format(doc_id))
        # The highest values sit at the tail of the sorted rows.
        display(sorted_doc_tf_idfs.tail(VALUES_CNT_TO_SHOW))
        print('The lowest TF-IDF values for document {}:'.format(doc_id))
        display(sorted_doc_tf_idfs.head(VALUES_CNT_TO_SHOW))
        print('\n')

In [None]:
# Show the highest and lowest TF-IDF rows per document for the current tf_idf_out.
tf_idf_output_summary(tf_idf_out)

## TF-IDF without pre-processing

In [None]:
# Flatten the documents into (document index, word) pairs, splitting each
# document on whitespace.
preprocessed_data = [
    (index, token)
    for index, text in enumerate(documents)
    for token in text.split()
]

# One row per pair; the word column keeps the name 'Document'.
preprocessed_input_frame = h2o.H2OFrame(
    preprocessed_data,
    column_names=['DocID', 'Document'],
    column_types=['numeric', 'string'],
)
preprocessed_input_frame.head()

In [None]:
# The input frame here already holds one word per row, and tf_idf is called with preprocess=False.
# Note: this rebinds tf_idf_out, replacing the result of the earlier run.
tf_idf_out = tf_idf(preprocessed_input_frame, preprocess=False)
# Preview the first rows of the result.
tf_idf_out.head()

In [None]:
# Summarise the result of the preprocess=False run (the current tf_idf_out).
tf_idf_output_summary(tf_idf_out)

## Case-insensitive TF-IDF

In [None]:
# Same raw document frame as the first run, this time with case_sensitive=False.
# Note: this rebinds tf_idf_out, replacing the result of the earlier run.
tf_idf_out = tf_idf(input_frame, case_sensitive=False)
# Preview the first rows of the result.
tf_idf_out.head()

In [None]:
# Summarise the result of the case_sensitive=False run (the current tf_idf_out).
tf_idf_output_summary(tf_idf_out)