In [37]:
!python app.py

]11;?\[6n * Serving Flask app 'app'
 * Debug mode: on
 * Running on http://127.0.0.1:5000
[33mPress CTRL+C to quit[0m
 * Restarting with watchdog (inotify)
 * Debugger is active!
 * Debugger PIN: 106-947-544
^C


In [35]:
!python ./indexing.py

]11;?\[6nIndexing completed and files saved.


In [22]:
#@title load and clean
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from IPython.core.display import display, HTML

# Load the corpus; the path is relative to the notebook's working directory.
df = pd.read_csv("./dataset-indo.csv")

def clean_text(text):
    """Normalise a value for indexing.

    Every character that is not an ASCII letter is replaced by a single
    space and the result is lower-cased. Anything that is not a ``str``
    yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text)
    return letters_only.lower()

# Concatenate every column of a row into one text field. Missing values are
# dropped first so that NaN cells do not end up in the text as the literal
# token "nan", which would otherwise be indexed like a real word.
df['combined_content'] = df.apply(
    lambda row: ' '.join(row.dropna().astype(str)), axis=1
)
# Normalise the combined text (ASCII letters only, lower-cased) for the vectorizers.
df['cleaned_content'] = df['combined_content'].apply(clean_text)
df.head()

In [23]:
#@title query likelihood+smoothing

# Fit a bag-of-words model on the cleaned text and keep the raw term counts
# as a dense array; `vocab` holds the feature names in column order.
count_vectorizer = CountVectorizer()
tf_matrix = count_vectorizer.fit_transform(df['cleaned_content']).toarray()
vocab = count_vectorizer.get_feature_names_out()

# Collection-level term statistics, currently disabled:
# collection_freq = np.sum(tf_matrix, axis=0)
# collection_size = np.sum(collection_freq)

def add_one_smoothing(tf, doc_length, vocab_size=None):
    """Return the add-one (Laplace) smoothed probability of a term.

    Computes ``(tf + 1) / (doc_length + vocab_size)``.

    Args:
        tf: Term frequency in the document (scalar or NumPy array).
        doc_length: Total number of term occurrences in the document.
        vocab_size: Number of distinct terms in the vocabulary. When omitted,
            the size of the notebook-level ``vocab`` is used, which matches
            the previous behaviour of this function.

    Returns:
        The smoothed probability, with the same shape as ``tf``.
    """
    if vocab_size is None:
        # Fall back to the global vocabulary built by the count vectorizer.
        vocab_size = len(vocab)
    return (tf + 1) / (doc_length + vocab_size)

# Term-count table for inspection. The column labels reuse the vocabulary
# already extracted from the vectorizer instead of querying it a second time.
tf_df = pd.DataFrame(tf_matrix, columns=vocab)
# Show a preview through the notebook's rich display rather than printing the frame.
tf_df.head()

In [28]:
#@title tfidf+cosine

# Fit TF-IDF weights on the cleaned text; both objects are reused by tfidfcosine().
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_content'])

def tfidfcosine(query):
    """Score every indexed document against ``query``.

    The query is projected with the fitted ``tfidf_vectorizer`` and compared
    to ``tfidf_matrix`` by cosine similarity.

    Returns:
        A flat array of similarity scores, one entry per row of ``tfidf_matrix``.
    """
    query_vector = tfidf_vectorizer.transform([query])
    similarities = cosine_similarity(query_vector, tfidf_matrix)
    return similarities.flatten()

# Dense copy of the TF-IDF matrix for inspection and export. toarray() yields
# a plain ndarray, whereas todense() returns the legacy numpy.matrix type.
tfidf_dense = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(tfidf_dense, columns=tfidf_vectorizer.get_feature_names_out())

# Preview the frame, then a slice of 100 vocabulary columns (positions 100-199)
# and the first document's weights for that slice.
display(tfidf_df.head())
print(tfidf_df.columns[100:200])
display(tfidf_df.iloc[:1, 100:200])
# Persist the full dense table to disk.
tfidf_df.to_csv('filename.csv', index=False)


In [25]:
# Free-text queries to evaluate; the results cell iterates over this list.
queries = [
    "demam batuk sakit tenggorokan"
]

In [26]:
#@title Printing the results
import time
def get_content_with_query_highlight(content, query, window=100):
    """Return an HTML snippet of ``content`` centred on the first query term.

    Only the first whitespace-separated term of ``query`` is searched for. It
    is matched case-insensitively as a whole word, so a hit at the very start
    or end of ``content`` is found as well. The matched term is wrapped in
    ``<mark>`` tags and up to ``window`` characters of context are kept on
    each side.

    Args:
        content: Text to extract the snippet from. It is inserted verbatim,
            so callers passing raw (uncleaned) text should HTML-escape it first.
        query: Free-text query; only its first term is highlighted.
        window: Number of context characters kept before and after the match.

    Returns:
        The snippet with the match highlighted, or the first ``window``
        characters of ``content`` followed by ``"..."`` when there is no match.
    """
    terms = query.split()
    match = None
    if terms:
        # Look-arounds enforce whole-word matching without requiring literal
        # spaces around the term (which missed matches at the text boundaries).
        pattern = r'(?<!\w)' + re.escape(terms[0].lower()) + r'(?!\w)'
        match = re.search(pattern, content, flags=re.IGNORECASE)

    if match is None:
        return content[:window] + "..."

    # The span is that of the matched term, not of the whole query string.
    hit_start, hit_end = match.span()
    start = max(0, hit_start - window)
    end = min(len(content), hit_end + window)

    return (
        content[start:hit_start]
        + f"<mark>{content[hit_start:hit_end]}</mark>"
        + content[hit_end:end]
    )


# Rank the corpus with both models for each query and render the two top-10
# lists side by side as an HTML table.
for query in queries:

    t1 = time.time()
    cosine_sim_scores = tfidfcosine(query)
    t2 = time.time()
    # NOTE(review): query_likelihood_with_smoothing is not defined in any
    # visible cell; it must be defined before this loop can run on a fresh kernel.
    query_scores = query_likelihood_with_smoothing(query, tf_matrix, vocab)
    t3 = time.time()

    # Indices of the ten best-scoring documents, best first.
    top_cosine_indices = np.argsort(cosine_sim_scores)[::-1][:10]
    top_likelihood_indices = np.argsort(query_scores)[::-1][:10]

    html_output = f"""
    <h3>Top 10 documents for query '{query}'</h3>
    <table style="width:100%; border-collapse: collapse; text-align: left;">
        <tr>
            <th style="width: 50%; border: 1px solid #ddd; padding: 8px;">Cosine Similarity (time: {t2-t1})</th>
            <th style="width: 50%; border: 1px solid #ddd; padding: 8px;">Query Likelihood (time: {t3-t2})</th>
        </tr>
    """

    # Iterate over the index arrays themselves so a corpus with fewer than
    # ten documents does not raise IndexError.
    for cosine_idx, likelihood_idx in zip(top_cosine_indices, top_likelihood_indices):

        cosine_title = df.iloc[cosine_idx, 2]
        cosine_score = cosine_sim_scores[cosine_idx]
        cosine_content = df.iloc[cosine_idx]['cleaned_content']
        cosine_preview = get_content_with_query_highlight(cosine_content, query)

        likelihood_title = df.iloc[likelihood_idx, 2]
        likelihood_score = query_scores[likelihood_idx]
        # The preview must come from the likelihood-ranked document, not the
        # cosine-ranked one shown in the other column.
        likelihood_content = df.iloc[likelihood_idx]['cleaned_content']
        likelihood_preview = get_content_with_query_highlight(likelihood_content, query)

        html_output += f"""
        <tr>
            <td style="border: 1px solid #ddd; padding: 8px;">
                <strong>Index:</strong> {cosine_idx}<br>
                <strong>Title:</strong> {cosine_title}<br>
                <strong>Content: </strong> {cosine_preview}...<br>
                <strong>Score:</strong> {cosine_score:.10f}
            </td>
            <td style="border: 1px solid #ddd; padding: 8px;">
                <strong>Index:</strong> {likelihood_idx}<br>
                <strong>Title:</strong> {likelihood_title}<br>
                <strong>Content: </strong> {likelihood_preview}...<br>
                <strong>Score:</strong> {likelihood_score:.10f}
            </td>
        </tr>
        """

    html_output += "</table>"

    display(HTML(html_output))
#@title Printing the results
import time
def get_content_with_query_highlight(content, query, window=100):
    """Return an HTML snippet of ``content`` centred on the first query term.

    Only the first whitespace-separated term of ``query`` is searched for. It
    is matched case-insensitively as a whole word, so a hit at the very start
    or end of ``content`` is found as well. The matched term is wrapped in
    ``<mark>`` tags and up to ``window`` characters of context are kept on
    each side.

    Args:
        content: Text to extract the snippet from. It is inserted verbatim,
            so callers passing raw (uncleaned) text should HTML-escape it first.
        query: Free-text query; only its first term is highlighted.
        window: Number of context characters kept before and after the match.

    Returns:
        The snippet with the match highlighted, or the first ``window``
        characters of ``content`` followed by ``"..."`` when there is no match.
    """
    terms = query.split()
    match = None
    if terms:
        # Look-arounds enforce whole-word matching without requiring literal
        # spaces around the term (which missed matches at the text boundaries).
        pattern = r'(?<!\w)' + re.escape(terms[0].lower()) + r'(?!\w)'
        match = re.search(pattern, content, flags=re.IGNORECASE)

    if match is None:
        return content[:window] + "..."

    # The span is that of the matched term, not of the whole query string.
    hit_start, hit_end = match.span()
    start = max(0, hit_start - window)
    end = min(len(content), hit_end + window)

    return (
        content[start:hit_start]
        + f"<mark>{content[hit_start:hit_end]}</mark>"
        + content[hit_end:end]
    )


# Rank the corpus by TF-IDF cosine similarity for each query and render the
# top-10 list as an HTML table.
for query in queries:

    t1 = time.time()
    cosine_sim_scores = tfidfcosine(query)
    t2 = time.time()

    # Indices of the ten best-scoring documents, best first.
    top_cosine_indices = np.argsort(cosine_sim_scores)[::-1][:10]

    html_output = f"""
    <h3>Top 10 documents for query '{query}'</h3>
    <table style="width:100%; border-collapse: collapse; text-align: left;">
        <tr>
            <th style="width: 50%; border: 1px solid #ddd; padding: 8px;">Cosine Similarity (time: {t2-t1})</th>
        </tr>
    """

    # Iterate over the index array itself so a corpus with fewer than ten
    # documents does not raise IndexError.
    for cosine_idx in top_cosine_indices:

        cosine_title = df.iloc[cosine_idx, 2]
        cosine_score = cosine_sim_scores[cosine_idx]
        cosine_content = df.iloc[cosine_idx]['cleaned_content']
        cosine_preview = get_content_with_query_highlight(cosine_content, query)

        html_output += f"""
        <tr>
            <td style="border: 1px solid #ddd; padding: 8px;">
                <strong>Index:</strong> {cosine_idx}<br>
                <strong>Title:</strong> {cosine_title}<br>
                <strong>Content: </strong> {cosine_preview}...<br>
                <strong>Score:</strong> {cosine_score:.10f}
            </td>
        </tr>
        """

    html_output += "</table>"

    display(HTML(html_output))