# In [10]:
import operator

# Preprocessing function
def preprocess(sentence):
    """Normalize a sentence and break it into tokens.

    Leading and trailing whitespace is removed, the text is lowercased, and
    the result is split on single space characters. Because the delimiter is
    exactly one space, consecutive spaces yield empty-string tokens and an
    empty sentence yields ``[""]``.
    """
    trimmed = sentence.strip()
    lowered = trimmed.lower()
    return lowered.split(" ")

# File indexing function.
def indexing(file_name):
    """Tokenize every line of a UTF-8 text file.

    Args:
        file_name: Path of the text file to read.

    Returns:
        A list with one entry per line of the file, in file order; each
        entry is the token list that ``preprocess`` produces for that line.
    """
    file_tokens_pairs = []

    # The context manager guarantees the file handle is closed, even if
    # preprocessing raises; iterating the handle yields the same lines as
    # readlines() without holding the raw text in memory.
    with open(file_name, "r", encoding="utf8") as source_file:
        for line in source_file:
            tokens = preprocess(line)
            file_tokens_pairs.append(tokens)
            print(tokens)  # Debug trace of each tokenized line.

    return file_tokens_pairs

# Similarity calculation function.
def _normalized_token_set(tokens):
    """Return the set of lowercased, non-empty tokens for a token sequence."""
    # preprocess() already strips and lowercases, so no extra lower() is needed.
    token_set = set(preprocess(' '.join(tokens)))
    # Splitting on a single space turns consecutive spaces (or an empty
    # string) into "" tokens; they are not words and must not count as matches.
    token_set.discard("")
    return token_set


def calc_similarity(preprocessed_query, preprocessed_sentences):
    """Score each sentence against the query by token-set overlap.

    The score is the number of distinct tokens shared by the query and the
    sentence divided by the number of distinct tokens appearing in either
    (intersection over union). Matching is case-insensitive.

    Args:
        preprocessed_query: Sequence of query token strings.
        preprocessed_sentences: Sequence of token sequences, one per sentence.

    Returns:
        A dict mapping each sentence's position in ``preprocessed_sentences``
        to its similarity score, a float between 0.0 and 1.0. A pair with no
        tokens at all (empty query and empty sentence) scores 0.0.
    """
    score_dict = {}
    query_tokens = _normalized_token_set(preprocessed_query)

    for i, sentence in enumerate(preprocessed_sentences):
        sentence_tokens = _normalized_token_set(sentence)

        all_tokens = query_tokens | sentence_tokens
        if not all_tokens:
            # Nothing to compare: avoid dividing by zero and report no overlap.
            score_dict[i] = 0.0
            continue

        same_tokens = query_tokens & sentence_tokens
        score_dict[i] = len(same_tokens) / len(all_tokens)

    return score_dict

# 1. Indexing
## Source: https://github.com/jungyeul/korean-parallel-corpora
file_name = "jhe-koen-dev.en"
# Use a context manager so the corpus file handle is closed after reading.
with open(file_name, "r", encoding="utf8") as corpus_file:
    lines = corpus_file.readlines()
# Tokens keep their original casing here because they are joined back together
# for display in step 5; calc_similarity lowercases them before comparing.
file_tokens_pairs = [line.strip().split(" ") for line in lines]

# 2. Input the query
query = input("Enter an English query: ")  # Prompt the user to enter an English query.
preprocessed_query = query.strip().split(" ")  # Trim the query and split it into tokens.
# NOTE(review): not read anywhere in this cell; kept in case a later cell uses it.
query_token_set = set(preprocessed_query)

# 3. Calculate similarities based on a common token set
score_dict = calc_similarity(preprocessed_query, file_tokens_pairs)

# 4. Sort the similarity list
# Highest score first; sorted() is stable, so ties keep corpus order.
sorted_score_list = sorted(score_dict.items(), key=operator.itemgetter(1), reverse=True)

# 5. Print the result
max_results = 10  # Number of top-ranked sentences to display.
# An empty corpus produces an empty list, so check for that before indexing [0].
if not sorted_score_list or sorted_score_list[0][1] == 0.0:
    print("There is no similar sentence.")
else:
    print("Rank", "Index", "Score", "Sentence", sep="\t")  # Print table headers.
    for rank, (i, score) in enumerate(sorted_score_list[:max_results], start=1):
        print(rank, i, score, ' '.join(file_tokens_pairs[i]), sep="\t")  # Print the ranked results.


# Sample run (notebook cell output):
# Enter an English query: Hello My name is Subin Ahn
# Rank	Index	Score	Sentence
# 1	679	0.42857142857142855	My name is Mike.
# 2	526	0.25	Bob is my brother.
# 3	538	0.25	My hobby is traveling.
# 4	453	0.2222222222222222	My mother is sketching them.
# 5	241	0.2	My father is running with So-ra.
# 6	336	0.2	My family is at the park.
# 7	212	0.18181818181818182	My sister Betty is waiting for me.
# 8	505	0.16666666666666666	My little sister Annie is five years old.
# 9	610	0.14285714285714285	I would raise my voice and yell, "LUNCH IS READY!"
# 10	190	0.125	It is Sunday.
