In [8]:
import operator

def preprocess(sentence):
    """Tokenize one sentence (or query) for the search index.

    Surrounding whitespace is stripped and the remaining text is cut at
    every single space character. Case is left untouched. Because the cut
    is made on each individual space, consecutive spaces produce
    empty-string tokens and an empty sentence yields ``[""]``.

    Args:
        sentence: Raw text of a sentence or query.

    Returns:
        The list of tokens in their original order.
    """
    trimmed = sentence.strip()
    return trimmed.split(" ")

def indexing(file_name):
    """Read a text file and tokenize every line in it.

    Each line of the file is treated as one sentence and passed through
    ``preprocess``. The position of a token list in the result is the
    zero-based line number of its sentence in the file.

    Args:
        file_name: Path of the UTF-8 encoded text file to read.

    Returns:
        A list containing one token list per line, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager closes the handle even when tokenizing fails.
    with open(file_name, "r", encoding="utf8") as source_file:
        return [preprocess(line) for line in source_file]

def calc_similarity(preprocessed_query, preprocessed_sentences):
    """Score every sentence against the query with Jaccard similarity.

    Tokens on both sides are lowercased so the comparison ignores case.
    The score of a sentence is the number of distinct tokens it shares
    with the query divided by the number of distinct tokens found in
    either of them.

    Args:
        preprocessed_query: Iterable of query tokens (strings).
        preprocessed_sentences: Iterable of token iterables, one per
            sentence.

    Returns:
        A dict mapping each sentence's zero-based position to its
        similarity score between 0.0 and 1.0. When neither the query nor
        the sentence has any token, the score is 0.0.
    """
    # The query is the same for every sentence, so normalize it only once.
    query_token_set = {token.lower() for token in preprocessed_query}

    score_dict = {}
    for index, sentence_tokens in enumerate(preprocessed_sentences):
        sentence_token_set = {token.lower() for token in sentence_tokens}

        all_tokens = query_token_set | sentence_token_set    # union
        same_tokens = query_token_set & sentence_token_set   # intersection

        # The union is empty only when both sides are empty; report no
        # similarity in that case rather than dividing by zero.
        if all_tokens:
            score_dict[index] = len(same_tokens) / len(all_tokens)
        else:
            score_dict[index] = 0.0

    return score_dict

# 1. Indexing: read and tokenize every sentence in the corpus file
file_name = "jhe-koen-dev.en"
file_tokens_pairs = indexing(file_name)

# 2. Read the query from the user and tokenize it the same way as the corpus
query = input("영어 쿼리를 입력하세요.")

preprocessed_query = preprocess(query)
query_token_set = set(preprocessed_query)  # Distinct tokens of the query

# 3. Score every sentence against the query's token set
score_dict = calc_similarity(query_token_set, file_tokens_pairs)

# 4. Order the (index, score) pairs from most to least similar.
#    sorted() is stable, so sentences with equal scores keep file order.
sorted_score_list = sorted(score_dict.items(), key=operator.itemgetter(1), reverse=True)

# 5. Print the result
# An empty corpus gives an empty ranking, so check for that before
# looking at the best score.
if not sorted_score_list or sorted_score_list[0][1] == 0.0:
    print("There is no similar sentence.")
else:
    print("rank", "Index", "score", "sentence", sep="\t")
    # Show at most the ten highest-ranked sentences
    for rank, (i, score) in enumerate(sorted_score_list[:10], start=1):
        print(rank, i, score, ' '.join(file_tokens_pairs[i]), sep="\t")
    

영어 쿼리를 입력하세요.Hello My name is Youngchae
rank	Index	score	sentence
1	679	0.5	My name is Mike.
2	526	0.2857142857142857	Bob is my brother.
3	538	0.2857142857142857	My hobby is traveling.
4	453	0.25	My mother is sketching them.
5	241	0.2222222222222222	My father is running with So-ra.
6	336	0.2222222222222222	My family is at the park.
7	212	0.2	My sister Betty is waiting for me.
8	505	0.18181818181818182	My little sister Annie is five years old.
9	610	0.15384615384615385	I would raise my voice and yell, "LUNCH IS READY!"
10	190	0.14285714285714285	It is Sunday.
