In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]= "2"

import torch
import re
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np 
import jellyfish



In [2]:
# create Maroon Chat class
class MaroonChat:
    def __init__(self):
        self.base_model_id = "mistralai/Mistral-7B-v0.1"
        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )
        self.base_model = AutoModelForCausalLM.from_pretrained(
            self.base_model_id,  # Mistral, same as before
            quantization_config=self.bnb_config,  # Same quantization config as before
            device_map="auto",
            trust_remote_code=True,
            use_auth_token=False
        )
        self.eval_tokenizer = AutoTokenizer.from_pretrained(self.base_model_id, 
                                                            add_bos_token=True, 
                                                            trust_remote_code=True)
        self.model = PeftModel.from_pretrained(self.base_model, 
                                               "mistral-nlp-best/checkpoint-500-r32-alpha64")

    def generate(self, prompt):
        model_input = self.eval_tokenizer(prompt, return_tensors="pt").to("cuda")
        self.model.eval()

        with torch.no_grad():
            result = self.eval_tokenizer.decode(self.model.generate(**model_input, max_new_tokens=100, repetition_penalty=1.20)[0], skip_special_tokens=True)
            
            # filter out the prompt text
            result = re.split(r'(?<!\s\w)(?<!Hon)(?<!Dr)(?<!Mr)(?<!Mrs)(?<!Ms)(?<!No)(?<!R.A)(?<!Prof)(?<!Atty)\.', result)[0]
            
            # remove prompt text from the result
            result = result.replace(prompt, "")

            # remove ## as well
            result = result.replace("#", "")

            # remove preceding spaces
            result = result.strip()

            # remove texts after the first period
            result = re.split(r'\n', result)[0] + "."

            return result

In [3]:
mc = MaroonChat()



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# load evaluation data
chatbot = []
control = 1
df = pd.read_excel("datasets/secret_question.xlsx")
for question in df.question:
    chatbot.append(mc.generate(question))
    control += 1
    print(f"currently processing question no. {control}", end=" ")


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 2 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 3 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 4 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 5 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 6 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 7 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 8 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 9 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 10 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 11 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 12 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 13 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 14 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 15 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 16 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 17 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 18 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 19 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 20 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 21 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 22 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 23 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 24 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 25 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 26 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 27 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 28 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 29 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 30 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 31 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 32 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 33 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 34 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 35 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 36 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 37 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 38 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 39 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 40 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 41 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 42 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 43 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 44 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 45 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 46 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 47 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 48 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 49 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 50 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 51 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 52 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 53 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 54 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 55 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 56 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 57 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 58 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 59 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 60 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 61 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 62 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 63 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 64 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 65 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 66 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 67 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 68 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 69 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 70 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 71 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 72 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 73 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 74 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 75 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 76 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 77 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 78 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 79 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 80 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 81 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 82 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 83 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 84 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 85 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 86 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 87 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 88 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 89 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 90 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 91 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 92 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 93 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 94 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 95 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 96 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 97 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 98 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 99 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 100 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 101 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 102 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 103 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 104 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 105 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 106 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 107 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 108 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 109 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 110 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 111 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 112 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 113 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 114 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 115 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 116 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 117 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 118 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 119 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 120 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 121 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 122 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 123 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 124 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 125 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 126 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 127 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 128 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 129 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 130 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 131 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 132 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 133 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 134 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 135 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 136 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 137 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 138 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 139 

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


currently processing question no. 140 currently processing question no. 141 

In [5]:
chatbot

['The Oblation Statue was created by National Artist Guillermo E. Tolentino, a sculptor and professor at UP.',
 'The first one was constructed on February 28, 1937.',
 'Dr. Joel Joseph Marciano, Jr.',
 'Dr. Joel Joseph Marciano, Jr.',
 'The College of Science (COS) was created on February 12, 1954 by virtue of an Executive Order signed by then President Vicente Sinco.',
 'Engr.',
 'Prof. Alfredo E. Pascual was the first instructor assigned for the BS Civil Engineering program in UP Diliman.',
 'The College of Engineering (COE) started offering BS Chemical Engineering, BS Civil Engineering, and BS Electrical Engineering programs.',
 '102.',
 '1,702.',
 'The name of the person mentioned in the question is Dr. Maria Theresa T. Pascual.',
 'The FabLab is open to all UP students, faculty and staff.',
 'Which campus of UP has the email address epp.',
 'The UP PGC was established on February 12, 2015.',
 'TBA means “to be announced.',
 'The current UP Seal, which is a stylized version of the 

In [6]:
# add another column in df name chatbot 
df["prediction"] = chatbot

In [7]:
# save the result to mistra_rating_chatbot.xlsx
df.to_excel("datasets/hidden_generated_chatbot.xlsx", index=False)

In [8]:
result = pd.read_excel("datasets/hidden_generated_chatbot.xlsx")
result

Unnamed: 0,question,reference,prediction
0,Who made the oblation statue?,Guillermo Estrella Tolentino,The Oblation Statue was created by National Ar...
1,What year was the first sundial in UP Diliman ...,1957,"The first one was constructed on February 28, ..."
2,Who is the head of the Ubiquitous Computing La...,"Rowel Atienza, PhD","Dr. Joel Joseph Marciano, Jr."
3,Who is the head of the Digital Signal Processi...,Carl Timothy Tolentino,"Dr. Joel Joseph Marciano, Jr."
4,When was the College of Science in UP Diliman ...,1983-10-26 00:00:00,The College of Science (COS) was created on Fe...
...,...,...,...
135,Which field is UP Manila known for,Health Sciences,?.
136,What is Quezon Hall?,Quezon Hall is the main administration buildin...,"Quezon Hall, also known as UP Administration B..."
137,Where is the location of UP Cebu?,"UP Cebu is located in Gorordo Avenue, Lahug, C...",The University of the Philippines Cebu Campus ...
138,Who heads the UP Resilience Institute?,Dr. Mahar Lagmay,Dr. Alfredo Mahar Francisco A. Montecillo is t...


In [19]:
from nltk.translate import meteor_score
import evaluate

bleurt = evaluate.load("bleurt", module_type="metric")

ImportError: To be able to use evaluate-metric/bleurt, you need to install the following dependencies['bleurt'] using 'pip install git+https://github.com/google-research/bleurt.git' for instance'

In [11]:
embedding_model_name = "BAAI/bge-small-en-v1.5"
embedding_model = SentenceTransformer(embedding_model_name)

def calculate_similarity(reference, candidate):
    """
    Calculates text similarity between a reference and a candidate using both sentence embeddings and n-gram matching.

    Args:
        reference (str): The reference text string.
        candidate (str): The candidate text string.

    Returns:
        tuple: A tuple containing the exact match score and the n-gram matching score.
    """

    # Get embeddings for both texts
    embedding1 = embedding_model.encode(str(reference), convert_to_tensor=True)
    embedding2 = embedding_model.encode(str(candidate), convert_to_tensor=True)
    
    # Move tensors to CPU
    embedding1 = embedding1.cpu()
    embedding2 = embedding2.cpu()

    # Calculate cosine similarity using sentence embeddings
    # Embeddings: Understands meaning, good for similar ideas with different words.
    similarity_embeddings = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))

    # Tokenize the strings into words
    # Normalized Levenshtein Score (1-0)
    # Levenshtein: Counts edits, good for word order and typos, strict on word choice.
    levenshtein_score = 1 - (jellyfish.levenshtein_distance(str(reference), str(candidate)) / max(len(str(reference)), len(str(candidate))))

    # Normalized Jaro-Winkler Score (1-0)
    # Jaro-Winkler: Forgives some errors, good for names or addresses, balances exactness and flexibility.
    jaro_winkler_score = jellyfish.jaro_winkler_similarity(str(reference), str(candidate))

    # meteor score computation
    # METEOR score computation
    meteor_score_value = meteor_score.single_meteor_score(reference, candidate)

    # BLEURT score computation
    bleurt_score_value = bleurt_scorer.score([reference], [candidate])[0]

    return similarity_embeddings, levenshtein_score, jaro_winkler_score


In [13]:
df_mc = pd.read_excel("datasets/hidden_generated_chatbot.xlsx")
df_mc

Unnamed: 0,question,reference,prediction
0,Who made the oblation statue?,Guillermo Estrella Tolentino,The Oblation Statue was created by National Ar...
1,What year was the first sundial in UP Diliman ...,1957,"The first one was constructed on February 28, ..."
2,Who is the head of the Ubiquitous Computing La...,"Rowel Atienza, PhD","Dr. Joel Joseph Marciano, Jr."
3,Who is the head of the Digital Signal Processi...,Carl Timothy Tolentino,"Dr. Joel Joseph Marciano, Jr."
4,When was the College of Science in UP Diliman ...,1983-10-26 00:00:00,The College of Science (COS) was created on Fe...
...,...,...,...
135,Which field is UP Manila known for,Health Sciences,?.
136,What is Quezon Hall?,Quezon Hall is the main administration buildin...,"Quezon Hall, also known as UP Administration B..."
137,Where is the location of UP Cebu?,"UP Cebu is located in Gorordo Avenue, Lahug, C...",The University of the Philippines Cebu Campus ...
138,Who heads the UP Resilience Institute?,Dr. Mahar Lagmay,Dr. Alfredo Mahar Francisco A. Montecillo is t...


In [14]:
similarity_score, levenshtein_score, jaro_winkler_score = [], [], []

for (ground_truth, chatbot) in zip(df_mc["reference"], df_mc["prediction"]):
    similarity_embeddings, levenshtein, jaro_winkler = calculate_similarity(ground_truth, chatbot)
    similarity_score.append(similarity_embeddings)
    levenshtein_score.append(levenshtein)
    jaro_winkler_score.append(jaro_winkler)


In [15]:
df_mc["similarity_score"] = similarity_score
df_mc["levenshtein_score"] = levenshtein_score
df_mc["jaro_winkler_score"] = jaro_winkler_score

df_mc.to_excel("datasets/hidden_rating.xlsx", index=False)

In [16]:
df_mc = pd.read_excel("datasets/hidden_rating.xlsx")
df_mc.head(15)

Unnamed: 0,question,reference,prediction,similarity_score,levenshtein_score,jaro_winkler_score
0,Who made the oblation statue?,Guillermo Estrella Tolentino,The Oblation Statue was created by National Ar...,0.68649,0.179245,0.599955
1,What year was the first sundial in UP Diliman ...,1957,"The first one was constructed on February 28, ...",0.540765,0.058824,0.0
2,Who is the head of the Ubiquitous Computing La...,"Rowel Atienza, PhD","Dr. Joel Joseph Marciano, Jr.",0.559672,0.241379,0.542262
3,Who is the head of the Digital Signal Processi...,Carl Timothy Tolentino,"Dr. Joel Joseph Marciano, Jr.",0.558166,0.172414,0.535528
4,When was the College of Science in UP Diliman ...,1983-10-26 00:00:00,The College of Science (COS) was created on Fe...,0.518008,0.022556,0.300251
5,Who was the first dean of the College of Engin...,Mr. W.J. Colbert,Engr.,0.448857,0.0625,0.508333
6,Who was the first instructor to be appointed f...,Mr. Jose P. Katigbak,Prof. Alfredo E. Pascual was the first instruc...,0.507213,0.118182,0.495455
7,What was the first program offered by the Coll...,BS Civil Engineering,The College of Engineering (COE) started offer...,0.676403,0.147059,0.538674
8,What is the total number of faculty in the Col...,2966,102.,0.580864,0.0,0.0
9,What is the total number of students in the Co...,27536,1702.,0.610263,0.166667,0.455556


In [17]:
# calculate average scores 
print(f"Average Similarity Score: {df_mc.similarity_score.mean()}")
print(f"Average Levenshtein Score: {df_mc.levenshtein_score.mean()}")
print(f"Average Jaro-Winkler Score: {df_mc.jaro_winkler_score.mean()}")

Average Similarity Score: 0.6513097275580678
Average Levenshtein Score: 0.17286649518778568
Average Jaro-Winkler Score: 0.5121258587716855
