## Accuracy chart

In [1]:
%pip install openpyxl

import pandas as pd

# read the Excel file into a dataframe
df = pd.read_excel('/workspaces/insupply/data/output/similarity_range_analysis.xlsx')

# output the head of the dataframe
print(df.head())

Note: you may need to restart the kernel to use updated packages.
                                               query  \
0  Provision of Event Management Services   [Cat ...   
1  PROVISION OF GDC RESOURCES    EXTRANET BANDWID...   
2  WOG Translation and Related Services Period Co...   
3  WOG Research Services Period Contract cum Fram...   
4  WOG Research Services Period Contract cum Fram...   

                         expected_description  top_similarity_score  \
0                    Other Services(OOE only)                 30.01   
1  Tech Services-Network and Cabling Services                 30.14   
2                    Other Services(OOE only)                 30.17   
3                    Other Services(OOE only)                 30.21   
4                    Other Services(OOE only)                 30.21   

   is_correct                                      top_5_matches  \
0        True  Other Services-Property Management Services; T...   
1        True  Utilities-Telecommu

In [2]:
import re

# convert all queries to lowercase and remove non-alphanumeric characters
def keep_alphanumeric(df, columns=None):
    if columns is None:
        columns = df.select_dtypes(include=['object']).columns
    for column in columns:
        df[column] = df[column].str.lower().apply(lambda x: re.sub(r'\W+', ' ', x))
    return df

df_filtered = keep_alphanumeric(df, columns=['query'])

# create a new column 'word_query' with the first word from the 'query' column
df_filtered['word_query'] = df_filtered['query'].apply(lambda x: x.split()[0] if x.split() else '')

# print the first 5 rows to verify the changes
print(df_filtered.head())

                                               query  \
0  provision of event management services cat 2 h...   
1  provision of gdc resources extranet bandwidth ...   
2  wog translation and related services period co...   
3  wog research services period contract cum fram...   
4  wog research services period contract cum fram...   

                         expected_description  top_similarity_score  \
0                    Other Services(OOE only)                 30.01   
1  Tech Services-Network and Cabling Services                 30.14   
2                    Other Services(OOE only)                 30.17   
3                    Other Services(OOE only)                 30.21   
4                    Other Services(OOE only)                 30.21   

   is_correct                                      top_5_matches  \
0        True  Other Services-Property Management Services; T...   
1        True  Utilities-Telecommunications; Other Services-L...   
2        True  Staff Well-Being(

In [3]:
# Get the count of each word in 'word_query' where 'is_correct' is False
word_counts = df_filtered[df_filtered['is_correct'] == False]['word_query'].value_counts().reset_index()

# Rename the columns for better readability
word_counts.columns = ['word', 'count']

# Sort the dataframe by count in descending order
word_counts = word_counts.sort_values(by='count', ascending=False)

# Output the full dataframe
print(word_counts.head(10))

        word  count
0  provision   3931
1        wog    344
2     supply    277
3       five     81
4      three     64
5      whole     53
6  framework     37
7      video     22
8          7     22
9     period     22


In [4]:
# Check if 'is_correct' column exists in the dataframe
if 'is_correct' in df_filtered.columns:
    # Get the top 5 words with the most counts of false matches
    top_10_false_words = word_counts.head(10)

    # Initialize lists to store the counts of true matches and accuracy percentages
    true_counts = []
    accuracy_percentages = []

    # Calculate the count of true matches and accuracy percentage for each word
    for word in top_10_false_words['word']:
        true_count = df_filtered[(df_filtered['word_query'] == word) & (df_filtered['is_correct'] == True)].shape[0]
        false_count = df_filtered[(df_filtered['word_query'] == word) & (df_filtered['is_correct'] == False)].shape[0]
        total_count = true_count + false_count
        accuracy_percentage = (true_count / total_count) * 100 if total_count > 0 else 0
        true_counts.append(true_count)
        accuracy_percentages.append(accuracy_percentage)

    # Add the true counts and accuracy percentages to the dataframe
    top_10_false_words['true_count'] = true_counts
    top_10_false_words['accuracy_percentage'] = accuracy_percentages

    # Output the new dataframe
    print(top_10_false_words)
else:
    print("The 'is_correct' column does not exist in the dataframe.")

        word  count  true_count  accuracy_percentage
0  provision   3931         935            19.214961
1        wog    344         303            46.831530
2     supply    277          70            20.172911
3       five     81          15            15.625000
4      three     64           9            12.328767
5      whole     53           0             0.000000
6  framework     37          14            27.450980
7      video     22           0             0.000000
8          7     22           0             0.000000
9     period     22           5            18.518519


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_10_false_words['true_count'] = true_counts
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_10_false_words['accuracy_percentage'] = accuracy_percentages


From the given tables we can conclude that anything marked with provision 