In [44]:
import pandas as pd
import hashlib, json

In [45]:
df = pd.read_csv('./data/Cloud_Provider_Services.csv')

In [46]:
df.head()

Unnamed: 0,cloud_provider,category,service,description
0,aws,Analytics,Amazon Athena,An interactive query service that makes it eas...
1,aws,Analytics,Amazon CloudSearch,A managed service in the AWS Cloud that makes ...
2,aws,Analytics,Amazon Elastic MapReduce,A web service that makes it easy to process la...
3,aws,Analytics,Amazon Elasticsearch Service,A managed service that makes it easy to deploy...
4,aws,Analytics,Amazon EMR,A cloud big data platform for processing vast ...


In [47]:
def generate_hash_key(description, category, service, provider):
    # Concatenate the input parameters
    concatenated_string = f"{description}-{category}-{service}-{provider}"
    
    # Generate MD5 hash of the concatenated string
    hash_object = hashlib.md5(concatenated_string.encode())
    
    # Convert the hash to a hexadecimal string and truncate to 8 characters
    hash_key = hash_object.hexdigest()[:8]
    
    return hash_key

# Apply the generate_hash_key function to each row
df['id'] = df.apply(lambda row: generate_hash_key(row['description'], row['category'], row['service'], row["cloud_provider"]), axis=1)
df.head()

Unnamed: 0,cloud_provider,category,service,description,id
0,aws,Analytics,Amazon Athena,An interactive query service that makes it eas...,95825a76
1,aws,Analytics,Amazon CloudSearch,A managed service in the AWS Cloud that makes ...,4d78b518
2,aws,Analytics,Amazon Elastic MapReduce,A web service that makes it easy to process la...,1f41d3e8
3,aws,Analytics,Amazon Elasticsearch Service,A managed service that makes it easy to deploy...,cd65ebc2
4,aws,Analytics,Amazon EMR,A cloud big data platform for processing vast ...,65e1ebb7


In [48]:
from collections import defaultdict
hashes = defaultdict(list)
for index, row in df.iterrows():
    hashes[row['id']].append(row)
len(hashes), len(df)

(1302, 1302)

In [49]:
for k, v in hashes.items():
    if len(v) > 1:
        print(k, len(v))

In [50]:
df.head()

Unnamed: 0,cloud_provider,category,service,description,id
0,aws,Analytics,Amazon Athena,An interactive query service that makes it eas...,95825a76
1,aws,Analytics,Amazon CloudSearch,A managed service in the AWS Cloud that makes ...,4d78b518
2,aws,Analytics,Amazon Elastic MapReduce,A web service that makes it easy to process la...,1f41d3e8
3,aws,Analytics,Amazon Elasticsearch Service,A managed service that makes it easy to deploy...,cd65ebc2
4,aws,Analytics,Amazon EMR,A cloud big data platform for processing vast ...,65e1ebb7


In [51]:
# Drop the 'Unnamed: 0' column if it exists
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns=['Unnamed: 0'])

# Save the DataFrame to a CSV file without the index
df.to_csv('./data/Cloud_Provider_Services_Hashed.csv', index=False)

### Take 25 random sample to generate ground truth dataset

In [52]:
RANDOM_SAMPLE = 25
# Load the CSV file into a DataFrame
df = pd.read_csv('./data/Cloud_Provider_Services_Hashed.csv')

# Select 70 random rows from the DataFrame
random_sample = df.sample(n=RANDOM_SAMPLE, random_state=2)

# Optionally, save the random sample to a new CSV file
random_sample.to_csv('./data/Cloud_Provider_Services_Hashed_random.csv', index=False)

In [53]:
df = pd.read_csv('./data/Cloud_Provider_Services_Hashed_random.csv')
df.head()

Unnamed: 0,cloud_provider,category,service,description,id
0,azure,Analytics,Application Insights,Offers application performance management and ...,dbfef36b
1,gcp,Storage,GCP Cloud Storage,Object storage service.,f4c5d80a
2,alicloud,Compute,Marketplace,Cloud marketplace,a9fdb68c
3,gcp,Service,Zync,Video transcoding service.,eddd7cff
4,ibmcloud,Service,paas_svc_plan_feat_blazemeter,BlazeMeter service for performance testing.,4b630982


### Used Github Copilot to generate similar descriptions from the dataset above

In [54]:
descriptions = [
    ("Offers application performance management and monitoring.", "dbfef36b"),
    ("Object storage service.", "f4c5d80a"),
    ("Cloud marketplace", "a9fdb68c"),
    ("Video transcoding service.", "eddd7cff"),
    ("BlazeMeter service for performance testing.", "4b630982"),
    ("AI lifecycle management", "7defc019"),
    ("Provides network monitoring and diagnostics.", "c1f0f51c"),
    ("SR-IOV enabled service", "01616f46"),
    ("Bare metal gateway license service", "649f8ff7"),
    ("Scalable object storage", "991a00e1"),
    ("AT&T services", "070acbdc"),
    ("IBM MongoDB is a managed service for deploying MongoDB instances.", "7fe00d80"),
    ("Key management services", "646945b4"),
    ("Network tunnel service", "5bb51002"),
    ("General network service", "eccf6aed"),
    ("Offers petabyte-scale data transport solutions.", "7dcc7a26"),
    ("Infrastructure deployment service.", "f2cfccc9"),
    ("General disk storage", "484b96db"),
    ("Virtual Private Cloud service", "0290a242"),
    ("Cloud integration services", "d659ca4a"),
    ("Offers operational insights workspaces.", "d7498c6e"),
    ("Virtual firewall with threat prevention", "d31d2df1"),
    ("General disk storage", "9b232c98"),
    ("Connect Compose services", "9fc2c94f"),
    ("Reserved instances for SQL databases in Azure.", "c6bb737c")
]

# Function to generate similar descriptions
def generate_similar_descriptions(description):
    if description == "Offers application performance management and monitoring.":
        return [
            "Provides application performance management and monitoring.",
            "Application performance management and monitoring service.",
            "Service for managing and monitoring application performance.",
            "Application performance monitoring and management offered.",
            "Management and monitoring of application performance."
        ]
    elif description == "Object storage service.":
        return [
            "Service for object storage.",
            "Provides object storage.",
            "Object storage provided.",
            "Storage service for objects.",
            "Service offering object storage."
        ]
    elif description == "Cloud marketplace":
        return [
            "Marketplace for cloud services.",
            "Cloud services marketplace.",
            "Platform for cloud marketplace.",
            "Marketplace offering cloud services.",
            "Cloud-based marketplace."
        ]
    elif description == "Video transcoding service.":
        return [
            "Service for video transcoding.",
            "Provides video transcoding.",
            "Video transcoding provided.",
            "Transcoding service for videos.",
            "Service offering video transcoding."
        ]
    elif description == "BlazeMeter service for performance testing.":
        return [
            "Performance testing service by BlazeMeter.",
            "BlazeMeter's performance testing service.",
            "Service for performance testing by BlazeMeter.",
            "BlazeMeter performance testing provided.",
            "Performance testing offered by BlazeMeter."
        ]
    elif description == "AI lifecycle management":
        return [
            "Management of AI lifecycle.",
            "AI lifecycle management service.",
            "Service for managing AI lifecycle.",
            "Lifecycle management for AI.",
            "AI lifecycle managed."
        ]
    elif description == "Provides network monitoring and diagnostics.":
        return [
            "Network monitoring and diagnostics provided.",
            "Service for network monitoring and diagnostics.",
            "Monitoring and diagnostics of network provided.",
            "Network diagnostics and monitoring service.",
            "Service offering network monitoring and diagnostics."
        ]
    elif description == "SR-IOV enabled service":
        return [
            "Service enabled with SR-IOV.",
            "SR-IOV service enabled.",
            "Enabled SR-IOV service.",
            "Service with SR-IOV enabled.",
            "SR-IOV enabled."
        ]
    elif description == "Bare metal gateway license service":
        return [
            "Service for bare metal gateway license.",
            "Bare metal gateway license provided.",
            "License service for bare metal gateway.",
            "Gateway license service for bare metal.",
            "Service offering bare metal gateway license."
        ]
    elif description == "Scalable object storage":
        return [
            "Object storage that is scalable.",
            "Scalable storage for objects.",
            "Storage service for scalable objects.",
            "Provides scalable object storage.",
            "Service offering scalable object storage."
        ]
    elif description == "AT&T services":
        return [
            "Services provided by AT&T.",
            "AT&T provided services.",
            "Service offerings by AT&T.",
            "AT&T service offerings.",
            "Services by AT&T."
        ]
    elif description == "IBM MongoDB is a managed service for deploying MongoDB instances.":
        return [
            "Managed service by IBM for deploying MongoDB instances.",
            "IBM's managed service for MongoDB instance deployment.",
            "Service by IBM for managing and deploying MongoDB instances.",
            "Deploy MongoDB instances with IBM's managed service.",
            "IBM offers a managed service for MongoDB instance deployment."
        ]
    elif description == "Key management services":
        return [
            "Services for key management.",
            "Key management provided.",
            "Management services for keys.",
            "Provides key management services.",
            "Service offering key management."
        ]
    elif description == "Network tunnel service":
        return [
            "Service for network tunneling.",
            "Provides network tunnel service.",
            "Network tunneling service provided.",
            "Service offering network tunnels.",
            "Network tunnel service available."
        ]
    elif description == "General network service":
        return [
            "Service for general network needs.",
            "Provides general network services.",
            "General network services provided.",
            "Service offering general network solutions.",
            "General network service available."
        ]
    elif description == "Offers petabyte-scale data transport solutions.":
        return [
            "Provides petabyte-scale data transport solutions.",
            "Data transport solutions at petabyte scale offered.",
            "Service offering petabyte-scale data transport.",
            "Petabyte-scale data transport solutions provided.",
            "Solutions for petabyte-scale data transport."
        ]
    elif description == "Infrastructure deployment service.":
        return [
            "Service for infrastructure deployment.",
            "Provides infrastructure deployment services.",
            "Infrastructure deployment service provided.",
            "Service offering infrastructure deployment.",
            "Deployment service for infrastructure."
        ]
    elif description == "General disk storage":
        return [
            "Service for general disk storage.",
            "Provides general disk storage.",
            "General disk storage provided.",
            "Service offering general disk storage.",
            "General storage for disks."
        ]
    elif description == "Virtual Private Cloud service":
        return [
            "Service for Virtual Private Cloud.",
            "Provides Virtual Private Cloud service.",
            "Virtual Private Cloud service provided.",
            "Service offering Virtual Private Cloud.",
            "Virtual Private Cloud available."
        ]
    elif description == "Cloud integration services":
        return [
            "Services for cloud integration.",
            "Provides cloud integration services.",
            "Cloud integration services provided.",
            "Service offering cloud integration.",
            "Integration services for cloud."
        ]
    elif description == "Offers operational insights workspaces.":
        return [
            "Provides operational insights workspaces.",
            "Operational insights workspaces offered.",
            "Service offering operational insights workspaces.",
            "Workspaces for operational insights provided.",
            "Operational insights workspaces available."
        ]
    elif description == "Virtual firewall with threat prevention":
        return [
            "Virtual firewall service with threat prevention.",
            "Provides virtual firewall with threat prevention.",
            "Service offering virtual firewall with threat prevention.",
            "Threat prevention with virtual firewall provided.",
            "Virtual firewall and threat prevention service."
        ]
    elif description == "Connect Compose services":
        return [
            "Services for Connect Compose.",
            "Provides Connect Compose services.",
            "Connect Compose services provided.",
            "Service offering Connect Compose.",
            "Connect Compose available."
        ]
    elif description == "Reserved instances for SQL databases in Azure.":
        return [
            "Reserved instances for SQL databases in Azure provided.",
            "Provides reserved instances for SQL databases in Azure.",
            "Service offering reserved instances for SQL databases in Azure.",
            "Azure reserved instances for SQL databases.",
            "SQL database reserved instances in Azure."
        ]

# Generate similar descriptions and assign hash keys
result = []
for description, hash_key in descriptions:
    similar_descriptions = generate_similar_descriptions(description)
    for desc in similar_descriptions:
        result.append({"description": desc, "id": hash_key})

# # Print the results
# for item in result:
#     print(f"Description: {item['description']}\nHash Key: {item['hash_key']}\n")
df_hash = pd.DataFrame(result)

In [55]:
df_hash.head()

Unnamed: 0,description,id
0,Provides application performance management an...,dbfef36b
1,Application performance management and monitor...,dbfef36b
2,Service for managing and monitoring applicatio...,dbfef36b
3,Application performance monitoring and managem...,dbfef36b
4,Management and monitoring of application perfo...,dbfef36b


In [56]:
doc_index = {d['id'] : d for d in df.to_dict(orient='records')}

In [57]:
doc_index

{'dbfef36b': {'cloud_provider': 'azure',
  'category': 'Analytics',
  'service': 'Application Insights',
  'description': 'Offers application performance management and monitoring.',
  'id': 'dbfef36b'},
 'f4c5d80a': {'cloud_provider': 'gcp',
  'category': 'Storage',
  'service': 'GCP Cloud Storage',
  'description': 'Object storage service.',
  'id': 'f4c5d80a'},
 'a9fdb68c': {'cloud_provider': 'alicloud',
  'category': 'Compute',
  'service': 'Marketplace',
  'description': 'Cloud marketplace',
  'id': 'a9fdb68c'},
 'eddd7cff': {'cloud_provider': 'gcp',
  'category': 'Service',
  'service': 'Zync',
  'description': 'Video transcoding service.',
  'id': 'eddd7cff'},
 '4b630982': {'cloud_provider': 'ibmcloud',
  'category': 'Service',
  'service': 'paas_svc_plan_feat_blazemeter',
  'description': 'BlazeMeter service for performance testing.',
  'id': '4b630982'},
 '7defc019': {'cloud_provider': 'ibmcloud',
  'category': 'Service',
  'service': 'aiopenscale',
  'description': 'AI lifecy

In [58]:
final_result = []
for index, row in df_hash.iterrows():
    for index2, row2 in df.iterrows():
        if row['id'] == row2['id']:
            final_result.append({
                "description": row['description'],
                "category": row2['category'],
                "service": row2['service'],
                "cloud_provider": row2['cloud_provider'],
                "id": row['id']
            })
final_result[:8]

[{'description': 'Provides application performance management and monitoring.',
  'category': 'Analytics',
  'service': 'Application Insights',
  'cloud_provider': 'azure',
  'id': 'dbfef36b'},
 {'description': 'Application performance management and monitoring service.',
  'category': 'Analytics',
  'service': 'Application Insights',
  'cloud_provider': 'azure',
  'id': 'dbfef36b'},
 {'description': 'Service for managing and monitoring application performance.',
  'category': 'Analytics',
  'service': 'Application Insights',
  'cloud_provider': 'azure',
  'id': 'dbfef36b'},
 {'description': 'Application performance monitoring and management offered.',
  'category': 'Analytics',
  'service': 'Application Insights',
  'cloud_provider': 'azure',
  'id': 'dbfef36b'},
 {'description': 'Management and monitoring of application performance.',
  'category': 'Analytics',
  'service': 'Application Insights',
  'cloud_provider': 'azure',
  'id': 'dbfef36b'},
 {'description': 'Service for object 

In [59]:
groud_truth_dataset = pd.DataFrame(final_result, index=None)

In [60]:
groud_truth_dataset.head(15)

Unnamed: 0,description,category,service,cloud_provider,id
0,Provides application performance management an...,Analytics,Application Insights,azure,dbfef36b
1,Application performance management and monitor...,Analytics,Application Insights,azure,dbfef36b
2,Service for managing and monitoring applicatio...,Analytics,Application Insights,azure,dbfef36b
3,Application performance monitoring and managem...,Analytics,Application Insights,azure,dbfef36b
4,Management and monitoring of application perfo...,Analytics,Application Insights,azure,dbfef36b
5,Service for object storage.,Storage,GCP Cloud Storage,gcp,f4c5d80a
6,Provides object storage.,Storage,GCP Cloud Storage,gcp,f4c5d80a
7,Object storage provided.,Storage,GCP Cloud Storage,gcp,f4c5d80a
8,Storage service for objects.,Storage,GCP Cloud Storage,gcp,f4c5d80a
9,Service offering object storage.,Storage,GCP Cloud Storage,gcp,f4c5d80a


In [61]:
groud_truth_dataset.shape

(125, 5)

### Dumping the ground truth dataset to a csv file

In [62]:
groud_truth_dataset.to_csv('./data/cloud_service_provider_ground_truth_dataset.csv', index=False)

## Elastic Search Retrieval


In [63]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200')


In [64]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "cloud_provider": {"type": "text"},
            "category": {"type": "text"},
            "service": {"type": "text"},
            "description": {"type": "keyword"},
            "id": {"type": "keyword"}
        }
    }
}
index_name = "asset-categories"
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'asset-categories'})

### Read the original dataset with hashes added

In [65]:
df = pd.read_csv('./data/Cloud_Provider_Services_Hashed.csv')

In [66]:
from tqdm.auto import tqdm
for doc in tqdm(df.to_dict(orient='records')):
    es_client.index(index=index_name, body=doc, id=doc['id'])

100%|██████████| 1302/1302 [00:02<00:00, 440.33it/s]


In [67]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["description^8", "category^5", "service", "cloud_provider"],
                        "type": "best_fields"
                    }
                },
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [68]:
query = 'Provides video transcoding.'
results = elastic_search(query)
results

[{'cloud_provider': 'aws',
  'category': 'Service',
  'service': 'Amazon Kinesis Video Streams',
  'description': 'Video streaming service',
  'id': 'cf0ebd85'},
 {'cloud_provider': 'gcp',
  'category': 'Artificial Intelligence',
  'service': 'Cloud Video Intelligence API',
  'description': 'API for analyzing videos.',
  'id': '5785d934'}]

In [69]:
ground_truth = groud_truth_dataset.to_dict(orient='records')
relevant_total_results = []
relevance = []
for q in tqdm(ground_truth):
    doc_id = q['id']
    query = q['description']
    results = elastic_search(query)
    relevance = [r['id'] == doc_id for r in results]
    relevant_total_results.append(relevance)

100%|██████████| 125/125 [00:00<00:00, 516.18it/s]


In [70]:
relevant_total_results[:5]

[[False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False],
 [False, False, False, False, False]]

### hit_rate

In [71]:
def hit_rate(relevant_total_results):
    return sum([any(result) for result in relevant_total_results]) / len(relevant_total_results)

In [72]:
hit_rate(relevant_total_results)

0.28

In [73]:
def mrr(relevant_total_results):
    rr = 0
    for result in relevant_total_results:
        for i, r in enumerate(result):
            if r:
                rr += 1 / (i + 1)
    return rr / len(relevant_total_results)

### MRR

In [74]:
mrr(relevant_total_results)

0.18066666666666664

### hit_rate, mrr for elastic_search with ground_truth_data

In [75]:
hit_rate(relevant_total_results), mrr(relevant_total_results)

(0.28, 0.18066666666666664)

## Vector Search Evaluation

In [76]:
from retrieval import RetrieveContext
ground_truth = ground_truth_dataset.to_dict(orient='records')
relevant_total_results_vector_search = []
relevance_vector_search = []
for q in tqdm(ground_truth):
    doc_id = q['id']
    query = q['description']
    results = RetrieveContext().get_vector_context(query)[['description', 'category', 'service', 'cloud_provider', 'id']].to_dict(orient='records')
    relevance_vector_search = [r['id'] == doc_id for r in results]
    relevant_total_results_vector_search.append(relevance)

  0%|          | 0/125 [00:00<?, ?it/s]


<src.lance_db.lanceDB object at 0x104d65550>
Getting context from the table
table name: vector_db
first 5 data:                                          description  \
0  An interactive query service that makes it eas...   
1  A managed service in the AWS Cloud that makes ...   
2  A web service that makes it easy to process la...   
3  A managed service that makes it easy to deploy...   
4  A cloud big data platform for processing vast ...   

                                  vector_description cloud_provider  \
0  [0.0004424741, -0.0026784507, -0.121396326, 0....            aws   
1  [0.0004424741, -0.0026784507, -0.121396326, 0....            aws   
2  [0.0004424741, -0.0026784507, -0.121396326, 0....            aws   
3  [0.0004424741, -0.0026784507, -0.121396326, 0....            aws   
4  [0.0004424741, -0.0026784507, -0.121396326, 0....            aws   

                        service   category        id  
0                 Amazon Athena  Analytics  95825a76  
1            A

FileNotFoundError: Fts index does not exist. Please first call table.create_fts_index(['<field_names>']) to create the fts index.