<a href="https://colab.research.google.com/github/francescovenco000004/peersv02/blob/main/peers_synthetic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers



### Iteration (training data)

In [None]:
# Import necessary libraries
import pandas as pd
import csv
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import os

# Define the QA model and tokenizer
model_name = "deepset/roberta-base-squad2"
qa_pipeline = pipeline('question-answering', model=model_name, tokenizer=model_name)

# Define the sentence transformer model
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# The working folder lives on Google Drive, which is only reachable under
# /content/drive after it has been mounted. Mount it here when it is missing so
# this cell also works on a fresh runtime.
if not os.path.isdir('/content/drive/My Drive'):
    from google.colab import drive
    drive.mount('/content/drive')

# Change directory to the 'peers_synthetic' folder at the root of Google Drive
os.chdir('/content/drive/My Drive/peers_synthetic')

# Function to read CSV files and return the first column as a list
def read_csv(file_path):
    """Load the CSV at ``file_path`` and return its first column as a plain list.

    The file is parsed with the ``pd.read_csv`` defaults, so the first row is
    taken as the header and is not part of the returned values.
    """
    first_column = pd.read_csv(file_path).iloc[:, 0]
    return first_column.tolist()

# Load the name and description columns for both company groups
zava_names = read_csv('zava_names.csv')
zava_descriptions = read_csv('zava_description.csv')
borzo_names = read_csv('borzo_names.csv')
borzo_descriptions = read_csv('borzo_description.csv')

# The first value of each description list is used as the base company;
# every later position is treated as a peer of that base.
bases = dict(zava=zava_descriptions[0], bozo=borzo_descriptions[0])

peers = {
    "zava": {
        name: zava_descriptions[position]
        for position, name in enumerate(zava_names[1:], start=1)
    },
    "bozo": {
        name: borzo_descriptions[position]
        for position, name in enumerate(borzo_names[1:], start=1)
    },
}

# Question templates; "{}" is filled with the company name before asking
questions = [
    "What does {} provide?",
    "What is {} vertical focus?",
    "Who are {} consumers?",
]

# Function to perform information extraction
def information_extraction(description, peer_name):
    """Ask every template in ``questions`` about ``peer_name``, using ``description`` as context.

    Returns a dict mapping ``"question_<n>"`` (1-based, in template order) to the
    ``'answer'`` field of the corresponding ``qa_pipeline`` result.
    """
    answers = {}
    for number, template in enumerate(questions, start=1):
        result = qa_pipeline({
            'question': template.format(peer_name),
            'context': description,
        })
        answers[f"question_{number}"] = result['answer']
    return answers

# Extract information for all peers
def extract_all_information(bases, peers):
    """Run ``information_extraction`` on every peer description, grouped by base.

    Args:
        bases: Mapping whose keys are the base names to process (the values are
            not read here).
        peers: Mapping of base name -> {peer name: peer description}.

    Returns:
        Dict of base name -> {peer name: extracted answers}, with one entry per
        key of ``bases``.

    Raises:
        KeyError: If a base name has no entry in ``peers``.
    """
    # Build the result from the bases actually passed in instead of a fixed
    # {"zava", "bozo"} pair, so any set of base names can be processed.
    extracted_data = {base: {} for base in bases}

    for base in bases:
        for peer, description in peers[base].items():
            extracted_data[base][peer] = information_extraction(description, peer)

    return extracted_data

# Calculate cosine similarity
def calculate_cosine_similarity(extracted_data, bases):
    """Score each peer answer against the matching answer of its base company.

    For every base in ``bases`` the base description is run through
    ``information_extraction``; then, for questions 1-3, the peer answer and the
    base answer are embedded with ``sbert_model`` and compared with cosine
    similarity.

    Args:
        extracted_data: Mapping of base name -> {peer name: {"question_<n>": answer}}.
        bases: Mapping of base name -> base description.

    Returns:
        List of dicts with keys ``"peer"``, ``"base"``, ``"question"`` (int 1-3)
        and ``"similarity"`` (float), ordered by base, then peer, then question.
    """
    similarities = []
    for base_name, base_description in bases.items():
        base_extracted = information_extraction(base_description, base_name)

        # The base answers are the same for every peer, so embed them once per
        # base rather than once per (peer, question) pair.
        base_embeddings = {
            question: sbert_model.encode(
                base_extracted[f"question_{question}"], convert_to_tensor=True
            )
            for question in range(1, 4)
        }

        for peer_name, peer_data in extracted_data[base_name].items():
            for question, base_embedding in base_embeddings.items():
                peer_embedding = sbert_model.encode(
                    peer_data[f"question_{question}"], convert_to_tensor=True
                )

                # Compute cosine similarity
                similarity = util.pytorch_cos_sim(peer_embedding, base_embedding).item()

                similarities.append({
                    "peer": peer_name,
                    "base": base_name,
                    "question": question,
                    "similarity": similarity
                })

    return similarities

# Run the QA extraction over every peer description
extracted_data = extract_all_information(bases, peers)

# Score each peer answer against the matching base answer
similarities = calculate_cosine_similarity(extracted_data, bases)

# Report one line per (peer, base, question) record
for sim in similarities:
    print(
        f"Peer: {sim['peer']}, Base: {sim['base']}, "
        f"Question: {sim['question']}, Similarity: {sim['similarity']:.4f}"
    )


Peer: Healthily, Base: zava, Question: 1, Similarity: 0.4493
Peer: Healthily, Base: zava, Question: 2, Similarity: 0.3115
Peer: Healthily, Base: zava, Question: 3, Similarity: 0.4540
Peer: Tele clinic, Base: zava, Question: 1, Similarity: 0.4335
Peer: Tele clinic, Base: zava, Question: 2, Similarity: 0.4020
Peer: Tele clinic, Base: zava, Question: 3, Similarity: 0.3246
Peer: Practo, Base: zava, Question: 1, Similarity: 0.6025
Peer: Practo, Base: zava, Question: 2, Similarity: 0.4912
Peer: Practo, Base: zava, Question: 3, Similarity: 0.3350
Peer: Urban, Base: zava, Question: 1, Similarity: 0.3421
Peer: Urban, Base: zava, Question: 2, Similarity: 0.2763
Peer: Urban, Base: zava, Question: 3, Similarity: 0.2691
Peer: Intendu, Base: zava, Question: 1, Similarity: 0.1656
Peer: Intendu, Base: zava, Question: 2, Similarity: 0.6781
Peer: Intendu, Base: zava, Question: 3, Similarity: 0.2466
Peer: Watch your health, Base: zava, Question: 1, Similarity: 0.7765
Peer: Watch your health, Base: zava, 

In [None]:
# Import necessary libraries
import pandas as pd
import csv
from transformers import pipeline
from sentence_transformers import SentenceTransformer, util
import os

# Define the QA model and tokenizer
model_name = "deepset/roberta-base-squad2"
qa_pipeline = pipeline('question-answering', model=model_name, tokenizer=model_name)

# Define the sentence transformer model
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# The working folder lives on Google Drive, which is only reachable under
# /content/drive after it has been mounted. Mount it here when it is missing so
# this cell also works on a fresh runtime.
if not os.path.isdir('/content/drive/My Drive'):
    from google.colab import drive
    drive.mount('/content/drive')

# Change directory to the 'peers_synthetic' folder at the root of Google Drive
os.chdir('/content/drive/My Drive/peers_synthetic')

# Function to read CSV files and return the first column as a list
def read_csv(file_path):
    """Return the values of the first column of the CSV at ``file_path`` as a list.

    Parsing uses the ``pd.read_csv`` defaults, so the first row is consumed as
    the header and does not appear in the result.
    """
    frame = pd.read_csv(file_path)
    first_column = frame.iloc[:, 0]
    return first_column.tolist()

# Load the name and description columns for both company groups from csv/
zava_names = read_csv('csv/zava_names.csv')
zava_descriptions = read_csv('csv/zava_description.csv')
borzo_names = read_csv('csv/borzo_names.csv')
borzo_descriptions = read_csv('csv/borzo_description.csv')

# The first value of each description list is used as the base company;
# every later position is treated as a peer of that base.
bases = dict(zava=zava_descriptions[0], bozo=borzo_descriptions[0])

peers = {
    "zava": {
        name: zava_descriptions[position]
        for position, name in enumerate(zava_names[1:], start=1)
    },
    "bozo": {
        name: borzo_descriptions[position]
        for position, name in enumerate(borzo_names[1:], start=1)
    },
}

# Question templates; "{}" is filled with the company name before asking
questions = [
    "What does {} provide?",
    "What is {} vertical focus?",
    "Who are {} consumers?",
]

# Function to perform information extraction
def information_extraction(description, peer_name):
    """Run each template in ``questions`` through ``qa_pipeline`` for one company.

    ``peer_name`` is substituted into the template and ``description`` is the
    context. The result maps ``"question_<n>"`` (1-based, template order) to the
    ``'answer'`` field returned by the pipeline.
    """
    def ask(template):
        # One QA call: formatted question against the company description
        return qa_pipeline({
            'question': template.format(peer_name),
            'context': description,
        })['answer']

    return {
        f"question_{position}": ask(template)
        for position, template in enumerate(questions, start=1)
    }

# Extract information for all peers
def extract_all_information(bases, peers):
    """Run ``information_extraction`` on every peer description, grouped by base.

    Args:
        bases: Mapping whose keys are the base names to process (the values are
            not read here).
        peers: Mapping of base name -> {peer name: peer description}.

    Returns:
        Dict of base name -> {peer name: extracted answers}, with one entry per
        key of ``bases``.

    Raises:
        KeyError: If a base name has no entry in ``peers``.
    """
    # Build the result from the bases actually passed in instead of a fixed
    # {"zava", "bozo"} pair, so any set of base names can be processed.
    extracted_data = {base: {} for base in bases}

    for base in bases:
        for peer, description in peers[base].items():
            extracted_data[base][peer] = information_extraction(description, peer)

    return extracted_data

# Calculate transformer-based similarity
def calculate_transformer_similarity(extracted_data, bases, num_questions=2):
    """Score peer answers against base answers with sentence-embedding cosine similarity.

    For every base in ``bases`` the base description is run through
    ``information_extraction``; then, for questions ``1..num_questions``, the
    peer answer and the base answer are embedded with ``sbert_model`` and compared.

    Args:
        extracted_data: Mapping of base name -> {peer name: {"question_<n>": answer}}.
        bases: Mapping of base name -> base description.
        num_questions: How many questions (counted from question 1) to compare.
            Defaults to 2, which is what this function previously hard-coded.
            NOTE(review): ``information_extraction`` yields one answer per entry
            in ``questions`` (three in this notebook), so the default leaves
            question 3 out of the scores; pass ``num_questions=3`` to include it.

    Returns:
        Tuple ``(similarities, similarity_dict)``:
        ``similarities`` is a list of dicts with keys ``"peer"``, ``"base"``,
        ``"question"`` (int) and ``"similarity"`` (float);
        ``similarity_dict`` holds the same scores nested as
        base name -> peer name -> ``"question_<n>"`` -> similarity.
    """
    similarities = []
    similarity_dict = {}
    question_numbers = range(1, num_questions + 1)

    for base_name, base_description in bases.items():
        base_extracted = information_extraction(base_description, base_name)
        similarity_dict[base_name] = {}

        # The base answers are the same for every peer, so embed them once per
        # base rather than once per (peer, question) pair.
        base_embeddings = {
            question: sbert_model.encode(
                base_extracted[f"question_{question}"], convert_to_tensor=True
            )
            for question in question_numbers
        }

        for peer_name, peer_data in extracted_data[base_name].items():
            peer_scores = {}

            for question in question_numbers:
                peer_embedding = sbert_model.encode(
                    peer_data[f"question_{question}"], convert_to_tensor=True
                )

                # Compute similarity
                similarity = util.pytorch_cos_sim(
                    peer_embedding, base_embeddings[question]
                ).item()

                similarities.append({
                    "peer": peer_name,
                    "base": base_name,
                    "question": question,
                    "similarity": similarity
                })
                peer_scores[f"question_{question}"] = similarity

            similarity_dict[base_name][peer_name] = peer_scores

    return similarities, similarity_dict

# Run the QA extraction over every peer description
extracted_data = extract_all_information(bases, peers)

# Score the peers; returns both the flat records and the nested dictionary
similarities, similarity_dict = calculate_transformer_similarity(extracted_data, bases)

# Report one line per (peer, base, question) record
for sim in similarities:
    print(
        f"Peer: {sim['peer']}, Base: {sim['base']}, "
        f"Question: {sim['question']}, Similarity: {sim['similarity']:.4f}"
    )

# Persist the flat records as CSV (one row per record, no index column)
similarity_df = pd.DataFrame(similarities)
similarity_df.to_csv('similarities2.csv', index=False)

# Persist the nested dictionary as JSON
import json
with open('similarity_dict.json', 'w') as f:
    f.write(json.dumps(similarity_dict))


Peer: Healthily, Base: zava, Question: 1, Similarity: 0.4493
Peer: Healthily, Base: zava, Question: 2, Similarity: 0.3115
Peer: Tele clinic, Base: zava, Question: 1, Similarity: 0.4335
Peer: Tele clinic, Base: zava, Question: 2, Similarity: 0.4020
Peer: Practo, Base: zava, Question: 1, Similarity: 0.6025
Peer: Practo, Base: zava, Question: 2, Similarity: 0.4912
Peer: Urban, Base: zava, Question: 1, Similarity: 0.3421
Peer: Urban, Base: zava, Question: 2, Similarity: 0.2763
Peer: Intendu, Base: zava, Question: 1, Similarity: 0.1656
Peer: Intendu, Base: zava, Question: 2, Similarity: 0.6781
Peer: Watch your health, Base: zava, Question: 1, Similarity: 0.7765
Peer: Watch your health, Base: zava, Question: 2, Similarity: 0.4310
Peer: Cura, Base: zava, Question: 1, Similarity: 0.6641
Peer: Cura, Base: zava, Question: 2, Similarity: 0.4688
Peer: Health hero, Base: zava, Question: 1, Similarity: 0.3800
Peer: Health hero, Base: zava, Question: 2, Similarity: 0.7702
Peer: Biohm, Base: zava, Que

In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.1/227.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transform

In [None]:
# Mount Google Drive at /content/drive; the Drive paths used by the other cells
# of this notebook are all located under this mount point.
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

### Actual iteration

In [None]:
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer
import json
import csv
import os

# Establish dictionaries
# Question templates keyed by question id; the "{}" placeholder is filled with
# the company name via str.format before the question is sent to the QA model.
questions_dict = {
    'question1': "What does {} provide?",
    'question2': "What is {} vertical focus?",
    'question3': "Who are {} consumers?"
}

# Define the company descriptions
# Keyed by lowercase company identifier; each value is the free-text
# description that serves as the QA context for that company.
descriptions = {
    "instabase": "Instabase provides a platform for automating complex business processes and document workflows using machine learning and artificial intelligence. Targeting enterprises across various industries, their focus is on streamlining operations and improving efficiency through advanced automation and data processing solutions.",
    "frame": "Frame offers a cloud-based platform that enables users to run desktop applications in a web browser, targeting businesses and organizations needing flexible and scalable virtual desktop solutions. Their focus is on cloud computing and virtual desktop infrastructure (VDI), providing a seamless user experience for accessing applications remotely.",
    "rtbrick": "rtBrick delivers network automation solutions designed for service providers and telecom operators, targeting companies looking to modernize their network infrastructure. Their focus is on enabling programmable, cloud-native networks through software-defined networking (SDN) and network function virtualization (NFV).",
    "meshare": "meShare provides a cloud-based video surveillance and security solution, targeting homeowners and businesses needing reliable and scalable security systems. Their focus is on offering a comprehensive platform for managing and accessing video surveillance footage from multiple cameras and locations.",
    "backbox": "Backbox offers automated network security and backup solutions, targeting businesses needing robust security and disaster recovery systems for their network infrastructure. Their focus is on simplifying network management and ensuring data protection through automated backup and recovery processes.",
    "origin_protocol": "Origin Protocol develops decentralized applications and blockchain-based solutions to enhance digital transactions and data security. Targeting developers and enterprises, their focus is on enabling secure, transparent, and efficient transactions through blockchain technology and smart contracts.",
    "petuum": "Petuum provides a platform for machine learning and artificial intelligence that simplifies the development and deployment of AI models. Targeting enterprises and researchers, their focus is on making advanced AI accessible and scalable, facilitating the integration of machine learning into various applications and systems."
}

# Create the new dictionary format: company -> [description, role, company name].
# Both dictionaries are derived from `descriptions`, so the list of companies is
# written down in one place only. "instabase" is the base; every other company
# is a peer.
formatted_descriptions = {
    company: [text, "base" if company == "instabase" else "peer", company]
    for company, text in descriptions.items()
}

# Initialize the info_extracted dictionary with an empty answer list for each company
info_extracted = {company: [] for company in descriptions}





# Define the function for information extraction
def information_extraction(questions, descriptions, info_extracted, nlp):
    """Answer every question for every company and collect the answers in place.

    Args:
        questions: Mapping of question id -> template containing a ``{}``
            placeholder for the company name.
        descriptions: Mapping whose values are lists where item 0 is the context
            text and item 2 is the company name.
        info_extracted: Mapping of company name -> list; each answer is appended
            to the list of its company.
        nlp: Callable taking a ``{'question', 'context'}`` dict and returning a
            mapping with an ``'answer'`` entry.

    Returns:
        None. Results are delivered by mutating ``info_extracted``.
    """
    for entry in descriptions.values():
        context, company_name = entry[0], entry[2]
        for template in questions.values():
            # Fill in the company name, query the model, keep only the answer text
            prediction = nlp({
                'question': template.format(company_name),
                'context': context
            })
            info_extracted[company_name].append(prediction['answer'])

# Load the extractive QA model and its tokenizer, then wrap them in a pipeline
model_name = "deepset/roberta-base-squad2"
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
nlp = pipeline('question-answering', model=model, tokenizer=tokenizer)

# Fill info_extracted in place with one answer per (company, question)
information_extraction(questions_dict, formatted_descriptions, info_extracted, nlp)

# Show the answers collected for each company
for company, answers in info_extracted.items():
    print(f"{company}: {answers}")

import json

# Persist the extracted answers next to the other instabase artifacts on Drive
info_extracted_path = '/content/drive/My Drive/peers_synthetic/dictionary (v2)/instabase/info_extracted_instabase.json'
with open(info_extracted_path, 'w') as f:
    f.write(json.dumps(info_extracted))





instabase: ['a platform for automating complex business processes and document workflows', 'streamlining operations and improving efficiency', 'enterprises']
frame: ['a cloud-based platform', 'cloud computing and virtual desktop infrastructure', 'businesses and organizations']
rtbrick: ['network automation solutions', 'enabling programmable, cloud-native networks', 'service providers and telecom operators']
meshare: ['cloud-based video surveillance and security solution', 'managing and accessing video surveillance footage from multiple cameras and locations', 'homeowners and businesses']
backbox: ['automated network security and backup solutions', 'simplifying network management and ensuring data protection', 'businesses']
origin_protocol: ['decentralized applications and blockchain-based solutions to enhance digital transactions and data security', 'enabling secure, transparent, and efficient transactions through blockchain technology and smart contracts', 'developers and enterprises'

In [None]:
import json

# Persist the [description, role, name] dictionary for the instabase run
description_dict_path = '/content/drive/My Drive/peers_synthetic/dictionary (v2)/instabase/description_dict_instabase.json'
with open(description_dict_path, 'w') as f:
    f.write(json.dumps(formatted_descriptions))

In [None]:
# Step 1: Install necessary libraries
#!pip install sentence-transformers scikit-learn

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Initialize the sBERT model
# NOTE: this rebinds the global `model`, which the extraction cell above used
# for the question-answering model; from here on `model` is the sentence encoder.
model = SentenceTransformer('all-MiniLM-L6-v2')



# Get all the keys in the dictionary
# (company names, in the insertion order of info_extracted)
keys = list(info_extracted.keys())

# Function to compute cosine similarities
def compute_similarities(info_extracted, base_key='instabase', num_answers=3):
    """Compare every company's answers with the base company's answers.

    For each company other than ``base_key``, answer ``j`` of that company and
    answer ``j`` of the base are embedded with the global ``model`` and compared
    with ``cosine_similarity``.

    Args:
        info_extracted: Mapping of company name -> list of extracted answers.
        base_key: Company every other entry is compared against. Defaults to
            ``'instabase'``, the value that used to be hard-coded here.
        num_answers: Number of leading answers to compare per company
            (default 3, the previously hard-coded count).

    Returns:
        Dict of company name -> list of ``num_answers`` similarity scores, in
        the iteration order of ``info_extracted`` and without ``base_key``.

    Raises:
        KeyError: If ``base_key`` is not in ``info_extracted``.
        IndexError: If a company has fewer than ``num_answers`` answers.
    """
    # The base answers are the same for every peer: embed them once up front
    # instead of once per (peer, answer) pair.
    base_answers = info_extracted[base_key]
    base_embeddings = [model.encode([base_answers[j]]) for j in range(num_answers)]

    similarities = {}
    # Iterate the argument itself (not a module-level key list) so the function
    # works on whatever dictionary it is given.
    for current_key in info_extracted:
        if current_key == base_key:
            continue

        scores = []
        for j in range(num_answers):
            peer_embedding = model.encode([info_extracted[current_key][j]])
            # cosine_similarity returns a 1x1 matrix for two single-row inputs
            scores.append(cosine_similarity(base_embeddings[j], peer_embedding)[0][0])
        similarities[current_key] = scores

    return similarities

# Calculate similarities
similarity_results = compute_similarities(info_extracted)

# Display the results. The scores are computed against the 'instabase' answers,
# so the message names that company as the reference.
for key, values in similarity_results.items():
    print(f"Similarities with 'instabase' and '{key}': {values}")


Similarities with 'zava' and 'frame': [0.3597218, 0.21445343, 0.7183789]
Similarities with 'zava' and 'rtbrick': [0.4145234, 0.21748137, 0.43484122]
Similarities with 'zava' and 'meshare': [0.10694204, 0.16815665, 0.46712226]
Similarities with 'zava' and 'backbox': [0.24286373, 0.28274375, 0.74353886]
Similarities with 'zava' and 'origin_protocol': [0.20307647, 0.27726978, 0.641783]
Similarities with 'zava' and 'petuum': [0.34303653, 0.30774298, 0.7322169]


In [None]:
import json

# Convert every score to a built-in float before serializing
similarity_results_native = {
    company: list(map(float, scores))
    for company, scores in similarity_results.items()
}

similarity_results_path = '/content/drive/My Drive/peers_synthetic/dictionary (v2)/instabase/similarity_results_instabase.json'
with open(similarity_results_path, 'w') as f:
    f.write(json.dumps(similarity_results_native))

In [None]:
import json


# NOTE(review): this writes `info_extracted` into the borzo folder, while the
# extraction cell in this notebook fills `info_extracted` for the instabase
# company set — confirm the dictionary was regenerated for borzo before running.
with open('/content/drive/My Drive/peers_synthetic/dictionary (v2)/borzo/info_extracted_borzo.json', 'w') as f:
    json.dump(info_extracted, f)

In [None]:
import csv
import os
from google.colab import drive
drive.mount('/content/drive')

# Provided dictionary: company -> [score1, score2, score3]
data = {
    "frame": [0.3597218096256256, 0.21445342898368835, 0.7183789014816284],
    "rtbrick": [0.4145233929157257, 0.21748137474060059, 0.43484121561050415],
    "meshare": [0.10694204270839691, 0.16815665364265442, 0.46712225675582886],
    "backbox": [0.242863729596138, 0.28274375200271606, 0.7435388565063477],
    "origin_protocol": [0.20307646691799164, 0.27726978063583374, 0.6417829990386963],
    "petuum": [0.3430365324020386, 0.30774298310279846, 0.7322168946266174],
}

# Convert keys to lowercase and prepare data for CSV (one row per company)
csv_data = [[key.lower()] + value for key, value in data.items()]

# Define CSV file name and path
csv_file = "/content/drive/MyDrive/peers_synthetic/instabase_scores.csv"

# Create the output directory if it doesn't exist; exist_ok avoids the separate
# existence check and the gap between checking and creating.
output_dir = os.path.dirname(csv_file)
os.makedirs(output_dir, exist_ok=True)

# Write data to CSV (newline='' lets the csv module control line endings)
with open(csv_file, 'w', newline='') as file:
    writer = csv.writer(file)
    # Write header
    writer.writerow(['company', 'score1', 'score2', 'score3'])
    # Write data rows
    writer.writerows(csv_data)

print(f"Data successfully written to {csv_file}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Data successfully written to /content/drive/MyDrive/peers_synthetic/instabase_scores.csv
