In [3]:
import pandas as pd
import json

# Load the questionnaire CSV into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that file exists. Consider a configurable data directory.
csv_file_path = '/Users/mohdfuzailkhalil/Downloads/master_data.csv'
df = pd.read_csv(csv_file_path)
print(df)
# Rename columns to standardized keys (optional but useful for downstream code)
df = df.rename(columns={
    'Questions': 'question',
    'Answer': 'ideal_answer',
    'Compliance': 'label'
})

# Convert to a list of dictionaries, one per row.
# Empty CSV cells are read by pandas as float NaN, and json.dump would write
# those as the bare token NaN, which is not valid JSON. Map missing values to
# None so they are serialized as null instead.
data = [
    {key: (None if pd.isna(value) else value) for key, value in row.items()}
    for row in df.to_dict(orient='records')
]

# Save to JSON (UTF-8, non-ASCII characters kept as-is, 2-space indent)
json_file_path = 'questionnaire_data.json'
with open(json_file_path, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print(f"✅ JSON file saved to: {json_file_path}")

                                            Questions       Compliance  \
0   Shiprocket  Tech Platform  follow functional s...  Fully Compliant   
1   The Next Generation Firewall (NGF) should have...  Fully Compliant   
2   Public Exposed web tier shall be behind EITC R...  Fully Compliant   
3   The SSL offloading enabled on Shiprocket Load ...  Fully Compliant   
4   Shiprocket  network security measures( WAF) te...  Fully Compliant   
5   The application shall be accessed through a we...  Fully Compliant   
6   Shiprocket platform has the capability to  cap...  Fully Compliant   
7   Database activity monitoring (DAM) to be consi...  Fully Compliant   
8   Shiprocket ensures that  proper security patch...  Fully Compliant   
9   Shiprocket must ensure the nodes authenticate ...  Fully Compliant   
10  Shiprocket is using the AWS cloud to enable fa...  Fully Compliant   
11  Shiprocket solution authenticate and authorize...  Fully Compliant   
12  Shiprocket applications ensure cla

In [5]:
import pandas as pd
import json
import re

# Output: file name the cleaned records are written to as JSON.
json_file_path = 'questionnaire_data_cleaned.json'
# Input: source CSV file (update this to your actual path).
csv_file_path = '/Users/mohdfuzailkhalil/Downloads/master_data.csv'

# 🧹 Cleaning function
def clean_text(text):
    """Normalize one cell value into a single-line, symbol-free string.

    Steps, in order:
      1. Missing values (None / NaN, as detected by ``pd.isna``) become "".
      2. Any other value is converted with ``str`` so non-string cells
         (e.g. numbers) do not fail.
      3. Newlines and hyphens are turned into spaces.
      4. Every character that is not a word character, whitespace, or one of
         ``. , : / ( ) -`` is removed.
      5. Runs of whitespace are collapsed to one space and the ends trimmed.

    Args:
        text: The raw cell value (normally a str, or NaN/None when missing).

    Returns:
        The cleaned string; "" for missing input.
    """
    if pd.isna(text):
        return ""
    text = str(text)
    # Hyphens become spaces here, so the '-' allowed by the pattern below
    # never actually occurs in the text by the time the pattern runs.
    text = text.replace('\n', ' ').replace('-', ' ')
    text = re.sub(r'[^\w\s.,:/()-]', '', text)  # Remove unwanted symbols
    # Collapse and trim whitespace LAST: removing a symbol (or converting a
    # leading/trailing hyphen) can itself leave double or edge spaces behind.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Read the CSV and switch to the standardized column names in one step.
df = pd.read_csv(csv_file_path).rename(columns={
    'Questions': 'question',
    'Answer': 'ideal_answer',
    'Compliance': 'label',
})

# Rows without a compliance label default to "Fully Compliant".
df['label'] = df['label'].fillna('Fully Compliant')

# Pass each of the three text columns through clean_text.
for column in ('question', 'ideal_answer', 'label'):
    df[column] = df[column].apply(clean_text)

# One dictionary per row, written out as UTF-8 JSON with a 2-space indent.
data = df.to_dict(orient='records')
with open(json_file_path, 'w', encoding='utf-8') as out_file:
    json.dump(data, out_file, ensure_ascii=False, indent=2)

print(f"✅ JSON file saved to: {json_file_path}")

✅ JSON file saved to: questionnaire_data_cleaned.json


In [6]:
# Show only the first two records as a sanity check; printing the whole list
# is not truncated by Python and floods the notebook output.
print(data[:2])

[{'question': 'Shiprocket Tech Platform follow functional segregation principles and solution shall have n tiers. N is the number of technical / business functions Guidelines : Enterprise applications The web tier, app tier and DB tier shall be segregated and access controlled. Containerized application Each service has been hosted on a different pod Each pod is mapped to a service Each business service have a specific name space All communication between pods to be clearly highlighted and approved by ISRM All inter pod communication to be access controlled via container firewall Each functional tier to have access control Back up VLANS shall be segregated for other function tier', 'label': 'Fully Compliant', 'ideal_answer': 'Shiprocket uses a multi tier infra for application. All components/Micro Services are deployed on different servers, and the Nginx server is used for proxy. For further details, please refer to the Shiprocket Architecture Document shared.'}, {'question': 'The Next

In [7]:
pip install sentence-transformers scikit-learn

Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.7.1-cp312-none-macosx_11_0_arm64.whl.metadata (29 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.33.0-py3-none-any.whl.metadata (14 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Collecting hf-xet<2.0.0,>=1.1.2 (from huggingface-hub>=0.20.0->sentence-transformers)
  Downloading hf_xet-1.1.4-cp37-abi3-macosx_11_0_arm64.whl.metadata (

In [8]:
from sentence_transformers import SentenceTransformer
import json
import numpy as np

# Load the sentence-embedding model by name.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Load cleaned Q&A data
with open('questionnaire_data_cleaned.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Encode every ideal answer in ONE call: encode() accepts a list of texts and
# batches them internally, instead of invoking the model once per record.
# NOTE(review): batched results may differ from per-item encoding in the last
# floating-point digits because of padding — confirm this is acceptable.
answers = [item['ideal_answer'] for item in data]
embeddings = model.encode(answers)

# Attach each vector to its record as a plain list so it is JSON-serializable.
for item, embedding in zip(data, embeddings):
    item['ideal_embedding'] = embedding.tolist()

# Save the file with embedded vectors
with open('questionnaire_with_embeddings.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print("✅ Saved JSON with embeddings: questionnaire_with_embeddings.json")

  from .autonotebook import tqdm as notebook_tqdm


✅ Saved JSON with embeddings: questionnaire_with_embeddings.json
