In [1]:
import pandas as pd
import json
import os


In [2]:
qa_df = pd.read_csv('qa.csv')
qa_df.head()

Unnamed: 0,#,Company,Difficulty,Question,Answer
0,1,BitPay,Easy,When and where does BitPay collect personal data?,Main Answer:\nBitPay collects personal data du...
1,2,BitPay,Easy,Where can BitPay users exercise their data rig...,Main Answer:\nBitPay's Data Subject Rights Por...
2,3,BitPay,Easy,When does BitPay require informed consent for ...,Main Answer:\nBitPay's informed consent manage...
3,4,BitPay,Medium,How does BitPay handle international data tran...,Main Answer:\nBitPay's international data tran...
4,5,BitPay,Medium,How does BitPay address its dual role as contr...,Main Answer:\nBitPay's privacy policy establis...


In [3]:
qa_df = qa_df.iloc[:, 1:]
qa_df.head()

Unnamed: 0,Company,Difficulty,Question,Answer
0,BitPay,Easy,When and where does BitPay collect personal data?,Main Answer:\nBitPay collects personal data du...
1,BitPay,Easy,Where can BitPay users exercise their data rig...,Main Answer:\nBitPay's Data Subject Rights Por...
2,BitPay,Easy,When does BitPay require informed consent for ...,Main Answer:\nBitPay's informed consent manage...
3,BitPay,Medium,How does BitPay handle international data tran...,Main Answer:\nBitPay's international data tran...
4,BitPay,Medium,How does BitPay address its dual role as contr...,Main Answer:\nBitPay's privacy policy establis...


In [4]:
qa_df.groupby('Company')['Difficulty'].count().reset_index()

Unnamed: 0,Company,Difficulty
0,Bilt,8
1,BitPay,9
2,Circle,7
3,GDPR,40
4,Gemini,9
5,Klarna,8
6,Moonpay,8
7,Plaid,7
8,Remitly,8
9,Revolut,9


In [5]:
difficulty_counts = qa_df.groupby(['Company', 'Difficulty']).size().reset_index(name='Count')  # Group and count
difficulty_pivot = difficulty_counts.pivot(index='Company', columns='Difficulty', values='Count').fillna(0).reset_index()
difficulty_pivot


Difficulty,Company,Easy,Hard,Medium
0,Bilt,2.0,3.0,3.0
1,BitPay,3.0,3.0,3.0
2,Circle,2.0,2.0,3.0
3,GDPR,0.0,20.0,20.0
4,Gemini,3.0,3.0,3.0
5,Klarna,2.0,3.0,3.0
6,Moonpay,2.0,4.0,2.0
7,Plaid,3.0,2.0,2.0
8,Remitly,3.0,3.0,2.0
9,Revolut,3.0,3.0,3.0


In [None]:
print(qa_df.shape)

(122, 4)


In [None]:
df = qa_df.copy()

In [None]:
# Rename the column 'Company' to 'company'
df = df.rename(columns={"Company": "company"})

# Ensure all values in the 'company' column are in lowercase
df["company"] = df["company"].str.lower()

# Display the updated DataFrame
print(df.head())


  company Difficulty                                           Question  \
0  bitpay       Easy  When and where does BitPay collect personal data?   
1  bitpay       Easy  Where can BitPay users exercise their data rig...   
2  bitpay       Easy  When does BitPay require informed consent for ...   
3  bitpay     Medium  How does BitPay handle international data tran...   
4  bitpay     Medium  How does BitPay address its dual role as contr...   

                                              Answer  
0  Main Answer:\nBitPay collects personal data du...  
1  Main Answer:\nBitPay's Data Subject Rights Por...  
2  Main Answer:\nBitPay's informed consent manage...  
3  Main Answer:\nBitPay's international data tran...  
4  Main Answer:\nBitPay's privacy policy establis...  


In [None]:
df[df['company'] == 'gemini']

Unnamed: 0,company,Difficulty,Question,Answer
81,gemini,Easy,What types of personal data does Gemini collec...,Main Answer:\nGemini collects multiple categor...
82,gemini,Easy,When does Gemini share data with regulatory au...,Main Answer:\nGemini implements specific timin...
83,gemini,Easy,Where does Gemini process customer trading data?,Main Answer:\nGemini maintains primary trading...
84,gemini,Medium,How does Gemini handle personal data collected...,Main Answer:\nGemini processes blockchain tran...
85,gemini,Medium,What rights do Gemini customers have regarding...,Main Answer:\nGemini provides comprehensive ri...
86,gemini,Medium,How does Gemini process data for market monito...,Main Answer:\nGemini implements specific proto...
87,gemini,Hard,How does Gemini implement international transf...,Main Answer:\nGemini implements transfer safeg...
88,gemini,Hard,How does Gemini balance AML requirements with ...,Main Answer:\nGemini manages AML compliance th...
89,gemini,Hard,What automated decisions affect Gemini's digit...,Main Answer:\nGemini's automated systems affec...


### Train and test json split

In [None]:
train_set = []
test_set = []

for company, group in df.groupby("company"):
    for difficulty in ["Easy", "Medium", "Hard"]:
        difficulty_questions = group[group["Difficulty"] == difficulty]

        if not difficulty_questions.empty:
            test_sample = difficulty_questions.sample(n=1, random_state=42)
            test_set.extend(test_sample.to_dict(orient="records"))

            remaining = difficulty_questions.drop(test_sample.index)
            train_set.extend(remaining.to_dict(orient="records"))
        else:
            print(f"No questions for {company} at {difficulty} level")

with open("train_set.json", "w") as train_file:
    json.dump(train_set, train_file, indent=4)

with open("test_set.json", "w") as test_file:
    json.dump(test_set, test_file, indent=4)

print(f"Train Set: {len(train_set)} questions")
print(f"Test Set: {len(test_set)} questions")


No questions for gdpr at Easy level
Train Set: 90 questions
Test Set: 32 questions


In [None]:
def process_company_data(train_set_file, company_name, output_dir):
    """
    Processes data for a specific company from a training set and saves it as JSON and CSV.

    Args:
        train_set_file (str): Path to the JSON file containing the training set.
        company_name (str): Name of the company to process.
        output_dir (str): Directory to save the processed files.
    """
    # Load the training set
    with open(train_set_file, "r") as file:
        train_set = json.load(file)

    # Filter questions for the given company
    questions = [
        {key: value for key, value in entry.items() if key != "Difficulty"}
        for entry in train_set if entry["company"].lower() == company_name.lower()
    ]

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Save the filtered questions as JSON
    json_output_path = os.path.join(output_dir, f"{company_name}.json")
    with open(json_output_path, "w") as json_file:
        json.dump(questions, json_file, indent=4)

    # Convert to DataFrame and save as CSV
    co_df = pd.DataFrame(questions)
    csv_output_path = os.path.join(output_dir, f"{company_name}.csv")
    co_df.to_csv(csv_output_path, index=False)

    print(f"Total questions for {company_name} in the train set: {len(questions)}")
    print(f"Saved {company_name}.json and {company_name}.csv in {output_dir}.")


process_company_data(
    train_set_file="train_set.json",
    company_name="gdpr",
    output_dir="./train/gdpr"
)


Total questions for gdpr in the train set: 38
Saved gdpr.json and gdpr.csv in ./train/gdpr.


In [None]:
with open("train_set.json", "r") as train_file:
    train_set = json.load(train_file)

questions = [
    {key: value for key, value in entry.items() if key != "Difficulty"}
    for entry in train_set if entry["company"] == "bitpay"
]

print(f"Total questions for bitpay in the train set: {len(questions)}")
print(json.dumps(questions, indent=4))

with open("./train/bitpay/bitpay.json", "w") as bilt_file:
    json.dump(questions, bilt_file, indent=4)

co_df = pd.DataFrame(questions)
co_df.to_csv("./train/bitpay/bitay.csv", index=False)

print(f"Total questions for bitpay in the train set: {len(questions)}")
print("Saved bilt_train_set.json and bilt_train_set.csv.")


Total questions for bitpay in the train set: 6
[
    {
        "company": "bitpay",
        "Question": "Where can BitPay users exercise their data rights, and why?",
        "Answer": "Main Answer:\nBitPay's Data Subject Rights Portal operates as a centralized hub designed to meet specific user needs and regulatory requirements. The portal serves users through web interfaces, mobile applications, and support channels, providing access points tailored to different user preferences and needs. Its primary purpose is ensuring transparent and efficient rights exercise while maintaining proper verification and documentation standards. The system operates with specific workflows for different request types, maintaining detailed records of all interactions while ensuring consistent request handling and timely responses across all access points.\n\nKey Points:\n\u2022 Multiple access points for user convenience\n\u2022 Purpose-driven request handling system\n\u2022 Structured workflow manageme