# Setup

## Load API Keys

In [1]:
from dotenv import load_dotenv

load_dotenv()

True

## Tracing

In [2]:
from langfuse.openai import AsyncOpenAI  # autoinstrmenttion

## Setup OpenAI

In [3]:
client = AsyncOpenAI()

In [4]:
EMBED_MODEL = "text-embedding-3-large"

In [5]:
GPT4O_MINI = "o4-mini-2025-04-16"

## LLM Call Helpers

In [6]:
def _msg(role, content):
    return {'role': role, 'content': content}

def system(content):
    return _msg('system', content)

def user(content):
    return _msg('user', content)

def assistant(content):
    return _msg('assistant', content)

## Embedding Call Helpers

In [7]:
def get_embedding(e) -> list[float]:
    return e.data[0].embedding

## Compute Cosine Similarity

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from openai.types.create_embedding_response import CreateEmbeddingResponse

In [9]:
def embedding_cosine_sim(e1: CreateEmbeddingResponse, e2: CreateEmbeddingResponse) -> float:
    e1, e2 = get_embedding(e1), get_embedding(e2)
    to_np = lambda e: np.array(e).reshape(1, -1)
    e1, e2 = to_np(e1), to_np(e2)
    _cos_sim = cosine_similarity(e1, e2)
    return _cos_sim[0][0]

## Cache System 

In [10]:
from diskcache import Cache

In [11]:
cache = Cache(directory=".cache_course")

In [12]:
import asyncio

In [13]:
async def set_async(key, val, **kwargs):
    return await asyncio.to_thread(cache.set, key, val, **kwargs)

async def get_async(key, default=None, **kwargs):
    return await asyncio.to_thread(cache.get, key, default, **kwargs)

In [14]:
import json
from hashlib import md5

def make_cache_key(key_name, **kwargs):
    kwargs_string = json.dumps(kwargs, sort_keys=True)
    kwargs_hash = md5(kwargs_string.encode('utf-8')).hexdigest()
    cache_key = f"{key_name}__{kwargs_hash}"
    return cache_key

## [EMBEDDING] Cached and Retried Calls

In [15]:
from pydantic import BaseModel

def _make_key_for_cached_embedding_with_retry(
    *,
    model,
    input,
    **kwargs,
):
    return make_cache_key(
        "openai_parsed_chat",
        model=model,
        input=input,
        **kwargs
    )

In [16]:
from openai.types.create_embedding_response import CreateEmbeddingResponse
from functools import wraps
from openai import APITimeoutError, RateLimitError
from pydantic import BaseModel
import backoff


CACHE_MISS_SENTINEL = object()


@wraps(client.embeddings.create)
async def cached_embedding_with_retry(
    *,
    model,
    input,
    **kwargs,
) -> CreateEmbeddingResponse:
    # CREATE CACHE KEY
    cache_key = _make_key_for_cached_embedding_with_retry(
        model=model,
        input=input,
        **kwargs
    )

    cached_value = await get_async(cache_key, default=CACHE_MISS_SENTINEL)
    # CACHE MISS
    if cached_value is CACHE_MISS_SENTINEL:
        @backoff.on_exception(
            backoff.expo,
            (APITimeoutError, RateLimitError)
        )
        async def do_call():
            return await client.embeddings.create(
                model=model,
                input=input,
                **kwargs
            )
        embedding = await do_call()
        await set_async(cache_key, embedding.model_dump_json())
        return embedding
    # CACHE HIT
    else:
        embedding = CreateEmbeddingResponse.model_validate(json.loads(cached_value))
        return embedding
        
        

## [LLM] Cached, Retried, and Traced Calls

In [17]:
from pydantic import BaseModel

def _make_key_for_cached_chat_completion_parsed_with_retry(
    *,
    model,
    messages,
    response_format: BaseModel,
    **kwargs,
):
    return make_cache_key(
        "openai_parsed_chat",
        model=model,
        messages=messages,
        response_format=response_format.model_json_schema(),
        **kwargs
    )

In [18]:
from openai.types.chat import ParsedChatCompletion
from functools import wraps
from openai import APITimeoutError, RateLimitError
from pydantic import BaseModel
from typing_extensions import TypeVar
import backoff

ResponseFormatT = TypeVar("ResponseFormatT", bound=BaseModel)

CACHE_MISS_SENTINEL = object()


@wraps(client.chat.completions.parse)
async def cached_chat_completion_parsed_with_retry(
    *,
    model,
    messages,
    response_format: ResponseFormatT,
    **kwargs,
) -> ParsedChatCompletion[ResponseFormatT]:
    # CREATE CACHE KEY
    cache_key = _make_key_for_cached_chat_completion_parsed_with_retry(
        model=model,
        messages=messages,
        response_format=response_format,
        **kwargs
    )

    cached_value = await get_async(cache_key, default=CACHE_MISS_SENTINEL)
    # CACHE MISS
    if cached_value is CACHE_MISS_SENTINEL:
        @backoff.on_exception(
            backoff.expo,
            (APITimeoutError, RateLimitError)
        )
        async def do_call():
            return await client.chat.completions.parse(
                model=model,
                messages=messages,
                response_format=response_format,
                **kwargs
            )
        completion = await do_call()
        await set_async(cache_key, completion.model_dump_json())
        return completion
    # CACHE HIT
    else:
        # TODO: Tracing Code (next section)
        # return 
        completion = ParsedChatCompletion.model_validate(json.loads(cached_value))
        for choice in completion.choices:
            if not choice.message.refusal:
                choice.message.parsed = response_format.model_validate(
                    choice.message.parsed
                )
        return completion
        
        

## Sanity Checks

In [19]:
# sanity check
embedding = await cached_embedding_with_retry(
    input="input: 'Union[str, List[str], Iterable[int], Iterable[Iterable[int]]]'",
    model=EMBED_MODEL
)
embedding_cosine_sim(embedding, embedding)

np.float64(0.9999999999999978)

In [20]:
# sanity check
from pydantic import BaseModel

class CalendarEvent(BaseModel):
    name: str
    date: str
    participants: list[str]

completion = await cached_chat_completion_parsed_with_retry(
    model=GPT4O_MINI,
    messages=[
        {"role": "system", "content": "Extract the event information."},
        {"role": "user", "content": "Alice and Bob are going to a science fair on Friday."},
    ],
    response_format=CalendarEvent,
)

event = completion.choices[0].message.parsed
event

CalendarEvent(name='Science Fair', date='Friday', participants=['Alice', 'Bob'])

---

# Synthetic Data Filtering

## Get the embedding for each question

In [21]:
import pandas as pd
import ast

emails = pd.read_csv('paul_allen_sent_email_with_questions_v1.csv')
emails['questions'] = emails['questions'].apply(ast.literal_eval)

In [22]:
def _make_ds(id_message, id_question, question, embedding=None):
    return {
        "id_message": id_message,
        "id_question": id_question,
        "question": question,
        "embedding": embedding
    }

In [23]:
questions_with_embeddings = []
index_for_questions_with_embeddings = {}

In [24]:
for e_idx, email in emails.iterrows():
    index_for_questions_with_embeddings[e_idx] = {}
    for q_idx, question in enumerate(email.questions):
        _ds = _make_ds(
            e_idx, q_idx, question
        )
        index_for_questions_with_embeddings[e_idx][q_idx] = _ds
        questions_with_embeddings.append(_ds)

In [25]:
index_for_questions_with_embeddings[0]

{0: {'id_message': 0,
  'id_question': 0,
  'question': 'What is the bid amount for Sagewood?',
  'embedding': None},
 1: {'id_message': 0,
  'id_question': 1,
  'question': 'What rate is being requested for Sagewood?',
  'embedding': None},
 2: {'id_message': 0,
  'id_question': 2,
  'question': 'What is the term dependent on for the Sagewood bid?',
  'embedding': None},
 3: {'id_message': 0,
  'id_question': 3,
  'question': 'What is the maximum rate mentioned for the Sagewood bid?',
  'embedding': None},
 4: {'id_message': 0,
  'id_question': 4,
  'question': 'Who is the recipient of the email?',
  'embedding': None},
 5: {'id_message': 0,
  'id_question': 5,
  'question': 'What asset is being referenced in the bid?',
  'embedding': None}}

In [26]:
import traceback

tasks = []

for ds in questions_with_embeddings:
    async def do_task(ds):
        try:
            ds['embedding'] = await cached_embedding_with_retry(
                input=ds['question'],
                model=EMBED_MODEL
            )
        except Exception as e:
            return (ds, e, traceback.format_exc())
            
    tasks.append(do_task(ds))

In [27]:
len(tasks)

1153

In [28]:
from tqdm.asyncio import tqdm_asyncio

results = await tqdm_asyncio.gather(*tasks)

100%|██████████| 1153/1153 [00:02<00:00, 479.82it/s]


In [29]:
len([r for r in results if r])

0

# Creating a list of Bad Questions

In [30]:
from operator import itemgetter

def find_exact_duplicates():
    get_question = itemgetter('question')
    questions = map(get_question, questions_with_embeddings)
    seen_questions = {}
    exact_duplicates = []
    for q in questions:
        if q in seen_questions:
            exact_duplicates.append(q)
        else:
            seen_questions[q] = True
    return set(exact_duplicates)

exact_duplicates = find_exact_duplicates()        

In [31]:
exact_duplicates

{"How much are the builders' bids for the house project?",
 'What does Phillip refer to himself as in the email?',
 "What is Phillip's name in the email?",
 'What is the cell phone number to reach Phillip Allen?',
 'What is the email address of Phillip Allen?',
 'What is the fax number provided in the email?',
 'What is the intended shape of the kitchen island?',
 'What is the name of the person addressed in the email?',
 'What is the overall dimension of the house mentioned in the email?',
 'What is the purpose of the email?',
 'What is the subject of the email?',
 'What material does Paul Allen suggest for the exterior of the house?',
 'What style of house is Paul Allen aiming for?',
 'What time is the meeting scheduled for?',
 'What type of roof does Paul Allen want for the house?',
 'When will Phillip talk to Lucy again?',
 'Who is the email addressed to?',
 'Who is the recipient of the email?',
 'Who is the sender of the email?',
 'Who sent the email?'}

In [32]:
bad_questions = list(exact_duplicates)

In [33]:
bad_questions

['Who sent the email?',
 'What is the cell phone number to reach Phillip Allen?',
 'What is the subject of the email?',
 'What is the fax number provided in the email?',
 'What time is the meeting scheduled for?',
 'Who is the recipient of the email?',
 'What style of house is Paul Allen aiming for?',
 'What does Phillip refer to himself as in the email?',
 "What is Phillip's name in the email?",
 'When will Phillip talk to Lucy again?',
 "How much are the builders' bids for the house project?",
 'What type of roof does Paul Allen want for the house?',
 'What is the overall dimension of the house mentioned in the email?',
 'Who is the sender of the email?',
 'What is the purpose of the email?',
 'What is the intended shape of the kitchen island?',
 'What material does Paul Allen suggest for the exterior of the house?',
 'What is the email address of Phillip Allen?',
 'What is the name of the person addressed in the email?',
 'Who is the email addressed to?']

# Find questions generated from the same email that are roughly the same

In [34]:
from itertools import combinations

In [35]:
email_combinations = {}
email_combinations_similarity = {}

for e_id in index_for_questions_with_embeddings:
    email_combinations[e_id] = list(combinations(index_for_questions_with_embeddings[e_id], 2))
    email_combinations_similarity[e_id] = []
    for (q1, q2) in email_combinations[e_id]:
        e1 = index_for_questions_with_embeddings[e_id][q1]['embedding']
        e2 = index_for_questions_with_embeddings[e_id][q2]['embedding']
        email_combinations_similarity[e_id].append(
            embedding_cosine_sim(e1, e2)
        )

In [36]:
from pprint import pprint
likely_duplicates = []

for e_id in email_combinations_similarity:
    for (q1, q2), similarity_score in zip(email_combinations[e_id], email_combinations_similarity[e_id]):
        question1 = index_for_questions_with_embeddings[e_id][q1]['question']
        question2 = index_for_questions_with_embeddings[e_id][q2]['question']
        if similarity_score > 0.8 and question1 != question2:
            likely_duplicates.append((e_id, q1, q2))
            pprint((
                f"{similarity_score:.2f}",
                f"{(e_id, q1)=}, {(e_id, q2)}",
                question1,
                question2,
            ))
            print()

('0.86',
 '(e_id, q1)=(15, 1), (15, 2)',
 'Who is the master user for ft-west after the change?',
 'Who is the master user for im-west after the change?')

('0.83',
 '(e_id, q1)=(20, 1), (20, 2)',
 'Who approved granting executing ids?',
 'To whom were the executing ids granted?')

('0.94',
 '(e_id, q1)=(21, 0), (21, 3)',
 'What time will the updated curves be sent?',
 'What date will the updated curves be sent?')

('0.81',
 '(e_id, q1)=(23, 1), (23, 3)',
 'what is being discussed in terms of kim olinger?',
 'which individual is mentioned in relation to kim olinger?')

('0.81',
 '(e_id, q1)=(52, 1), (52, 2)',
 'What time is the inspection set for?',
 'What day is the inspection scheduled?')

('0.87',
 '(e_id, q1)=(65, 1), (65, 4)',
 'What does Phillip consider for owner financing?',
 'What are the terms that Phillip is considering for owner financing?')

('0.80',
 '(e_id, q1)=(66, 0), (66, 5)',
 'What products are routed to nw basis?',
 'Which products are mentioned in relation to nw b

In [37]:
TO_KEEP = [
    '(e_id, q1)=(277, 2), (277, 3)'
] # above 0.83

TO_DELETE = [
    
]

In [38]:
pprint(set(bad_questions))

{"How much are the builders' bids for the house project?",
 'What does Phillip refer to himself as in the email?',
 "What is Phillip's name in the email?",
 'What is the cell phone number to reach Phillip Allen?',
 'What is the email address of Phillip Allen?',
 'What is the fax number provided in the email?',
 'What is the intended shape of the kitchen island?',
 'What is the name of the person addressed in the email?',
 'What is the overall dimension of the house mentioned in the email?',
 'What is the purpose of the email?',
 'What is the subject of the email?',
 'What material does Paul Allen suggest for the exterior of the house?',
 'What style of house is Paul Allen aiming for?',
 'What time is the meeting scheduled for?',
 'What type of roof does Paul Allen want for the house?',
 'When will Phillip talk to Lucy again?',
 'Who is the email addressed to?',
 'Who is the recipient of the email?',
 'Who is the sender of the email?',
 'Who sent the email?'}


# Let's find all bad questions that look alike using Embedding

In [39]:
bad_questions.extend([
    'What is the email address of Phillip Allen?',
    'What is the phone number of Phillip Allen?',
    "What is Phillip Allen's work phone number?",
    'What is the cell phone number for Phillip Allen?',
    'What day is the meeting scheduled for?',
    'What time is the meeting scheduled for?',
    'who is the sender of the email?',
    "what is the sender's name as mentioned in the email?"
])

In [40]:
bad_question_embeddings = await tqdm_asyncio.gather(*[
    cached_embedding_with_retry(input=bad_question, model=EMBED_MODEL)
    for bad_question in set(bad_questions)
])

100%|██████████| 26/26 [00:00<00:00, 545.04it/s]


In [41]:
bad_questions_set = set(bad_questions)

In [42]:
questions_to_delete = set()
questions_to_review = []

for question in questions_with_embeddings:
    if question['question'] in bad_questions_set:
        questions_to_delete.add((question['id_message'], question['id_question']))
    else:
        q_embed = question['embedding']
        for i, bad_question in enumerate(bad_questions_set):
            cos_sim = embedding_cosine_sim(bad_question_embeddings[i], q_embed)
            if cos_sim > 0.8:
                questions_to_review.append(question)
                continue

In [43]:
len(questions_to_review)

33

In [44]:
def pick_field_from_dict(dict_, keys: set[str | int]):
    return {k:v for k, v in dict_.items() if k in keys}

In [45]:
_questions_to_review = [pick_field_from_dict(q, {'id_message', 'id_question', 'question'}) for q in questions_to_review]

In [46]:
pprint(_questions_to_review)

[{'id_message': 32,
  'id_question': 4,
  'question': "What is Phillip's name mentioned in the email?"},
 {'id_message': 40,
  'id_question': 3,
  'question': 'What is the email address for Phillip Allen?'},
 {'id_message': 40,
  'id_question': 3,
  'question': 'What is the email address for Phillip Allen?'},
 {'id_message': 42,
  'id_question': 5,
  'question': 'Who is the recipient of this email?'},
 {'id_message': 42,
  'id_question': 5,
  'question': 'Who is the recipient of this email?'},
 {'id_message': 51,
  'id_question': 5,
  'question': 'What is the primary purpose of the email?'},
 {'id_message': 99,
  'id_question': 2,
  'question': 'What time is it for the meeting?'},
 {'id_message': 99,
  'id_question': 4,
  'question': 'When is the meeting scheduled for?'},
 {'id_message': 99,
  'id_question': 4,
  'question': 'When is the meeting scheduled for?'},
 {'id_message': 129,
  'id_question': 2,
  'question': 'Who is Phillip addressing in the email?'},
 {'id_message': 139,
  'i

In [47]:
{
    "id_message": 246,
    "id_question": 2,
    "question": 'Where is Phillip Allen located?'
}
{
    'id_message': 226,
    'id_question': 0,
    'question': 'What is the home number to call Phillip Allen with questions?'}

{'id_message': 226,
 'id_question': 0,
 'question': 'What is the home number to call Phillip Allen with questions?'}

In [48]:
_questions_to_delete_after_review = [q for q in _questions_to_review if (q['id_message'], q['id_question']) in {(246, 2), (226, 0)}]

In [49]:
_questions_to_delete_after_review

[{'id_message': 226,
  'id_question': 0,
  'question': 'What is the home number to call Phillip Allen with questions?'},
 {'id_message': 226,
  'id_question': 0,
  'question': 'What is the home number to call Phillip Allen with questions?'},
 {'id_message': 226,
  'id_question': 0,
  'question': 'What is the home number to call Phillip Allen with questions?'},
 {'id_message': 226,
  'id_question': 0,
  'question': 'What is the home number to call Phillip Allen with questions?'},
 {'id_message': 246,
  'id_question': 2,
  'question': 'Where is Phillip Allen located?'}]

In [50]:
questions_to_delete.update({(question['id_message'], question['id_question']) for question in _questions_to_delete_after_review})

In [51]:
len(questions_to_delete)

77

In [52]:
questions_to_improve = [
    'which individual is mentioned in relation to kim olinger?', # no information about the project; too vague
    'What does Phillip consider for owner financing?', # Phillip would not speak at the 3rd person
    'What does Phillip consider for owner financing?',
    'What are the terms that Phillip is considering for owner financing?',
    'What would the 6 month payout be per month?', # too vague
    'What formula should be used in column e?', # missing information
    'What should be the formula in column h?', # missing information
    'What is the price Phillip is prepared to pay for sagewood?', # Phillip would not speak at the 3rd person
    'Does Phillip indicate he is prepared to pay more than 2.7 for sagewood?', # Phillip would not speak at the 3rd person
    "How much higher is Phillip Allen's bid per square foot compared to Reagan's units under construction?", # Phillip would not speak at the 3rd person
    'What is the price per square foot that Phillip Allen is bidding compared to the units sold by Reagan?', # Phillip would not speak at the 3rd person
    'What does Phillip want to know about the investment dollars timeline?', # Phillip would not speak at the 3rd person
    'What does Phillip want to get a feel for regarding the investment dollars?', # Phillip would not speak at the 3rd person
    'What day and time did Phillip suggest for the meeting?', # Phillip would not speak at the 3rd person; not specific enough
    'What specific time did Phillip propose for the meeting?', # Phillip would not speak at the 3rd person; not specific enough
    'Which tenant missed rent on 1/26?', # the exact date should rather be a range of date or a month 
    'Which tenant missed rent on 2/2?', # the exact date should rather be a range of date or a month
]

# Verifying duplicates

## Prompt Template

In [53]:
from pydantic import BaseModel
from enum import Enum

class DuplicateVerdictEnum(Enum):
    is_duplicate = "same"
    is_not_duplicate = "different"

class DuplicateVerdict(BaseModel):
    concise_reasoning: str
    verdict: DuplicateVerdictEnum

In [54]:
from textwrap import dedent
from jinja2 import Template

_system_prompt_template = Template(dedent(
    """\
    Your task is to assess if 2 questions are close enough to have the same answers.

    You'll answer in JSON by respecting the following schema:
    ```ts
    {
        concise_reasoning: str
        verdict: "{{ is_duplicate }}" | "{{ is_not_duplicate }}"
    }
    ```
    """
))

In [55]:
_check_system_prompt = _system_prompt_template.render(
    is_duplicate=DuplicateVerdictEnum.is_duplicate.value,
    is_not_duplicate=DuplicateVerdictEnum.is_not_duplicate.value,
)

In [56]:
prompt_template = Template(dedent(
    """\
    Do those two questions are likely to have the same answer and can be considered duplicates of each other?

    <question 1>
    {{ question_1 }}
    </question 1>

    <question 2>
    {{ question_2 }}
    </question 2>
    
    First concisely reason step by step, then provide your final verdict.
    """
))

In [57]:
_check_prompt = prompt_template.render(
    question_1="What is the phone number of Paul Allen for contact?",
    question_2="What is the cell phone number of Paul Allen?"
)

## LLM

In [58]:
LLM_MODEL = GPT4O_MINI

_check_result = await cached_chat_completion_parsed_with_retry(
    model=LLM_MODEL,
    messages=[
        system(_check_system_prompt),
        user(_check_prompt),
    ],
    max_completion_tokens=2000,
    response_format=DuplicateVerdict
)

In [59]:
_check_result.choices[0].message.parsed

DuplicateVerdict(concise_reasoning='The first asks for a general contact phone number (which could be an office or main line), while the second specifically asks for a cell phone number. They likely refer to different numbers.', verdict=<DuplicateVerdictEnum.is_not_duplicate: 'different'>)

In [60]:
len(likely_duplicates)

32

In [61]:
tasks = []

for (e_id, q1_id, q2_id) in likely_duplicates:
    question_1 = index_for_questions_with_embeddings[e_id][q1_id]['question']
    question_2 = index_for_questions_with_embeddings[e_id][q2_id]['question']
    prompt = prompt_template.render(
        question_1=question_1,
        question_2=question_2
    )
    
    task = cached_chat_completion_parsed_with_retry(
        model=LLM_MODEL,
        messages=[
            system(_check_system_prompt),
            user(prompt),
        ],
        max_completion_tokens=2000,
        response_format=DuplicateVerdict
    )
    tasks.append(task)

In [62]:
results = await tqdm_asyncio.gather(*tasks)

100%|██████████| 32/32 [00:00<00:00, 540.49it/s]


In [72]:
"""
counter = 0

for (e_id, q1_id, q2_id), result in zip(likely_duplicates, results):
    result = result.choices[0].message.parsed
    if result.verdict == DuplicateVerdictEnum.is_duplicate:
        question_1 = index_for_questions_with_embeddings[e_id][q1_id]['question']
        question_2 = index_for_questions_with_embeddings[e_id][q2_id]['question']
        print(
            (e_id, q1_id, q2_id), '\n',
            question_1, '\n',
            question_2, '\n',
            result.concise_reasoning, '\n',
            '\n'
        )
        counter += 1
"""

(65, 1, 4) 
 What does Phillip consider for owner financing? 
 What are the terms that Phillip is considering for owner financing? 
 Both questions ask for the specific terms Phillip is considering in an owner-financing arrangement, so they seek the same information. 
 

(66, 0, 5) 
 What products are routed to nw basis? 
 Which products are mentioned in relation to nw basis? 
 Both questions seek the set of products associated with “nw basis” and would yield the same answer. 
 

(150, 0, 3) 
 What is the price per duplex for the 22 units sold by Reagan Lehmann? 
 What is the price per duplex for the units that Reagan is selling? 
 Both questions ask for the price per duplex set by Reagan for her units; the only difference is referring to “22 units sold” vs. “units she’s selling,” but they seek the identical figure. 
 

(185, 2, 3) 
 who is the sender of the email? 
 what is the sender's name as mentioned in the email? 
 Both ask for the identity of the email sender and would yield the

In [73]:
counter

5

In [75]:
for (e_id, q1_id, q2_id), result in zip(likely_duplicates, results):
    result = result.choices[0].message.parsed
    if result.verdict == DuplicateVerdictEnum.is_duplicate:
        questions_to_delete.add(
            (e_id, q2_id)
        )

In [82]:
print(f"Pourcentage of questions to delete :{len(questions_to_delete) / len(questions_with_embeddings) * 100: .2f} %", "")

Pourcentage of questions to delete : 7.03 % 


In [84]:
deduplicated_questions = []

for each in questions_with_embeddings:
    e_id = each['id_message']
    q_id = each['id_question']
    question = each['question']
    if (e_id, q_id) not in questions_to_delete:
        e_id = emails.iloc[e_id]['Message-ID']
        deduplicated_questions.append((e_id, question))

In [88]:
import json

with open('deduplicated_questions.json', 'w') as f:
    json.dump(deduplicated_questions, f)

In [89]:
with open('deduplicated_questions.json', 'r') as f:
    data = json.load(f)

In [90]:
data

[['<13537630.1075855669909.JavaMail.evans@thyme',
  'What is the bid amount for Sagewood?'],
 ['<13537630.1075855669909.JavaMail.evans@thyme',
  'What rate is being requested for Sagewood?'],
 ['<13537630.1075855669909.JavaMail.evans@thyme',
  'What is the term dependent on for the Sagewood bid?'],
 ['<13537630.1075855669909.JavaMail.evans@thyme',
  'What is the maximum rate mentioned for the Sagewood bid?'],
 ['<13537630.1075855669909.JavaMail.evans@thyme',
  'What asset is being referenced in the bid?'],
 ['<21846431.1075855673215.JavaMail.evans@thyme',
  'Who are the two analysts mentioned in the email?'],
 ['<21846431.1075855673215.JavaMail.evans@thyme',
  'Who will represent Matt Lenhart?'],
 ['<21846431.1075855673215.JavaMail.evans@thyme',
  'In what group did Vishal Apte work for the first 9 months?'],
 ['<21846431.1075855673215.JavaMail.evans@thyme',
  "Who did Paul Allen speak to regarding Vishal's performance?"],
 ['<21846431.1075855673215.JavaMail.evans@thyme',
  'What categ