# Creating a Sample Dataset with LangChain

## Data without AI

### Imports

In [2]:
import random
import json
from datetime import timedelta

import pandas as pd
from faker import Faker

In [3]:
# Overall Configuration
SEED = 42
NUM_ENTRIES = 500

fake = Faker(SEED)
random.seed(SEED)

In [4]:
SERVICE_CHANNELS = ['email', 'chatbot', 'WhatsApp', 'phone', 'web form']
SERVICE_TYPES = ['technical support', 'complaint', 'inquiry', 'quote request']
SERVICE_CATEGORIES = ['financial', 'technical', 'commercial']
SERVICE_STATUSES = ['open', 'in progress', 'resolved', 'pending', 'canceled']

class Service():
    def __init__(self):
        self.service_id = fake.uuid4()
        self.date_time_of_service = fake.date_time_this_year()
        self.service_channel = random.choice(SERVICE_CHANNELS)
        self.problem_description = None
        self.service_category = random.choice(SERVICE_CATEGORIES)
        
        match self.service_category:
            case 'financial':
                self.service_type = 'quote request'
            case 'technical':
                self.service_type = 'technical support'
            case 'commercial':
                self.service_type = random.choice(['complaint', 'inquiry'])

        self.service_status = random.choice(SERVICE_STATUSES)

In [5]:
CUSTOMER_TYPES = ['individual', 'business']

class Customer():
    def __init__(self, service: Service):
        self.customer_id = fake.uuid4()
        self.customer_type = random.choice(CUSTOMER_TYPES)
        self.customer_name = fake.name() if self.customer_type == 'individual' else fake.company()
        self.phone_number = fake.phone_number()
        self.address = fake.address() if service.service_category == 'technical support' else None

In [6]:
class ServiceRepresentative():
    def __init__(self, service: Service):
        match service.service_category:
            case 'financial':
                self.name = random.choice(['Esther Dagir', 'Sarah Pimenta'])
                self.department = 'finance'
            case 'technical':
                self.name = 'Gabriel Onishi'
                self.department = 'support'
            case 'commercial':
                self.name = 'Sarah Pimenta'
                self.department = 'sales'
            case _:
                raise ValueError

In [7]:
RATINGS = [0, 1, 2, 3, 4, 5]
MAX_DAYS_AFTER_SERVICE = 365

class SolutionAndFeedback():
    def __init__(self, service: Service):
        self.applied_solution = None
        self.completion_date = None
        self.satisfactory_rating = None
        self.customer_comment = None
        if service.service_status in ['resolved', 'canceled']:
            self.completion_date = fake.date_between_dates(service.date_time_of_service, service.date_time_of_service + timedelta(MAX_DAYS_AFTER_SERVICE))
            if service.service_status == "resolved":
                self.satisfactory_rating = random.choice(RATINGS)

In [31]:
class DataEntry():
    def __init__(self):
        self.service = Service()
        self.customer = Customer(self.service)
        self.solution_and_feedback = SolutionAndFeedback(self.service)
    
    def dump_json(self):
        def flatten(obj):
            flat = {}
            if isinstance(obj, (str, int, float, bool)) or obj is None:
                return flat  # Skip top-level primitives
            elif isinstance(obj, list):
                for item in obj:
                    flat.update(flatten(item))
            elif hasattr(obj, '__dict__'):
                for k, v in obj.__dict__.items():
                    if isinstance(v, (str, int, float, bool)) or v is None:
                        flat[k] = v
                    else:
                        flat.update(flatten(v))
            return flat
        return json.dumps(flatten(self), indent=4)

In [9]:
DataEntry().dump_json()

'{\n  "service_id": "1358c2e7-adfc-420b-843d-b1f66aa8623f",\n  "service_channel": "email",\n  "problem_description": null,\n  "service_category": "financial",\n  "service_type": "quote request",\n  "service_status": "resolved",\n  "customer_id": "57db3ce9-e3c7-400b-99e7-b97d82d87832",\n  "customer_type": "individual",\n  "customer_name": "Brenda Anderson",\n  "phone_number": "954-481-6481",\n  "address": null,\n  "applied_solution": null,\n  "satisfactory_rating": 1,\n  "customer_comment": null\n}'

In [10]:
entries_list = [DataEntry() for _ in range(NUM_ENTRIES)]

In [11]:
services = set()
for entry in entries_list:
    if entry.service.service_status == 'resolved':
        t = json.loads(entry.dump_json())
        break

t

{'service_id': '7452431b-d143-4a24-947a-6c02a3b13b06',
 'service_channel': 'WhatsApp',
 'problem_description': None,
 'service_category': 'technical',
 'service_type': 'technical support',
 'service_status': 'resolved',
 'customer_id': '5630d636-d219-4f23-95c5-f5caeb5c84f8',
 'customer_type': 'individual',
 'customer_name': 'Kevin Colon',
 'phone_number': '001-357-551-3056x8605',
 'address': None,
 'applied_solution': None,
 'satisfactory_rating': 2,
 'customer_comment': None}

## LangChain

In [12]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field

In [13]:
# Settings
TEMPERATURE = 0.7
MODEL = "gemini-1.5-flash"

In [14]:
llm = ChatGoogleGenerativeAI(
    model=MODEL,
    temperature=TEMPERATURE,
)

In [15]:
resolved_prompt = ChatPromptTemplate.from_template('''\
Create fake data for a dataset. Based on the information about the customer review \
provide the following information:
 - problem_description: a short text describing the problem
 - applied_solution: a short text describing the solution
 - customer_comment: a short comment from the customer

Customer review information:
 - service_channel: {service_channel} 
 - service_category: {service_category} 
 - service_type: {service_type} 
 - satisfactory_rating: {satisfactory_rating}\
''')

class ResolvedOutput(BaseModel):
    problem_description: str = Field(description='a short text describing the problem')
    applied_solution: str = Field(description='a short text describing the solution')
    customer_comment: str = Field(description='a short comment from the customer')

resolved_llm = llm.with_structured_output(ResolvedOutput)

resolved_chain = resolved_prompt | resolved_llm

In [16]:
unresolved_prompt = ChatPromptTemplate.from_template('''\
Create fake data for a dataset. Based on the information about the customer review \
provide the following information:
 - problem_description: a short text describing the problem

Customer review information:
 - service_channel: {service_channel} 
 - service_category: {service_category} 
 - service_type: {service_type}\
''')

class UnresolvedOutput(BaseModel):
    problem_description: str = Field(description='a short text describing the problem')

unresolved_llm = llm.with_structured_output(UnresolvedOutput)

unresolved_chain = unresolved_prompt | unresolved_llm

In [17]:
t = {'t': 123, 'j': 'asdf'}

print({**t})

{'t': 123, 'j': 'asdf'}


In [25]:
full_entries = []
resolved_info = None
for entry in entries_list:
    if entry.service.service_status == 'resolved':
        review_info = {
            'service_channel': entry.service.service_channel,
            'service_category': entry.service.service_category,
            'service_type': entry.service.service_type,
            'satisfactory_rating': entry.solution_and_feedback.satisfactory_rating
        }
        resolved_info = resolved_chain.invoke(review_info)
        if isinstance(resolved_info, ResolvedOutput):
            entry.service.problem_description = resolved_info.problem_description
            entry.solution_and_feedback.applied_solution = resolved_info.applied_solution
            entry.solution_and_feedback.customer_comment = resolved_info.customer_comment
        
    else:
        review_info = {
            'service_channel': entry.service.service_channel,
            'service_category': entry.service.service_category,
            'service_type': entry.service.service_type
        }
        unresolved_info = unresolved_chain.invoke(review_info)
        if isinstance(unresolved_info, UnresolvedOutput):
            entry.service.problem_description = unresolved_info.problem_description

    full_entries.append(entry)

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 55
}
].


ResourceExhausted: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. [violations {
  quota_metric: "generativelanguage.googleapis.com/generate_content_free_tier_requests"
  quota_id: "GenerateRequestsPerMinutePerProjectPerModel-FreeTier"
  quota_dimensions {
    key: "model"
    value: "gemini-1.5-flash"
  }
  quota_dimensions {
    key: "location"
    value: "global"
  }
  quota_value: 15
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
, retry_delay {
  seconds: 53
}
]

In [None]:
# with open('db.jsonl', 'w') as f:
#     f.write('[\n')
#     for entry in full_entries:
#         f.write(entry.dump_json() + ',\n')
#     f.write(']')

# Turn the entries into a jsonl file
with open('db.jsonl', 'w') as f:
    f.write('[\n')
    for entry in full_entries[:-1]:
        f.write(entry.dump_json() + ',\n')
    f.write(full_entries[-1].dump_json() + '\n')  # Last entry without a comma
    f.write(']')

In [None]:
prompt_template = ChatPromptTemplate.from_messages