In [None]:
import pandas as pd
import random
import re

# === ICP Definitions ===
# Profile definitions used to seed the synthetic contacts.
#
# Every profile is a plain dict with the same eight keys. Numeric ranges are
# stored as strings: "engagement_rate" uses an ASCII hyphen, while the size and
# revenue ranges use an en dash and an optional "M" (millions) suffix; all of
# them are decoded by parse_range(). A profile's position in this list decides
# which ICP1..ICP5 flag column generate_contact() sets for it.
icps = [
    # Profile 1: healthcare and medical technology.
    {
        "industry": [
            "Healthcare Tech",
            "MedTech",
            "AI in Healthcare",
            "Wearable Tech",
        ],
        "engagement_rate": "65-95",
        "company_size_employees": "100–800",
        "annual_revenue_usd": "10M–40M",
        "headquarters_location": "India",
        "technology_stack": [
            "Python",
            "AWS",
            "Kubernetes",
            "TensorFlow",
            "Edge AI",
            "FHIR",
            "IoT",
        ],
        "target_designations": [
            "Chief Medical Officer",
            "CTO",
            "Head of AI",
            "Director of Product",
        ],
        "pain_points": [
            "Medical device integration",
            "Data privacy compliance",
            "AI model explainability",
            "Real-time patient monitoring",
        ],
    },
    # Profile 2: financial technology and payments.
    {
        "industry": [
            "FinTech",
            "Banking Tech",
            "Payments",
            "Blockchain",
        ],
        "engagement_rate": "70-100",
        "company_size_employees": "300–2000",
        "annual_revenue_usd": "20M–100M",
        "headquarters_location": "India",
        "technology_stack": [
            "Blockchain",
            "React",
            "Node.js",
            "AWS",
            "Kafka",
            "Python",
            "Microservices",
        ],
        "target_designations": [
            "Head of Payments",
            "VP of Engineering",
            "CTO",
            "Product Director",
        ],
        "pain_points": [
            "Transaction latency",
            "Regulatory complexity",
            "Fraud detection automation",
            "Blockchain scalability",
        ],
    },
    # Profile 3: manufacturing and industrial automation.
    {
        "industry": [
            "Manufacturing",
            "Industrial Automation",
            "IoT",
            "Robotics",
        ],
        "engagement_rate": "55-85",
        "company_size_employees": "500–5000",
        "annual_revenue_usd": "50M–500M",
        "headquarters_location": "India",
        "technology_stack": [
            "IoT",
            "SCADA",
            "Edge Computing",
            "ROS",
            "C++",
            "Python",
            "Azure",
        ],
        "target_designations": [
            "Operations Head",
            "VP of Engineering",
            "Automation Lead",
            "Chief Digital Officer",
        ],
        "pain_points": [
            "Predictive maintenance",
            "Factory automation",
            "Legacy system modernization",
            "Data interoperability",
        ],
    },
    # Profile 4: gaming and entertainment technology.
    {
        "industry": [
            "Gaming",
            "Entertainment Tech",
            "AR/VR",
            "Cloud Gaming",
        ],
        "engagement_rate": "80-100",
        "company_size_employees": "50–500",
        "annual_revenue_usd": "5M–50M",
        "headquarters_location": "India",
        "technology_stack": [
            "Unity",
            "Unreal Engine",
            "C++",
            "AWS",
            "Kubernetes",
            "WebRTC",
            "VR/AR SDKs",
        ],
        "target_designations": [
            "CTO",
            "VP of Product",
            "Head of Game Development",
            "Lead Engineer",
        ],
        "pain_points": [
            "Low latency streaming",
            "Cross-platform performance",
            "Scalability under peak load",
            "Monetization challenges",
        ],
    },
    # Profile 5: logistics, supply chain and fleet management.
    {
        "industry": [
            "Logistics Tech",
            "Supply Chain",
            "Mobility",
            "Fleet Management",
        ],
        "engagement_rate": "60-90",
        "company_size_employees": "200–1500",
        "annual_revenue_usd": "15M–80M",
        "headquarters_location": "India",
        "technology_stack": [
            "GPS Tracking",
            "IoT",
            "Java",
            "React",
            "AWS",
            "PostgreSQL",
            "Microservices",
        ],
        "target_designations": [
            "VP of Operations",
            "CTO",
            "Fleet Manager",
            "Product Head",
        ],
        "pain_points": [
            "Route optimization",
            "Asset tracking visibility",
            "Fuel cost management",
            "Predictive maintenance",
        ],
    },
]

# === Utility Functions ===
def parse_range(text):
    """Parse a textual numeric range into an inclusive ``(low, high)`` pair.

    En dashes are converted to hyphens and spaces are removed before parsing.
    Recognised forms:

    * ``"50M+"``    -> ``(50_000_000, 1_000_000_000)`` (open-ended, capped at 1e9)
    * ``"10M–40M"`` -> ``(10_000_000, 40_000_000)`` (every number is scaled by 1e6)
    * ``"100-800"`` -> ``(100, 800)``
    * ``"42"``      -> ``(42, 42)``

    Args:
        text: The range to parse; it is passed through ``str()`` first, so
            plain integers are accepted as well.

    Returns:
        A tuple ``(low, high)`` of ints. Text that cannot be parsed, including
        text that contains a range marker but no usable number, yields the
        catch-all ``(0, 1_000_000_000)``.
    """
    fallback = (0, int(1e9))
    text = str(text).strip().replace("–", "-").replace(" ", "")
    try:
        if "M+" in text:
            base = float(text.replace("M+", "")) * 1e6
            return int(base), int(1e9)
        if "M" in text:
            nums = [float(x) * 1e6 for x in re.findall(r'[\d.]+', text)]
        elif "-" in text:
            nums = [int(x) for x in re.findall(r'\d+', text)]
        elif text.isdigit():
            return int(text), int(text)
        else:
            return fallback
    except (ValueError, OverflowError):
        # Malformed numbers (e.g. an empty base before "M+" or a lone ".")
        # are treated like any other unparseable text.
        return fallback
    if not nums:
        # A marker such as "M" or "-" was present but no digits were found;
        # indexing nums would raise IndexError.
        return fallback
    # With a single number, nums[0] and nums[-1] are the same element.
    return int(nums[0]), int(nums[-1])

def distort_text(text):
    """Return ``text`` with its letter case randomly altered.

    A first random draw gives a 30% chance of returning a lower-cased copy.
    Only when that draw misses is a second draw made, giving a 20% chance of
    an upper-cased copy; otherwise ``text`` is returned unchanged.
    """
    lowercase_roll = random.random()
    if lowercase_roll >= 0.3:
        uppercase_roll = random.random()
        return text.upper() if uppercase_roll < 0.2 else text
    return text.lower()

def add_noise_to_list(items, pool, chance=0.4):
    """Return a de-duplicated, noisy copy of ``items``.

    Each item is passed through ``distort_text`` with probability ``chance``.
    With a further 30% probability one random element of ``pool`` is appended.

    Args:
        items: Iterable of hashable values (strings in this file).
        pool: Sequence of extra values to draw the optional noise item from.
            An empty pool simply means no noise item is ever added.
        chance: Per-item probability of applying ``distort_text``.

    Returns:
        A new list of the unique resulting values in first-seen order, so the
        output is reproducible whenever the ``random`` module is seeded.
    """
    noisy = [
        distort_text(item) if random.random() < chance else item
        for item in items
    ]
    # The draw is made before checking the pool so the sequence of random
    # numbers consumed does not depend on whether the pool is empty.
    if random.random() < 0.3 and pool:
        noisy.append(random.choice(pool))
    # dict.fromkeys de-duplicates like set() but keeps insertion order; set
    # ordering of strings varies between interpreter runs.
    return list(dict.fromkeys(noisy))

# === Name & Location Data ===
# Pools from which generate_contact() draws a person's name and the
# (city, state) pair of the company address.
first_names = [
    "Amit", "Sneha", "Raj", "Priya", "Anil",
    "Kiran", "Deepak", "Ritu", "Vivek", "Neha",
]
last_names = [
    "Sharma", "Verma", "Patel", "Reddy", "Iyer",
    "Nair", "Kumar", "Das", "Joshi", "Singh",
]
cities_states = [
    ("Bangalore", "Karnataka"),
    ("Mumbai", "Maharashtra"),
    ("Hyderabad", "Telangana"),
    ("Chennai", "Tamil Nadu"),
    ("Pune", "Maharashtra"),
    ("Gurgaon", "Haryana"),
]

# Off-profile values: used as noise items and as the fallback pools for
# contacts that are not drawn from a profile.
extra_tech = [
    "COBOL",
    "VBScript",
    "Fortran",
    "Delphi",
    "Lisp",
]
extra_titles = [
    "Intern",
    "QA Tester",
    "Business Analyst",
    "Jr Engineer",
]
extra_pain = [
    "UI bugs",
    "Legacy tools",
    "Slow hiring",
    "Low adoption",
]

# === Data Generator ===
def generate_contact(icp=None, match_ratio=0.5):
    """Build one synthetic contact row for the noisy ICP dataset.

    Args:
        icp: One of the profile dicts in ``icps``, or ``None`` for a contact
            that is not modelled on any profile.
        match_ratio: Probability, applied independently to the title, the
            company size and the industry, that the field is drawn from
            ``icp`` instead of an off-profile value. When ``icp`` is given and
            ``match_ratio`` is at least 0.6, the row is also flagged as
            belonging to that profile.

    Returns:
        A list of 25 values: first name, last name, title, company, seniority,
        department, company size, industry, keywords, city, state, country,
        address, phone, technologies, annual revenue, total funding, latest
        funding amount, pain points, engagement rate, followed by five 0/1
        profile flags (one per position in ``icps``).

    Raises:
        ValueError: If ``icp`` is flagged (``match_ratio >= 0.6``) but is not
            an element of ``icps``.
    """
    first = random.choice(first_names)
    last = random.choice(last_names)
    city, state = random.choice(cities_states)

    if icp and random.random() < match_ratio:
        designation = random.choice(icp["target_designations"])
    else:
        designation = random.choice(extra_titles)
    # Seniority is the first word of a multi-word title; single-word titles
    # (e.g. "CTO", "Intern") all fall back to "Manager".
    seniority = designation.split()[0] if " " in designation else "Manager"
    departments = "Engineering"
    company = "Company" + str(random.randint(100, 999))

    if icp and random.random() < match_ratio:
        size = random.randint(*parse_range(icp["company_size_employees"]))
    else:
        size = random.randint(10, 10000)

    if icp and random.random() < match_ratio:
        industry = random.choice(icp["industry"])
    else:
        industry = "Retail"

    tech_stack = icp["technology_stack"] if icp else extra_tech
    techs = add_noise_to_list(random.sample(tech_stack, k=min(3, len(tech_stack))), extra_tech)
    keywords = random.sample(techs, k=min(3, len(techs)))

    address = f"{random.randint(1, 200)} Tech Road, {city}, {state}, India"
    phone = f"+91 {random.randint(7000000000, 9999999999)}"

    if icp:
        revenue = random.randint(*parse_range(icp["annual_revenue_usd"]))
    else:
        revenue = random.randint(1_000_000, 500_000_000)
    # Revenue below 2,000,000 would make revenue // 2 smaller than the lower
    # bound and randint would raise ValueError; clamp the upper bound so such
    # rows get exactly 1,000,000 of funding.
    funding = random.randint(1_000_000, max(1_000_000, revenue // 2))
    latest_funding = random.randint(500_000, funding)

    pains = icp["pain_points"] if icp else extra_pain
    # Cap the sample size at the pool size, mirroring the tech-stack sampling.
    pain_points = add_noise_to_list(random.sample(pains, min(2, len(pains))), extra_pain)

    if icp:
        engagement = random.randint(*parse_range(icp["engagement_rate"]))
    else:
        engagement = random.randint(10, 60)

    # One flag per profile; only strong matches are labelled.
    icp_flags = [0] * 5
    if icp and match_ratio >= 0.6:
        icp_flags[icps.index(icp)] = 1

    return [
        first, last, designation, company, seniority, departments, size, industry, ", ".join(keywords),
        city, state, "India", address, phone, ", ".join(techs), revenue, funding, latest_funding,
        ", ".join(pain_points), engagement, *icp_flags
    ]

# === Dataset Creation ===
# Build 300 rows: for each profile 30 strong matches followed by 20 weak
# matches, then 50 contacts that are not based on any profile.
data = []
for icp in icps:
    # Strong match: high match_ratio, so the row also gets its profile flag.
    data.extend(generate_contact(icp, match_ratio=0.9) for _ in range(30))
    # Weak match: below the flagging threshold, so all flags stay 0.
    data.extend(generate_contact(icp, match_ratio=0.4) for _ in range(20))
# Non-ICP contacts.
data.extend(generate_contact(None, match_ratio=0.0) for _ in range(50))

# Column names, in the same order as the list returned by generate_contact().
columns = [
    # Person
    "first_name", "last_name", "title",
    # Company
    "company", "seniority", "departments", "company_size_employees",
    "industry", "keywords",
    # Location and contact details
    "city", "state", "country", "company_address", "company_phone",
    # Technology and financials
    "technologies",
    "annual_revenue_usd", "total_funding_usd", "latest_funding_amount_usd",
    # Behavioural signals
    "pain_points", "engagement_rate",
    # One 0/1 label per profile
    "ICP1", "ICP2", "ICP3", "ICP4", "ICP5",
]

df = pd.DataFrame(data, columns=columns)
df.to_csv("noisy_icp_dataset.csv", index=False)
print("✅ Noisy ICP dataset saved to 'noisy_icp_dataset.csv'")

✅ Noisy ICP dataset saved to 'noisy_icp_dataset.csv'
