In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import scipy
import re

In [7]:
# Reference Ideal Customer Profile (ICP) templates that match_icp() scores an
# input profile against. Every template uses the same eight keys:
#   - list-valued keys (industry, technology_stack, target_designations,
#     pain_points) hold plain strings,
#   - range-valued keys (engagement_rate, company_size_employees,
#     annual_revenue_usd) hold "low-high" text; the size and revenue strings
#     use an en dash and revenue carries an "M" suffix,
#   - headquarters_location is a single string.
icps = [
    # Template 1: healthcare / medical technology.
    {
        "industry": ["Healthcare Tech", "MedTech", "AI in Healthcare", "Wearable Tech"],
        "engagement_rate": "65-95",
        "company_size_employees": "100–800",
        "annual_revenue_usd": "10M–40M",
        "headquarters_location": "India",
        "technology_stack": ["Python", "AWS", "Kubernetes", "TensorFlow", "Edge AI", "FHIR", "IoT"],
        "target_designations": ["Chief Medical Officer", "CTO", "Head of AI", "Director of Product"],
        "pain_points": [
            "Medical device integration", "Data privacy compliance", "AI model explainability", "Real-time patient monitoring"
        ]
    },
    # Template 2: fintech / banking / payments.
    {
        "industry": ["FinTech", "Banking Tech", "Payments", "Blockchain"],
        "engagement_rate": "70-100",
        "company_size_employees": "300–2000",
        "annual_revenue_usd": "20M–100M",
        "headquarters_location": "India",
        "technology_stack": ["Blockchain", "React", "Node.js", "AWS", "Kafka", "Python", "Microservices"],
        "target_designations": ["Head of Payments", "VP of Engineering", "CTO", "Product Director"],
        "pain_points": [
            "Transaction latency", "Regulatory complexity", "Fraud detection automation", "Blockchain scalability"
        ]
    },
    # Template 3: manufacturing / industrial automation.
    {
        "industry": ["Manufacturing", "Industrial Automation", "IoT", "Robotics"],
        "engagement_rate": "55-85",
        "company_size_employees": "500–5000",
        "annual_revenue_usd": "50M–500M",
        "headquarters_location": "India",
        "technology_stack": ["IoT", "SCADA", "Edge Computing", "ROS", "C++", "Python", "Azure"],
        "target_designations": ["Operations Head", "VP of Engineering", "Automation Lead", "Chief Digital Officer"],
        "pain_points": [
            "Predictive maintenance", "Factory automation", "Legacy system modernization", "Data interoperability"
        ]
    },
    # Template 4: gaming / entertainment technology.
    {
        "industry": ["Gaming", "Entertainment Tech", "AR/VR", "Cloud Gaming"],
        "engagement_rate": "80-100",
        "company_size_employees": "50–500",
        "annual_revenue_usd": "5M–50M",
        "headquarters_location": "India",
        "technology_stack": ["Unity", "Unreal Engine", "C++", "AWS", "Kubernetes", "WebRTC", "VR/AR SDKs"],
        "target_designations": ["CTO", "VP of Product", "Head of Game Development", "Lead Engineer"],
        "pain_points": [
            "Low latency streaming", "Cross-platform performance", "Scalability under peak load", "Monetization challenges"
        ]
    },
    # Template 5: logistics / supply chain / fleet management.
    {
        "industry": ["Logistics Tech", "Supply Chain", "Mobility", "Fleet Management"],
        "engagement_rate": "60-90",
        "company_size_employees": "200–1500",
        "annual_revenue_usd": "15M–80M",
        "headquarters_location": "India",
        "technology_stack": ["GPS Tracking", "IoT", "Java", "React", "AWS", "PostgreSQL", "Microservices"],
        "target_designations": ["VP of Operations", "CTO", "Fleet Manager", "Product Head"],
        "pain_points": [
            "Route optimization", "Asset tracking visibility", "Fuel cost management", "Predictive maintenance"
        ]
    }
]


# Name the templates "ICP1" .. "ICP5" in list order; the name returned by
# match_icp() is later passed as the label-column name to
# train_and_get_icp_contacts().
icp_templates = {f"ICP{i+1}": icp for i, icp in enumerate(icps)}


In [8]:
import re

def parse_range(text):
    """Parse a textual numeric range into a ``(low, high)`` tuple.

    Supported shapes (spaces are ignored and an en dash is treated as "-"):

    * ``"100-800"``   -> ``(100, 800)``
    * ``"65.5-95.5"`` -> ``(65.5, 95.5)``
    * ``"98"``        -> ``(98, 98)``
    * ``"10M-40M"``   -> ``(10e6, 40e6)``; if "M" appears anywhere, every
      number in the text is scaled by one million
    * ``"50M+"``      -> ``(50e6, inf)``

    Anything that cannot be interpreted (empty text, no digits, a single
    value with an unsupported suffix such as ``"500K"``) yields the
    "unknown" range ``(0, inf)`` instead of raising.

    Args:
        text: Value to parse; it is converted with ``str()`` first.

    Returns:
        tuple: ``(low, high)``. Whole numbers outside the "M" form are
        returned as ``int``; everything else as ``float``.
    """
    text = str(text).strip().replace("–", "-").replace(" ", "")
    unknown = (0, float('inf'))

    # One pattern for every branch so that decimals are kept whole
    # ("65.5" is one token, not "65" and "5") and stray dots never reach
    # float().
    number_pattern = r'\d+(?:\.\d+)?|\.\d+'
    tokens = re.findall(number_pattern, text)
    if not tokens:
        # Covers "", "-", and words that merely contain "M" (e.g. "Medium"),
        # all of which would otherwise end in an IndexError.
        return unknown

    if "M" in text:
        nums = [float(tok) * 1e6 for tok in tokens]
        if "M+" in text:
            # Open-ended range: the first number is the lower bound.
            return (nums[0], float('inf'))
    elif "-" in text or re.fullmatch(number_pattern, text):
        nums = [float(tok) if "." in tok else int(tok) for tok in tokens]
    else:
        # Digits mixed with an unsupported suffix/prefix and no separator.
        return unknown

    # With a single number both ends coincide.
    return (nums[0], nums[-1])

def range_overlap(range1, range2):
    """Score how well two ``(low, high)`` ranges agree, from 0 to 1.

    For two proper intervals the score is intersection length divided by
    union length (a small epsilon keeps the division safe).

    Special cases:

    * Disjoint ranges score ``0``.
    * If either range is a single point (``low == high``) that lies inside
      the other range, the score is ``1.0``. A zero-length intersection
      would otherwise always score 0, so a concrete value such as
      ``(98, 98)`` could never match ``(70, 100)``.
    * If both ranges are unbounded above, the shared part is infinite and
      the score is ``1.0`` (the limit of the ratio) rather than
      ``inf / inf == nan``.

    Args:
        range1: ``(low, high)`` pair of numbers; ``high`` may be ``inf``.
        range2: ``(low, high)`` pair of numbers; ``high`` may be ``inf``.

    Returns:
        A number in ``[0, 1]``.
    """
    low = max(range1[0], range2[0])
    high = min(range1[1], range2[1])
    if low > high:
        return 0

    # Reaching here means the ranges intersect, so a degenerate (point)
    # range is necessarily contained in the other one.
    if range1[0] == range1[1] or range2[0] == range2[1]:
        return 1.0

    intersection = high - low
    if intersection == float('inf'):
        # Both ranges are open-ended; avoid inf / inf -> nan.
        return 1.0

    union = max(range1[1], range2[1]) - min(range1[0], range2[0])
    return intersection / (union + 1e-6)

def jaccard(list1, list2):
    """Case- and whitespace-insensitive Jaccard similarity of two string lists.

    Each item is stripped and lower-cased before comparison. The result is
    ``|A ∩ B| / |A ∪ B|``, or ``0`` when either side has no items.
    """
    normalized_a = {item.strip().lower() for item in list1}
    normalized_b = {item.strip().lower() for item in list2}

    if normalized_a and normalized_b:
        shared = normalized_a.intersection(normalized_b)
        combined = normalized_a.union(normalized_b)
        return len(shared) / len(combined)
    return 0

def match_icp(input_icp, icp_templates):
    """Pick the template that best matches ``input_icp``.

    Eight component scores are computed per template and averaged with
    equal weight: four list similarities (``jaccard``), three range
    similarities (``parse_range`` + ``range_overlap``) and one
    case-insensitive equality check on ``headquarters_location``.

    Args:
        input_icp: Profile dict; missing keys fall back to ``[]`` / ``""``.
        icp_templates: Mapping of template name -> template dict.

    Returns:
        tuple: ``(name, score)`` with the score rounded to three decimals.
        The first template wins ties. With no templates the result is
        ``(None, -1)``.
    """
    def set_similarity(template, field):
        return jaccard(input_icp.get(field, []), template.get(field, []))

    def interval_similarity(template, field):
        return range_overlap(
            parse_range(input_icp.get(field, "")),
            parse_range(template.get(field, "")),
        )

    winner = None
    winner_score = -1

    for name, template in icp_templates.items():
        components = (
            set_similarity(template, "industry"),
            interval_similarity(template, "company_size_employees"),
            interval_similarity(template, "annual_revenue_usd"),
            set_similarity(template, "technology_stack"),
            set_similarity(template, "target_designations"),
            1 if input_icp.get("headquarters_location", "").lower() == template.get("headquarters_location", "").lower() else 0,
            interval_similarity(template, "engagement_rate"),
            set_similarity(template, "pain_points"),
        )

        # Plain left-to-right additions, in component order.
        running_total = components[0]
        for part in components[1:]:
            running_total = running_total + part
        mean_score = running_total / 8

        if mean_score > winner_score:
            winner, winner_score = name, mean_score

    return winner, round(winner_score, 3)

In [9]:

def parse_list_column(col):
    """Split a comma-separated cell into a list of trimmed, non-empty items.

    The value is converted with ``str()`` first, so non-string cells are
    accepted.
    """
    trimmed_pieces = (piece.strip() for piece in str(col).split(","))
    return [piece for piece in trimmed_pieces if piece]

def clean_text(s):
    """Normalise a value into lower-case alphanumeric words.

    The value is stringified and lower-cased, every character other than
    ``a-z``, ``0-9`` or whitespace becomes a space, whitespace runs are
    collapsed to one space, and the ends are trimmed.
    """
    lowered = str(s).lower()
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
    return re.sub(r"\s+", " ", alnum_only).strip()

def train_and_get_icp_contacts(csv_path, icp_column="ICP3", use_predictions=False):
    """Train a per-ICP classifier and return contacts from the held-out split.

    The CSV is loaded, text / tag / numeric features are built, the rows are
    split 90/10 (``random_state=42``), a logistic regression is fitted on
    the training part and its accuracy on the test part is printed.

    Args:
        csv_path: Path (or anything ``pandas.read_csv`` accepts) of the
            contact dataset.
        icp_column: Name of the 0/1 label column to learn, e.g. ``"ICP3"``.
        use_predictions: Controls which held-out rows are returned.
            ``False`` (default, the historical behaviour) returns rows whose
            *label* in ``icp_column`` equals 1, so the trained model only
            affects the printed accuracy. ``True`` returns rows the model
            *predicts* as 1.

    Returns:
        list[dict]: One dict per selected contact with the keys
        ``first_name, last_name, title, company, company_phone, city,
        state, country``.

    Raises:
        KeyError: If ``icp_column`` is ``None`` / not a column of the CSV,
            or if any feature or contact column is missing.
    """
    text_columns = ["title", "industry", "departments", "seniority", "city", "state", "country"]
    tag_columns = ["technologies", "keywords"]
    numeric_columns = [
        "company_size_employees", "annual_revenue_usd", "total_funding_usd",
        "latest_funding_amount_usd", "engagement_rate",
    ]
    contact_columns = [
        "first_name", "last_name", "title", "company", "company_phone", "city", "state", "country"
    ]

    # Non-mutating fill keeps the freshly loaded frame's lineage explicit.
    df = pd.read_csv(csv_path).fillna("")

    # Fail early with a readable message: the label name often comes straight
    # from match_icp(), which can hand back None.
    if icp_column is None or icp_column not in df.columns:
        raise KeyError(f"Label column {icp_column!r} not found in dataset")
    required = dict.fromkeys(text_columns + tag_columns + numeric_columns + contact_columns)
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"Dataset is missing required columns: {missing}")

    # One space-joined, normalised text blob per row for the TF-IDF vectoriser.
    text_features = df[text_columns[0]].apply(clean_text)
    for col in text_columns[1:]:
        text_features = text_features + " " + df[col].apply(clean_text)
    df["text_features"] = text_features

    texts = df["text_features"].tolist()
    techs = df["technologies"].apply(parse_list_column).tolist()
    keywords = df["keywords"].apply(parse_list_column).tolist()

    # Unparseable / blank numerics become 0.
    numerics = df[numeric_columns].copy()
    for col in numeric_columns:
        numerics[col] = pd.to_numeric(numerics[col], errors="coerce")
    numerics = numerics.fillna(0)

    labels = df[icp_column].astype(int).tolist()

    # The row partition depends only on the row count and random_state, so
    # splitting fewer arrays yields the same train/test rows as before.
    (texts_train, texts_test,
     techs_train, techs_test,
     keywords_train, keywords_test,
     numerics_train, numerics_test,
     y_train, y_test,
     _, df_test) = train_test_split(
        texts, techs, keywords, numerics, labels, df, test_size=0.1, random_state=42
    )

    text_vectorizer = TfidfVectorizer(max_features=1000)
    tech_mlb = MultiLabelBinarizer()
    kw_mlb = MultiLabelBinarizer()
    scaler = StandardScaler()

    # Fit every transformer on the training rows only, then reuse it for test.
    X_train = scipy.sparse.hstack([
        text_vectorizer.fit_transform(texts_train),
        tech_mlb.fit_transform(techs_train),
        kw_mlb.fit_transform(keywords_train),
        scaler.fit_transform(numerics_train),
    ])
    X_test = scipy.sparse.hstack([
        text_vectorizer.transform(texts_test),
        tech_mlb.transform(techs_test),
        kw_mlb.transform(keywords_test),
        scaler.transform(numerics_test),
    ])

    clf = LogisticRegression(max_iter=2000)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy on Test Set for {icp_column}: {accuracy * 100:.2f}%")

    df_test = df_test.reset_index(drop=True)
    if use_predictions:
        # y_pred is positionally aligned with the re-indexed test frame.
        selected_mask = y_pred == 1
    else:
        selected_mask = df_test[icp_column] == 1
    selected_contacts = df_test[selected_mask]

    return selected_contacts[contact_columns].to_dict(orient="records")

In [None]:
# Example profile with range-style values. NOTE(review): it is defined here
# but not used below -- the match_icp() call is made with `other`.
input_icp  = {
    "industry": ["Fleet Management", "Transport Analytics"],
    "engagement_rate": "65-85",
    "company_size_employees": "300–1200",
    "annual_revenue_usd": "20M–50M",
    "headquarters_location": "India",
    "technology_stack": ["IoT", "Java", "React", "Microservices", "PostgreSQL"],
    "target_designations": ["Product Head", "CTO", "Mobility Lead"],
    "pain_points": ["Asset tracking visibility", "Fuel monitoring", "Route planning"]
}

# Second example profile: it uses single values ("98", "1600", "90M") rather
# than ranges for the three numeric fields. This is the profile that is
# actually matched.
other={
    "industry": ["FinTech", "Banking Tech", "Payments", "Blockchain"],
    "engagement_rate": "98",
    "company_size_employees": "1600",
    "annual_revenue_usd": "90M",
    "headquarters_location": "India",
    "technology_stack": ["Blockchain", "React", "Node.js", "AWS", "Kafka", "Python", "Microservices"],
    "target_designations": ["CTO"],
    "pain_points": [
        "Transaction latency",
        "Regulatory complexity",
        "Fraud detection automation",
        "Blockchain scalability"
    ]
}

# match_icp() returns (template_name, score); element 0 is the template name
# (e.g. "ICP2"), which is passed on as the label-column name for training.
input_icp_matched=match_icp(other,icp_templates)
contacts = train_and_get_icp_contacts("noisy_icp_dataset.csv", input_icp_matched[0])
# Print one contact dict per line.
for c in contacts:
    print(c)
