In [1]:
import pandas as pd
import json
import os
import numpy as np
from typing import List, Dict, Any
import re
import asyncio

In [4]:
import pandas as pd
import json
import os
import numpy as np
from typing import List, Dict, Any
import re
import asyncio

# Load the combined dataset
file_path = "/content/Combined file.xlsx - Sheet1.json"

# Load the JSON data
with open(file_path, 'r') as f:
    combined_data = json.load(f)

# Preview the first few entries to understand the structure
print(f"Total entries: {len(combined_data)}")


Total entries: 2572


**DATA READING**

In [5]:
print("\nSample entry:")
print(json.dumps(combined_data[0], indent=2))


Sample entry:
{
  "Name": "01 Ventures",
  "Website": "https://www.01ventures.com/",
  "Global_HQ": "Netherlands",
  "Countries": "UK,Netherlands",
  "Stage": "Pre-seed, Idea, Prototype/MVP, Seed",
  "Overview": "We invest in deep tech innovations including software and hardware solutions to the world's biggest challenges.",
  "Type": "VC",
  "Industry": "Information Technology & Services",
  "Cheque_range": "$250K - $2M",
  "Linkedin_Company": "",
  "Email": "",
  "Linkedin_Personal": "",
  "Twitter": ""
}


In [6]:
print("\nSample entry:")
print(json.dumps(combined_data[200], indent=2))


Sample entry:
{
  "Name": "Cadence Growth Capital (CGC)",
  "Website": "https://www.cadencegrowthcapital.com/",
  "Global_HQ": "Germany",
  "Countries": "Germany,Austria,Switzerland",
  "Stage": "Pre-seed, Pre-IPO, Series+",
  "Overview": "We invest in:\n\u2022 Proven, tech-driven business models\n\u2022 Based within the DACH region, though open to exceptions\n\u2022 Demonstrating a distinct, imminent trajectory towards profitability, or ideally, having already attained it\n\u2022 Through minority (primarily Series+ and beyond) or majority investments\n\nOur investment themes include but are not limited to Circular Economy, Cloud Infrastructure, Compliance Tech, Cyber Security, Data, Data Analytics, Data Center Infrastructure, Data Management, Energy Transition, ERP, HCM, FinTech & InsurTech, Healthcare / HealthTech, Veterinary Services, HR Tech, Ed Tech, Mobility, PropTech, VMS, IndustrialTech, E-Commerce, MarketingTech, AI, SaaS, Marketplaces and IT Services. ",
  "Type": "PE fund",

In [8]:
print("\nSample entry:")
print(json.dumps(combined_data[2571], indent=2))


Sample entry:
{
  "Name": "Zvi Schreiber",
  "Website": "http://www.wikimediafoundation.org",
  "Global_HQ": "Hong Kong",
  "Countries": "Global",
  "Stage": "Pre-seed, Seed, Series+",
  "Overview": "",
  "Type": "Solo Angel",
  "Industry": "Sector Agnostic",
  "Cheque_range": "$1KK - 3M",
  "Linkedin_Company": "http://www.linkedin.com/in/zschreiber",
  "Email": "",
  "Linkedin_Personal": "http://www.linkedin.com/company/wikimedia-foundation",
  "Twitter": "https://twitter.com/Wikimedia"
}


# DATA CLEANING

In [9]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
import re

# Load the dataset
# Assuming you have the full JSON dataset loaded
investors_df = pd.read_json('/content/Combined file.xlsx - Sheet1.json')

# Handle missing values
# Replace empty strings with NaN for better handling
investors_df = investors_df.replace('', np.nan)

In [10]:
# Clean up 'Type' column - standardize investor types
def standardize_investor_type(investor_type):
    if pd.isna(investor_type):
        return np.nan

    # Remove newlines and strip whitespace
    cleaned = re.sub(r'\n+', '', investor_type).strip()

    # Standardize common variations
    if 'Solo angel' in cleaned or 'Solo Angel' in cleaned or 'solo Angel' in cleaned:
        return 'Solo Angel'
    elif 'Angel network' in cleaned or 'Angel Network' in cleaned:
        return 'Angel Network'
    elif 'Family office' in cleaned or 'Family Office' in cleaned:
        return 'Family Office'
    elif 'Seed fund' in cleaned:
        return 'Seed Fund'
    elif 'Multi-stage VC' in cleaned:
        return 'Multi-stage VC'
    elif 'Startup studio' in cleaned or 'Startup Studio' in cleaned:
        return 'Startup Studio'
    elif 'Incubator, Accelerator' in cleaned:
        return 'Incubator/Accelerator'
    else:
        return cleaned

investors_df['Type'] = investors_df['Type'].apply(standardize_investor_type)


In [11]:
investors_df['Type']

Unnamed: 0,Type
0,VC
1,Family Office
2,VC
3,Family Office
4,VC
...,...
2567,Solo Angel
2568,Solo Angel
2569,Solo Angel
2570,Solo Angel


In [12]:
# Parse the cheque range into min and max values
def parse_cheque_range(range_str):
    if pd.isna(range_str):
        return np.nan, np.nan

    # Extract numbers with K or M denomination
    matches = re.findall(r'\$(\d+(?:\.\d+)?)([KM])', range_str)

    if len(matches) >= 2:
        min_val = float(matches[0][0]) * (1000 if matches[0][1] == 'K' else 1000000)
        max_val = float(matches[1][0]) * (1000 if matches[1][1] == 'K' else 1000000)
        return min_val, max_val

    return np.nan, np.nan

# Create new columns for min and max investment amounts
investors_df['Min_Investment'], investors_df['Max_Investment'] = zip(*investors_df['Cheque_range'].apply(parse_cheque_range))

# Handle Countries column - convert to list
investors_df['Countries_List'] = investors_df['Countries'].apply(lambda x: x.split(',') if pd.notna(x) else [])


In [13]:
# Handle missing Overview - create a simple description based on other fields
def generate_overview(row):
    if pd.notna(row['Overview']):
        return row['Overview']

    overview = f"Investor {row['Name']} based in {row['Global_HQ']}. "
    if pd.notna(row['Industry']) and row['Industry'] != '':
        overview += f"Invests in {row['Industry']}. "
    if pd.notna(row['Stage']) and row['Stage'] != '':
        overview += f"Focuses on {row['Stage']} stage startups. "
    if pd.notna(row['Cheque_range']) and row['Cheque_range'] != '':
        overview += f"Typical investment range: {row['Cheque_range']}."

    return overview

investors_df['Processed_Overview'] = investors_df.apply(generate_overview, axis=1)

In [14]:
# Extract industries into a list
investors_df['Industry_List'] = investors_df['Industry'].apply(
    lambda x: [industry.strip() for industry in x.split(',')] if pd.notna(x) else []
)

# Extract stages into a list
investors_df['Stage_List'] = investors_df['Stage'].apply(
    lambda x: [stage.strip() for stage in x.split(',')] if pd.notna(x) else []
)

# Calculating match score

In [15]:
def calculate_match_score(founder_info, investor_df):
    """
    Calculate match scores between a founder and all investors

    Parameters:
    -----------
    founder_info : dict
        Dictionary containing founder information
    investor_df : pandas.DataFrame
        DataFrame containing investor information

    Returns:
    --------
    pandas.DataFrame
        DataFrame with investors and their match scores
    """
    # Extract founder details
    founder_industry = founder_info.get('industry', '')
    founder_stage = founder_info.get('stage', '')
    founder_funding_required = founder_info.get('funding_required', 0)
    founder_country = founder_info.get('country', '')
    founder_description = founder_info.get('description', '')

    # Create a copy of investor dataframe to add match scores
    results_df = investor_df[['Name', 'Website', 'Type', 'Global_HQ', 'Industry', 'Stage', 'Cheque_range', 'Processed_Overview']].copy()

    # Initialize scores
    results_df['Match_Score'] = 0.0

    # 1. Check industry match (30% weight)
    def calculate_industry_score(investor_industries):
        if pd.isna(investor_industries) or founder_industry == '':
            return 0.0

        investor_industries_list = [ind.strip().lower() for ind in investor_industries.split(',')]

        # Check if founder's industry is in investor's industries or if investor is sector agnostic
        if 'sector agnostic' in investor_industries_list or founder_industry.lower() in investor_industries_list:
            return 1.0
        elif any(ind in founder_industry.lower() for ind in investor_industries_list):
            return 0.7
        else:
            return 0.0

    results_df['Industry_Score'] = results_df['Industry'].apply(calculate_industry_score)

    # 2. Check stage match (20% weight)
    def calculate_stage_score(investor_stages):
        if pd.isna(investor_stages) or founder_stage == '':
            return 0.0

        investor_stages_list = [stage.strip().lower() for stage in investor_stages.split(',')]

        # Check if founder's stage is in investor's stages
        if founder_stage.lower() in investor_stages_list:
            return 1.0

        stage_proximity = {
            'idea': ['idea', 'pre-seed', 'prototype/mvp'],
            'pre-seed': ['idea', 'pre-seed', 'prototype/mvp', 'seed'],
            'prototype/mvp': ['idea', 'pre-seed', 'prototype/mvp', 'seed'],
            'seed': ['pre-seed', 'prototype/mvp', 'seed', 'series a'],
            'series a': ['seed', 'series a', 'series+'],
            'series+': ['series a', 'series+']
        }

        # Check if founder's stage is proximate to investor's stages
        if founder_stage.lower() in stage_proximity:
            proximate_stages = stage_proximity[founder_stage.lower()]
            if any(stage in proximate_stages for stage in investor_stages_list):
                return 0.7

        return 0.0

    results_df['Stage_Score'] = results_df['Stage'].apply(calculate_stage_score)

    # 3. Check funding match (25% weight)
    def calculate_funding_score(row):
        if pd.isna(row['Min_Investment']) or pd.isna(row['Max_Investment']) or founder_funding_required == 0:
            return 0.0

        # Check if funding required is within investor's range
        if row['Min_Investment'] <= founder_funding_required <= row['Max_Investment']:
            return 1.0
        elif founder_funding_required < row['Min_Investment'] and founder_funding_required > row['Min_Investment'] * 0.8:
            return 0.6  # Just below minimum
        elif founder_funding_required > row['Max_Investment'] and founder_funding_required < row['Max_Investment'] * 1.2:
            return 0.6  # Just above maximum
        else:
            return 0.0

    # Apply the funding score calculation if we have the Min_Investment and Max_Investment columns
    if 'Min_Investment' in investor_df.columns and 'Max_Investment' in investor_df.columns:
        funding_scores = []
        for _, row in results_df.iterrows():
            idx = investor_df[investor_df['Name'] == row['Name']].index[0]
            investor_row = investor_df.iloc[idx]
            funding_scores.append(calculate_funding_score(investor_row))
        results_df['Funding_Score'] = funding_scores
    else:
        results_df['Funding_Score'] = 0.0

    # 4. Check country match (10% weight)
    def calculate_country_score(investor_row):
        idx = investor_df[investor_df['Name'] == investor_row['Name']].index[0]
        countries_list = investor_df.iloc[idx]['Countries_List']

        if len(countries_list) == 0 or founder_country == '':
            return 0.0

        if founder_country in countries_list:
            return 1.0
        else:
            return 0.0

    results_df['Country_Score'] = results_df.apply(calculate_country_score, axis=1)

    # 5. Content similarity using TF-IDF (15% weight)
    if founder_description:
        # Create corpus with investor overviews and founder description
        overviews = results_df['Processed_Overview'].tolist()
        corpus = overviews + [founder_description]

        # Create TF-IDF matrix
        vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = vectorizer.fit_transform(corpus)

        # Calculate cosine similarity
        founder_vector = tfidf_matrix[-1]  # Last document is founder description
        investor_vectors = tfidf_matrix[:-1]  # All other documents are investor overviews

        similarity_scores = cosine_similarity(founder_vector, investor_vectors).flatten()
        results_df['Content_Similarity'] = similarity_scores
    else:
        results_df['Content_Similarity'] = 0.0

    # Calculate final weighted score
    results_df['Match_Score'] = (
        results_df['Industry_Score'] * 0.3 +
        results_df['Stage_Score'] * 0.2 +
        results_df['Funding_Score'] * 0.25 +
        results_df['Country_Score'] * 0.1 +
        results_df['Content_Similarity'] * 0.15
    )

    # Round match score to 2 decimal places
    results_df['Match_Score'] = results_df['Match_Score'].round(2)

    # Sort by match score in descending order
    results_df = results_df.sort_values('Match_Score', ascending=False)

    # Keep only relevant columns for output
    output_df = results_df[['Name', 'Website', 'Type', 'Global_HQ', 'Industry', 'Stage', 'Cheque_range', 'Match_Score']]

    return output_df

# using gemini api

In [27]:
import google.generativeai as genai
import os

# Set up the API key
os.environ['GOOGLE_API_KEY'] = 'your-gemini-api-key'
genai.configure(api_key=os.environ['GOOGLE_API_KEY'])

In [29]:
def get_gemini_enhanced_matching(founder_info, top_investors_df, model_name="gemini-1.5-flash"):
    """
    Use Gemini API to enhance match scores with deeper analysis

    Parameters:
    -----------
    founder_info : dict
        Dictionary containing founder information
    top_investors_df : pandas.DataFrame
        DataFrame containing top matching investors based on initial scoring
    model_name : str
        Name of the Gemini model to use

    Returns:
    --------
    pandas.DataFrame
        DataFrame with investors and their enhanced match scores
    """
    model = genai.GenerativeModel(model_name)

    # Create a copy of the dataframe to add enhanced scores
    enhanced_df = top_investors_df.copy()
    enhanced_df['Enhanced_Score'] = 0.0
    enhanced_df['Match_Reasoning'] = ''

    # Prepare founder information for prompt
    founder_prompt = f"""
    Startup Information:
    - Industry: {founder_info.get('industry', 'N/A')}
    - Stage: {founder_info.get('stage', 'N/A')}
    - Funding Required: ${founder_info.get('funding_required', 0):,}
    - Country: {founder_info.get('country', 'N/A')}
    - Description: {founder_info.get('description', 'N/A')}
    """

    # Process each investor
    for idx, investor in enhanced_df.iterrows():
        # Fix: Use top_investors_df instead of investor_df
        investor_prompt = f"""
        Investor Information:
        - Name: {investor['Name']}
        - Type: {investor['Type']}
        - Global HQ: {investor['Global_HQ']}
        - Countries: {investor['Countries'] if 'Countries' in investor else 'N/A'}
        - Industry Focus: {investor['Industry']}
        - Stage Preference: {investor['Stage']}
        - Cheque Range: {investor['Cheque_range']}
        - Overview: {investor['Processed_Overview'] if 'Processed_Overview' in investor else 'N/A'}
        """

        analysis_prompt = f"""
        As an AI investment matching expert, analyze the compatibility between the startup and investor based on the information provided.

        {founder_prompt}

        {investor_prompt}

        Provide:
        1. A match score between 0.0 and 1.0 (where 1.0 is perfect match)
        2. A brief explanation of why they match or don't match

        Return your response in this JSON format:
        {{
            "score": 0.XX,
            "reasoning": "Your brief explanation here"
        }}
        """

        try:
            response = model.generate_content(analysis_prompt)
            response_text = response.text

            # Extract JSON from response
            import json
            import re

            # Find JSON object in response
            json_match = re.search(r'({.*})', response_text, re.DOTALL)
            if json_match:
                json_str = json_match.group(1)
                result = json.loads(json_str)

                enhanced_df.at[idx, 'Enhanced_Score'] = float(result['score'])
                enhanced_df.at[idx, 'Match_Reasoning'] = result['reasoning']
            else:
                # Fallback to initial score if JSON parsing fails
                enhanced_df.at[idx, 'Enhanced_Score'] = investor['Match_Score']
                enhanced_df.at[idx, 'Match_Reasoning'] = "Could not generate reasoning"

        except Exception as e:
            print(f"Error processing {investor['Name']}: {str(e)}")
            enhanced_df.at[idx, 'Enhanced_Score'] = investor['Match_Score']
            enhanced_df.at[idx, 'Match_Reasoning'] = "Error in processing"

    # Calculate final score (average of initial and enhanced)
    enhanced_df['Final_Match_Score'] = ((enhanced_df['Match_Score'] + enhanced_df['Enhanced_Score']) / 2).round(2)

    # Sort by final match score
    enhanced_df = enhanced_df.sort_values('Final_Match_Score', ascending=False)

    return enhanced_df[['Name', 'Website', 'Type', 'Global_HQ', 'Industry', 'Stage', 'Cheque_range', 'Final_Match_Score', 'Match_Reasoning']]

# matching founders to the investors

In [30]:
def match_founder_with_investors(founder_info, investors_df, use_gemini=True, top_n=20):
    """
    Main function to match a founder with suitable investors

    Parameters:
    -----------
    founder_info : dict
        Dictionary containing founder information
    investors_df : pandas.DataFrame
        DataFrame containing investor information
    use_gemini : bool
        Whether to use Gemini API for enhanced matching
    top_n : int
        Number of top matches to enhance with Gemini API

    Returns:
    --------
    pandas.DataFrame
        DataFrame with ranked investors and match scores
    """
    # Step 1: Calculate initial match scores
    initial_matches = calculate_match_score(founder_info, investors_df)

    # Step 2: If using Gemini, enhance top N matches
    if use_gemini:
        top_matches = initial_matches.head(top_n)
        enhanced_matches = get_gemini_enhanced_matching(founder_info, top_matches)

        # Combine enhanced matches with remaining matches
        remaining_matches = initial_matches.iloc[top_n:].copy()
        remaining_matches['Final_Match_Score'] = remaining_matches['Match_Score']
        remaining_matches['Match_Reasoning'] = "Basic algorithm match"

        combined_matches = pd.concat([
            enhanced_matches,
            remaining_matches[enhanced_matches.columns]
        ])

        return combined_matches.sort_values('Final_Match_Score', ascending=False)

    # If not using Gemini, return initial matches
    initial_matches['Final_Match_Score'] = initial_matches['Match_Score']
    initial_matches['Match_Reasoning'] = "Basic algorithm match"

    return initial_matches[['Name', 'Website', 'Type', 'Global_HQ', 'Industry', 'Stage', 'Cheque_range', 'Final_Match_Score', 'Match_Reasoning']]

# checking example

In [31]:
# Example founder information
founder_info = {
    'industry': 'Fintech',
    'stage': 'Seed',
    'funding_required': 500000000,  # $500K
    'country': 'UK',
    'description': 'We are building a B2B payment solution for small businesses that integrates with accounting software and provides automated reconciliation and invoice processing. Our platform uses AI to predict cash flow and offer financing options.'
}

# Get matches
match_results = match_founder_with_investors(founder_info, investors_df, use_gemini=True, top_n=10)

# Display top 10 matches
print(match_results.head(10))

                            Name                               Website  \
457             Genting Ventures  https://www.gentingventures.genting/   
429                 FOV Ventures             https://www.fov.ventures/   
410               Flash Ventures                  https://www.fl4sh.vc   
336          Early Capital Group     https://www.earlycapitalgroup.com   
139                BDev Ventures             https://bdevventures.com/   
762                  Norrsken VC              https://www.norrsken.vc/   
766          Nova Growth Capital      https://novagrowthcapital.co.uk/   
836                     Planet A                 https://planet-a.com/   
1732                       Hatch                   https://haatch.com/   
593   Kapita Investment Group AB                https://kapitagroup.se   

               Type       Global_HQ  \
457    Corporate VC       Singapore   
429              VC         Finland   
410              VC         Germany   
336   Family Office       Sin

# another exmaple

In [32]:
founder_info = {
    'industry': 'HealthTech',
    'stage': 'Series A',
    'funding_required': 2000000,  # $2M
    'country': 'Germany',
    'description': 'We are developing an AI-powered telemedicine platform that connects patients with doctors in real-time. Our solution integrates with wearable devices to provide continuous health monitoring and predictive analytics for chronic disease management.'
}
# Get matches
match_results = match_founder_with_investors(founder_info, investors_df, use_gemini=True, top_n=10)

# Display top 10 matches
print(match_results.head(10))



Error processing Ventech: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).




Error processing Innovation Industries: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).




Error processing Inovo Venture Partners: 429 POST https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent?%24alt=json%3Benum-encoding%3Dint: Resource has been exhausted (e.g. check quota).
                            Name                                Website  \
67    Alpha Intelligence Capital                  https://aicapital.ai/   
408        Five Seasons Ventures            https://www.fiveseasons.vc/   
505         Hirschvogel Ventures      https://www.hirschvogel.ventures/   
41                Adelie Capital                     https://adelie.vc/   
170            Blue Wire Capital       https://www.bluewirecapital.com/   
1062           Swisscom Ventures         https://ventures.swisscom.com/   
430           Framework Ventures            https://framework.ventures/   
544        Innovation Industries  https://www.innovationindustries.com/   
1155                     Ventech             https://www.ventechvc.com/   
444        Fuse Venture Part

# evaluating

In [33]:
def evaluate_matching_system(test_cases, investors_df):
    """
    Evaluate the matching system using test cases with known good matches

    Parameters:
    -----------
    test_cases : list of dicts
        List of test cases, each containing founder_info and known_good_matches
    investors_df : pandas.DataFrame
        DataFrame containing investor information

    Returns:
    --------
    dict
        Evaluation metrics
    """
    metrics = {
        'precision_at_5': [],
        'precision_at_10': [],
        'recall_at_20': []
    }

    for case in test_cases:
        founder_info = case['founder_info']
        known_good_matches = case['known_good_matches']

        # Get match results
        match_results = match_founder_with_investors(founder_info, investors_df, use_gemini=False)

        # Calculate precision@5
        top_5 = match_results.head(5)['Name'].tolist()
        precision_5 = len(set(top_5) & set(known_good_matches)) / 5
        metrics['precision_at_5'].append(precision_5)

        # Calculate precision@10
        top_10 = match_results.head(10)['Name'].tolist()
        precision_10 = len(set(top_10) & set(known_good_matches)) / 10
        metrics['precision_at_10'].append(precision_10)

        # Calculate recall@20
        top_20 = match_results.head(20)['Name'].tolist()
        recall_20 = len(set(top_20) & set(known_good_matches)) / len(known_good_matches)
        metrics['recall_at_20'].append(recall_20)

    # Calculate average metrics
    for metric in metrics:
        metrics[metric] = sum(metrics[metric]) / len(metrics[metric])

    return metrics

# running for new data

In [34]:
def run_founder_investor_matching_system():
    """Run the complete founder-investor matching system"""
    # Load data
    investors_df = pd.read_json('/content/Combined file.xlsx - Sheet1.json')

    # Preprocess the data
    # Replace empty strings with NaN for better handling
    investors_df = investors_df.replace('', np.nan)

    # Clean up 'Type' column - standardize investor types
    def standardize_investor_type(investor_type):
        if pd.isna(investor_type):
            return np.nan

        # Remove newlines and strip whitespace
        cleaned = re.sub(r'\n+', '', investor_type).strip()

        # Standardize common variations
        if 'Solo angel' in cleaned or 'Solo Angel' in cleaned or 'solo Angel' in cleaned:
            return 'Solo Angel'
        elif 'Angel network' in cleaned or 'Angel Network' in cleaned:
            return 'Angel Network'
        elif 'Family office' in cleaned or 'Family Office' in cleaned:
            return 'Family Office'
        elif 'Seed fund' in cleaned:
            return 'Seed Fund'
        elif 'Multi-stage VC' in cleaned:
            return 'Multi-stage VC'
        elif 'Startup studio' in cleaned or 'Startup Studio' in cleaned:
            return 'Startup Studio'
        elif 'Incubator, Accelerator' in cleaned:
            return 'Incubator/Accelerator'
        else:
            return cleaned

    investors_df['Type'] = investors_df['Type'].apply(standardize_investor_type)

    # Parse the cheque range into min and max values
    def parse_cheque_range(range_str):
        if pd.isna(range_str):
            return np.nan, np.nan

        # Extract numbers with K or M denomination
        matches = re.findall(r'\$(\d+(?:\.\d+)?)([KM])', range_str)

        if len(matches) >= 2:
            min_val = float(matches[0][0]) * (1000 if matches[0][1] == 'K' else 1000000)
            max_val = float(matches[1][0]) * (1000 if matches[1][1] == 'K' else 1000000)
            return min_val, max_val

        return np.nan, np.nan

    # Create new columns for min and max investment amounts
    investors_df['Min_Investment'], investors_df['Max_Investment'] = zip(*investors_df['Cheque_range'].apply(parse_cheque_range))

    # Handle Countries column - convert to list
    investors_df['Countries_List'] = investors_df['Countries'].apply(lambda x: x.split(',') if pd.notna(x) else [])

    # Handle missing Overview - create a simple description based on other fields
    def generate_overview(row):
        if pd.notna(row['Overview']):
            return row['Overview']

        overview = f"Investor {row['Name']} based in {row['Global_HQ']}. "
        if pd.notna(row['Industry']) and row['Industry'] != '':
            overview += f"Invests in {row['Industry']}. "
        if pd.notna(row['Stage']) and row['Stage'] != '':
            overview += f"Focuses on {row['Stage']} stage startups. "
        if pd.notna(row['Cheque_range']) and row['Cheque_range'] != '':
            overview += f"Typical investment range: {row['Cheque_range']}."

        return overview

    investors_df['Processed_Overview'] = investors_df.apply(generate_overview, axis=1)

    # Extract industries into a list
    investors_df['Industry_List'] = investors_df['Industry'].apply(
        lambda x: [industry.strip() for industry in x.split(',')] if pd.notna(x) else []
    )

    # Extract stages into a list
    investors_df['Stage_List'] = investors_df['Stage'].apply(
        lambda x: [stage.strip() for stage in x.split(',')] if pd.notna(x) else []
    )

    # Get founder input
    print("Please enter your startup information:")
    industry = input("Industry: ")
    stage = input("Stage (Idea, Pre-seed, Prototype/MVP, Seed, Series A, Series+): ")

    funding_required_str = input("Funding required (e.g., 500K, 2M): ")
    funding_required = 0
    # Convert to lowercase for easier parsing
    funding_str_lower = funding_required_str.lower()
    if 'k' in funding_str_lower:
        funding_required = float(funding_str_lower.replace('k', '')) * 1000
    elif 'm' in funding_str_lower:
        funding_required = float(funding_str_lower.replace('m', '')) * 1000000
    else:
        # Try to convert directly to a number
        try:
            funding_required = float(funding_required_str)
        except ValueError:
            print("Warning: Could not parse funding amount. Using 0 as default.")
    country = input("Country: ")
    description = input("Brief description of your startup: ")

    founder_info = {
        'industry': industry,
        'stage': stage,
        'funding_required': funding_required,
        'country': country,
        'description': description
    }

    # Get matches
    use_gemini = input("Use Gemini API for enhanced matching? (y/n): ").lower() == 'y'

    match_results = match_founder_with_investors(
        founder_info,
        investors_df,
        use_gemini=use_gemini
    )

    # Display results
    print("\nTop Matching Investors:")
    for i, (_, investor) in enumerate(match_results.head(10).iterrows()):
        print(f"\n{i+1}. {investor['Name']} (Match Score: {investor['Final_Match_Score']})")
        print(f"   Website: {investor['Website']}")
        print(f"   Type: {investor['Type']}")
        print(f"   HQ: {investor['Global_HQ']}")
        print(f"   Industry Focus: {investor['Industry']}")
        print(f"   Stage: {investor['Stage']}")
        print(f"   Investment Range: {investor['Cheque_range']}")
        print(f"   Reasoning: {investor['Match_Reasoning']}")

    # Save results to CSV
    save_results = input("\nSave results to CSV? (y/n): ").lower() == 'y'
    if save_results:
        filename = input("Enter filename (default: investor_matches.csv): ") or "investor_matches.csv"
        match_results.head(10).to_csv(filename, index=False)
        print(f"Results saved to {filename}")

In [35]:
run_founder_investor_matching_system()

Please enter your startup information:
Industry: medicine
Stage (Idea, Pre-seed, Prototype/MVP, Seed, Series A, Series+): idea
Funding required (e.g., 500K, 2M): 2m
Country: pak
Brief description of your startup: its a startup to help ppl with cancer
Use Gemini API for enhanced matching? (y/n): y

Top Matching Investors:

1. Go Capital (Match Score: 0.78)
   Website: https://www.gocapital.fr/en/
   Type: VC
   HQ: France
   Industry Focus: Sector Agnostic
   Stage: Pre-seed, Idea, Prototype/MVP, Seed, Series+
   Investment Range: $300K - $5M
   Reasoning: Go Capital is a VC firm with a sector-agnostic approach and invests in stages ranging from pre-seed to Series+,  perfectly encompassing the startup's idea stage.  Their cheque range of $300K-$5M comfortably covers the startup's $2M funding need.  The geographical location of the startup in Pakistan may present some logistical challenges, but it shouldn't be a complete deterrent given Go Capital's lack of specified regional limitations