# FranServe Data

_Production Code: ./src/data/franserve/\*.py_


In [1]:
import os
import requests
import dotenv

from bs4 import BeautifulSoup

We need to log into an account to have access to the broker's data.


In [2]:
dotenv.load_dotenv()

username = os.getenv("FRANSERVE_EMAIL")
password = os.getenv("FRANSERVE_PASSWORD")

LOGIN_URL = "https://franservesupport.com/Default.asp"
PROTECTED_URL = "https://franservesupport.com/directory.asp?ClientID="

session = requests.Session()

In [3]:

# 1) Fetch the login page to grab any hidden form tokens (CSRF, etc.)
resp = session.get(LOGIN_URL)
soup = BeautifulSoup(resp.text, "html.parser")

# 2) Post your credentials to log in
LOGIN_ACTION = "https://franservesupport.com/process_login.asp"


payload = {
    "email": username,
    "password": password,
    "Submit": "Login",
}
login_resp = session.post(LOGIN_ACTION, data=payload)
login_resp.raise_for_status()  # ensure login succeeded

print("After login, URL is:", login_resp.url)
print("Redirect history:", [r.status_code for r in login_resp.history])

# 3) Now session holds the auth cookies—fetch protected content
protected = session.get(PROTECTED_URL)
protected.raise_for_status()
print("Protected page title:", BeautifulSoup(protected.text, "html.parser").title.string)

# Expected output:
#
# After login, URL is: https://franservesupport.com/main.asp?login=1
# Redirect history: [302]
# Protected page title: FranServe Franchise Portfolio

After login, URL is: https://franservesupport.com/main.asp?login=1
Redirect history: [302]
Protected page title: FranServe Franchise Portfolio


The catalogues where we scrap the franchises' data:


In [4]:
# base_url: https://franservesupport.com/directory.asp?ClientID=
# next_url: https://franservesupport.com/directory.asp?ClientID=&offset=50
# next_url: https://franservesupport.com/directory.asp?ClientID=&offset=100
# ...
# last_url: https://franservesupport.com/directory.asp?ClientID=&offset=800

In [5]:
BASE_URL = "https://franservesupport.com/"
CATALOGUE_BASE_URL = BASE_URL + "directory.asp?ClientID="


def get_franchise_url(session: requests.Session, base_url: str, catalogue_url: str) -> str:
    resp = session.get(catalogue_url)
    soup = BeautifulSoup(resp.text, "html.parser")

    matching_links = [
        base_url + a['href']
        for a in soup.find_all("a", href=True)
        if a['href'].startswith("franchisedetails")
    ]
     
    # We ignore the first link as it belongs to the ad at the top of the page
    return matching_links[1:]


def get_franchise_url_list(session: requests.Session, base_url: str, catalogue_base_url: str, offset_max: int = 800, offset_step: int = 50) -> list[str]:
    """
    Get a list of URLs to scrape.

    Args:
        session (requests.Session): The session object to use.
        base_url (str): The base URL to scrape.
        catalogue_base_url (str): The base URL to scrape.
        offset_max (int): The maximum offset to scrape.
        offset_step (int): The step size for the offset.

    Returns:
        list[str]: A list of URLs to scrape.
    """
    catalogue_urls = [f"{catalogue_base_url}&offset={i}" for i in range(0, offset_max, offset_step)]
    franchise_urls = []
    for catalogue_url in catalogue_urls:
        franchise_urls.extend(get_franchise_url(session, base_url, catalogue_url))
        break
    return franchise_urls


def get_franchise_data(session: requests.Session, url: str) -> dict:
    resp = session.get(url)
    soup = BeautifulSoup(resp.text, "html.parser")

    td = soup.find_all("td", attrs={"colspan": "2"})[1]

    return td

In [6]:
res = get_franchise_data(session, "https://franservesupport.com/franchisedetails.asp?FranID=4017&ClientID=")

In [7]:
res

<td colspan="2"><div border="0" class="MainFont" style="margin: 30px 15px 15px 15px; box-sizing: border-box;" width="100%">
<div style="width: 32%; float: left;">
<b><font size="+1">Balloon Kings</font></b>  
                  <img class="icon" id="Favorite" onclick="toggleIcon('Favorite', 'images/star-outline.png', 'images/star-filled.png');" src="images/star-outline.png" title="Add to Favorites"/>
<img class="icon" id="Liked" onclick="toggleIcon('Liked', 'images/heart-outline.png', 'images/heart-filled.png');" src="images/heart-outline.png" title="Add to Liked"/>
</div>
<div style="width: 32%; float: left; margin-left: 10px;">
<input class="btn_grey" id="error_btn" onclick="showErrorForm();" type="button" value="Report a Page Correction"/>
<div id="error_form" style="display: none;">
<form>
<textarea id="message" name="" placeholder="Type the error. Please be specific. For example, wrong Franchise Fee in the Additional Details section? Wrong phone number or email? Wrong contact perso

In [12]:
def slugify(text: str) -> str:
    """Converts a string into a URL-friendly slug."""
    text = text.lower().strip()
    text = re.sub(r"[\s/()]+", "_", text)
    text = re.sub(r"[^a-z0-9_]", "", text)
    text = text.strip("_")
    return text

text = "aaa-aaa"
slugify(text)

'aaaaaa'

In [8]:
import json
import re

def parse_franchise_html(soup):
    """
    Parses the HTML content of a FranServe franchise page and extracts the data.

    Args:
        soup: The HTML content of the franchise page as a string.

    Returns:
        A dictionary containing the extracted franchise and contact information.
    """

    def get_text_or_none(element):
        """Returns the text of an element or None if the element doesn't exist."""
        return element.get_text(strip=True) if element else None

    def clean_financial_value(text):
        """Removes currency symbols and commas and converts to integer."""
        if not text:
            return None
        return int(re.sub(r'[$,]', '', text.split()[0]))

    # --- Franchise Information ---
    franchise_data = {}

    franchise_name_tag = soup.find('b').find('font', size='+1')
    franchise_data['franchise_name'] = get_text_or_none(franchise_name_tag)

    fran_id_tag = soup.find('input', {'name': 'ZorID'})
    franchise_data['source_id'] = int(fran_id_tag['value']) if fran_id_tag else None
    
    franchise_data['source_url'] = f"https://www.franserve.com/franchise.asp?id={franchise_data['source_id']}" if franchise_data['source_id'] else None

    category_links = soup.select('div.col-left > div > b:contains("Category:") ~ a')
    franchise_data['primary_category'] = get_text_or_none(category_links[0]) if category_links else None

    sub_category_links = soup.select('div.col-left > div > b:contains("Subcategory:") ~ a')
    franchise_data['sub_categories'] = json.dumps([get_text_or_none(link) for link in sub_category_links]) if sub_category_links else None

    left_col_text = soup.select_one('div.col-left > div').get_text()
    
    corporate_address_match = re.search(r'Corporate Office:\s*(.*?)(?=\s*Contact:)', left_col_text, re.DOTALL)
    franchise_data['corporate_address'] = corporate_address_match.group(1).strip() if corporate_address_match else None

    website_tag = soup.find('a', href=re.compile(r'www\..*'))
    franchise_data['website_url'] = website_tag['href'] if website_tag else None
    
    right_col_top_div = soup.select_one('div.col-left > div:nth-of-type(2)')
    if right_col_top_div:
        right_col_top_text = right_col_top_div.get_text()
        franchise_data['franchise_fee_usd'] = clean_financial_value(re.search(r'Franchise Fee:\s*\$([\d,]+)', right_col_top_text).group(1) if re.search(r'Franchise Fee:\s*\$([\d,]+)', right_col_top_text) else None)
        franchise_data['required_cash_investment_usd'] = clean_financial_value(re.search(r'Cash Investment:\s*\$([\d,]+)', right_col_top_text).group(1) if re.search(r'Cash Investment:\s*\$([\d,]+)', right_col_top_text) else None)
        
        total_investment_match = re.search(r'Total Investment:\s*\$([\d,]+)\s*-\s*\$([\d,]+)', right_col_top_text)
        if total_investment_match:
            franchise_data['total_investment_min_usd'] = clean_financial_value(total_investment_match.group(1))
            franchise_data['total_investment_max_usd'] = clean_financial_value(total_investment_match.group(2))
        
        franchise_data['required_net_worth_usd'] = clean_financial_value(re.search(r'NetWorth:\s*\$([\d,]+)', right_col_top_text).group(1) if re.search(r'NetWorth:\s*\$([\d,]+)', right_col_top_text) else None)
        franchise_data['royalty_details_text'] = re.search(r'Royalties:\s*(.+)', right_col_top_text).group(1).strip() if re.search(r'Royalties:\s*(.+)', right_col_top_text) else None
        
        sba_approved_text = re.search(r'SBA approved:\s*(.*)', right_col_top_text)
        franchise_data['sba_approved'] = 'Yes' in sba_approved_text.group(1) if sba_approved_text and sba_approved_text.group(1).strip() else False
        
        vetfran_text = re.search(r'VetFran:\s*(.*)', right_col_top_text)
        franchise_data['vetfran_member'] = 'Yes' in vetfran_text.group(1) if vetfran_text else False
        
        master_franchise_text = re.search(r'Master Franchise / Area Developer Opportunity:\s*(.*)', right_col_top_text)
        franchise_data['master_franchise_opportunity'] = 'Yes' in master_franchise_text.group(1) if master_franchise_text else False
        
        founded_match = re.search(r'Founded:\s*(\d{4})', right_col_top_text)
        franchise_data['founded_year'] = int(founded_match.group(1)) if founded_match else None
        
        franchised_match = re.search(r'Franchised:\s*(\d{4})', right_col_top_text)
        franchise_data['franchised_year'] = int(franchised_match.group(1)) if franchised_match else None

    additional_details_heading = soup.find('h2', string='Additional Details')
    description_paragraphs = []
    if additional_details_heading:
        for sibling in additional_details_heading.find_next_siblings():
            if sibling.name == 'table':
                break
            if sibling.name == 'p':
                description_paragraphs.append(get_text_or_none(sibling))
    franchise_data['description_text'] = '\n'.join(description_paragraphs).strip()

    why_franchise_list = soup.select('p:contains("WHY") + ul li')
    franchise_data['why_franchise_summary'] = json.dumps([get_text_or_none(li) for li in why_franchise_list]) if why_franchise_list else None

    ideal_candidate_list = soup.select('p:contains("IDEAL FRANCHISEE") + ul li')
    franchise_data['ideal_candidate_profile_text'] = json.dumps([get_text_or_none(li) for li in ideal_candidate_list]) if ideal_candidate_list else None
    
    background_section_text = soup.select_one('td:contains("BACKGROUND")').get_text() if soup.select_one('td:contains("BACKGROUND")') else ''
    
    home_based_match = re.search(r'Home Based:\s*(Yes|No)', background_section_text)
    franchise_data['is_home_based'] = home_based_match.group(1) == 'Yes' if home_based_match else None
    
    semi_absentee_match = re.search(r'Semi-Absentee ownership available:\s*(Yes|No)', background_section_text)
    franchise_data['allows_semi_absentee'] = semi_absentee_match.group(1) == 'Yes' if semi_absentee_match else None
    
    absentee_match = re.search(r'Absentee ownership available:\s*(Yes|No)', background_section_text)
    franchise_data['allows_absentee'] = absentee_match.group(1) == 'Yes' if absentee_match else None
    
    e2_visa_match = re.search(r'E2 Visa Friendly:\s*(Yes|No)', background_section_text)
    franchise_data['e2_visa_friendly'] = e2_visa_match.group(1) == 'Yes' if e2_visa_match else None
    
    unavailable_states_match = re.search(r'NOT available:\s*(.+)', background_section_text)
    if unavailable_states_match:
        states_text = unavailable_states_match.group(1).strip()
        unavailable_states = [state.strip() for state in states_text.split(', ')]
        franchise_data['unavailable_states'] = json.dumps(unavailable_states)
    else:
        franchise_data['unavailable_states'] = None

    num_franchises_match = re.search(r'Number of franchises currently operating:\s*(\d+)', background_section_text)
    num_international_match = re.search(r'Number of International franchises currently operating:\s*(\d+)', background_section_text)
    num_corporate_match = re.search(r'Number of corporate owned franchises:\s*(\d+)', background_section_text)

    franchise_data['locations'] = json.dumps({
        'operating_franchises': int(num_franchises_match.group(1)) if num_franchises_match else 0,
        'international_franchises': int(num_international_match.group(1)) if num_international_match else 0,
        'corporate_owned': int(num_corporate_match.group(1)) if num_corporate_match else 0
    })

    last_updated_tag = soup.find('i', string=re.compile(r'Last updated:'))
    franchise_data['last_updated_from_source'] = get_text_or_none(last_updated_tag).replace('Last updated: ', '').strip() if last_updated_tag else None

    vetfran_discount_details_match = re.search(r'Veterans/Minorities/First Responders Discount\?\s*(Yes|No)', background_section_text)
    if vetfran_discount_details_match and vetfran_discount_details_match.group(1) == 'Yes':
        franchise_data['vetfran_discount_details'] = 'Discount available for Veterans/Minorities/First Responders.'
    else:
        franchise_data['vetfran_discount_details'] = None

    # --- Contacts Information ---
    contacts_data = []
    
    # Primary Contact
    primary_contact = {}
    contact_name_tag = soup.find('b', string='Contact:')
    if contact_name_tag:
        primary_contact['name'] = contact_name_tag.next_sibling.strip()
        primary_contact['phone'] = get_text_or_none(contact_name_tag.find_next('b', string='Phone:').next_sibling)
        email_tag = contact_name_tag.find_next('a', href=lambda href: href and 'mailto:' in href)
        primary_contact['email'] = get_text_or_none(email_tag)
        primary_contact['is_primary'] = True
        primary_contact['notes'] = None
        contacts_data.append(primary_contact)

    # Alternative Contact
    alt_contact = {}
    alt_contact_name_tag = soup.find('b', string='Alternative Contact:')
    if alt_contact_name_tag:
        alt_contact['name'] = alt_contact_name_tag.next_sibling.strip()
        phone_after_alt = alt_contact_name_tag.find_next('b', string='Phone:')
        alt_contact['phone'] = get_text_or_none(phone_after_alt.next_sibling)
        email_after_alt = alt_contact_name_tag.find_next('a', href=lambda href: href and 'mailto:' in href)
        alt_contact['email'] = get_text_or_none(email_after_alt)
        alt_contact['is_primary'] = False
        alt_contact['notes'] = 'Alternative Contact'
        contacts_data.append(alt_contact)
        
    return {
        'franchise_data': franchise_data,
        'contacts_data': contacts_data
    }

parse_franchise_html(res)



{'franchise_data': {'franchise_name': 'Balloon Kings',
  'source_id': 4017,
  'source_url': 'https://www.franserve.com/franchise.asp?id=4017',
  'primary_category': 'Events & Entertainment',
  'sub_categories': '["Event Services", "Party Supplies", "mariddiough@gmail.com", "Mariel@thefranchiseadvisor.com", "www.balloonkingsfranchise.com/", "", "", "", "calendly.com/king-gene"]',
  'corporate_address': '5512 Carmel Road  Apt. 103Charlotte, NC\xa028226',
  'website_url': 'http://www.balloonkingsfranchise.com/',
  'franchise_fee_usd': 45500,
  'required_cash_investment_usd': 75000,
  'total_investment_min_usd': 165000,
  'total_investment_max_usd': 165000,
  'required_net_worth_usd': 250000,
  'royalty_details_text': '9%Cash Investment: $75,000Total Investment: $165,000 - $165,000NetWorth: $250,000VetFran: No SBA approved: Master Franchise / Area Developer Opportunity: NoFounded: 2011Franchised: 2019',
  'sba_approved': False,
  'vetfran_member': False,
  'master_franchise_opportunity': F