<a href="https://colab.research.google.com/github/devikaajay/Startup_Success_Prediction/blob/main/startup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.impute import SimpleImputer
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

In [95]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

# Page passed to scrape_wikipedia_table; only the first "wikitable" on it is parsed.
WIKI_URL = 'https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue'

def clean_value(text):
    """Convert the text of a numeric table cell into a float.

    Surrounding whitespace, bracketed footnote markers such as ``[1]``, dollar
    signs, commas and embedded whitespace are removed before parsing. A value
    containing both ``(`` and ``)`` is treated as negative (accounting
    notation). Typographic minus signs (U+2212 and the en dash U+2013) are
    normalised to the ASCII hyphen, because ``float`` only understands ``-``.

    Args:
        text: Raw cell text; may be ``None`` or empty.

    Returns:
        The parsed number, or ``0.0`` when ``text`` is empty or is not a number
        after cleaning.
    """
    if not text:
        return 0.0

    # Drop footnote markers like "[1]" or "[note 2]" before looking at the number.
    text = re.sub(r'\[.*?\]', '', text.strip())

    # float() rejects U+2212 / U+2013, so without this step a negative figure
    # typeset with either character would silently be reported as 0.0.
    text = text.replace('\u2212', '-').replace('\u2013', '-')

    is_negative = False
    if '(' in text and ')' in text:
        is_negative = True
        text = text.replace('(', '').replace(')', '')

    # Remove currency symbol, thousands separators and any remaining whitespace.
    cleaned_text = re.sub(r'[$,\s]', '', text)

    try:
        value = float(cleaned_text)
    except ValueError:
        # Non-numeric cells (e.g. "n/a") are reported as zero rather than raising.
        return 0.0
    return -value if is_negative else value

def scrape_wikipedia_table(url):
    """Download ``url`` and parse its first ``wikitable`` into a DataFrame.

    The first table row is treated as the header and skipped. Every later row
    with at least five cells is read positionally: rank, company name, an
    optional industry cell, revenue, profit, employee count and headquarters.
    The industry cell is recognised by containing a link; when a row has no
    such cell, the industry of the previous row is reused. The same
    carry-forward applies when the headquarters cell is missing or empty.

    Args:
        url: Address of the page to fetch.

    Returns:
        A DataFrame with the columns ``Rank``, ``Company Name``, ``Industry``,
        ``Revenue (M USD)``, ``Profit (M USD)``, ``Employee Count`` and
        ``Headquarters``. An empty DataFrame is returned when the request
        fails or the page contains no ``wikitable``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching URL: {e}")
        return pd.DataFrame()

    soup = BeautifulSoup(response.content, 'html.parser')

    tables = soup.find_all('table', {'class': 'wikitable'})
    if not tables:
        print("Could not find the target table on the page.")
        return pd.DataFrame()
    table = tables[0]

    data = []
    skipped_rows = 0

    # Values carried forward to rows whose industry / headquarters cell is absent.
    last_industry = ""
    last_headquarters = ""

    # Position of the headquarters cell once the industry cell has been removed.
    headquarters_index = 5

    for row in table.find_all('tr')[1:]:
        # Work on a copy so the industry cell can be popped without side effects.
        row_cells = list(row.find_all(['td', 'th']))

        if len(row_cells) < 5:
            continue

        try:
            rank = int(clean_value(row_cells[0].text))
            name = row_cells[1].text.strip()

            # A link in the third cell marks it as the industry cell; removing it
            # lets the numeric columns below use the same indices either way.
            if row_cells[2].find('a'):
                last_industry = row_cells.pop(2).text.strip()

            revenue_usd_m = clean_value(row_cells[2].text)
            profit_usd_m = clean_value(row_cells[3].text)
            employee_count = int(clean_value(row_cells[4].text))

            if len(row_cells) > headquarters_index:
                headquarters_text = row_cells[headquarters_index].text
                sanitized_headquarters = re.sub(r'\[.*?\]', '', headquarters_text).strip()
                if sanitized_headquarters:
                    # Headquarters cell is present and has a new value
                    last_headquarters = sanitized_headquarters
            # Otherwise the cell is missing or blank, so the last seen value is kept.

            data.append({
                'Rank': rank,
                'Company Name': name,
                'Industry': last_industry,
                'Revenue (M USD)': revenue_usd_m,
                'Profit (M USD)': profit_usd_m,
                'Employee Count': employee_count,
                'Headquarters': last_headquarters
            })

        except (IndexError, ValueError, OverflowError):
            # IndexError: fewer cells than expected after removing the industry cell.
            # ValueError / OverflowError: int() applied to a NaN or infinite value.
            skipped_rows += 1
            continue

    if skipped_rows:
        # Surface dropped rows instead of discarding them silently.
        print(f"Skipped {skipped_rows} row(s) that could not be parsed.")

    return pd.DataFrame(data)

# Revenue threshold (same unit as the 'Revenue (M USD)' column) above which a
# company is flagged with Success_Metric = 1.
SUCCESS_REVENUE_THRESHOLD_M_USD = 400000

# Scrape the data
df_companies = scrape_wikipedia_table(WIKI_URL)

if not df_companies.empty:
    # Adding a simple success metric based on Revenue (just for display purposes)
    df_companies['Success_Metric'] = (
        df_companies['Revenue (M USD)'] > SUCCESS_REVENUE_THRESHOLD_M_USD
    ).astype('int64')

    # Write the file once, after the extra column exists, so the CSV on disk
    # always matches the DataFrame shown in the notebook.
    df_companies.to_csv('companies.csv', index=False)
    print("Scraping complete. Data saved to companies.csv")
else:
    print("No data was scraped.")


Scraping complete. Data saved to companies.csv


In [96]:
  # Preview the scraped frame; head() is the bare last expression so the notebook renders it as a table.
  # NOTE(review): this cell is indented; that appears to be tolerated by IPython but would fail in a plain .py export — confirm before converting.
  print("\nFirst 5 rows of the scraped data (for verification):")
  df_companies.head()


First 5 rows of the scraped data (for verification):


Unnamed: 0,Rank,Company Name,Industry,Revenue (M USD),Profit (M USD),Employee Count,Headquarters,Success_Metric
0,1,Walmart,Retail,680985.0,19436.0,2100000,United States,1
1,2,Amazon,Retail information technology,637959.0,59248.0,1556000,United States,1
2,3,State Grid Corporation of China,Electricity,545948.0,9204.0,1361423,China,1
3,4,Saudi Aramco,Oil and gas,480446.0,106246.0,73311,Saudi Arabia,1
4,5,China Petrochemical Corporation,Oil and gas,429700.0,9393.0,513434,China,1


In [97]:
# Summarise column dtypes and non-null counts of the scraped frame.
df_companies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Rank             49 non-null     int64  
 1   Company Name     49 non-null     object 
 2   Industry         49 non-null     object 
 3   Revenue (M USD)  49 non-null     float64
 4   Profit (M USD)   49 non-null     float64
 5   Employee Count   49 non-null     int64  
 6   Headquarters     49 non-null     object 
 7   Success_Metric   49 non-null     int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 3.2+ KB


In [99]:
# (rows, columns) of the scraped frame, including the added Success_Metric column.
df_companies.shape

(49, 8)