In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Load the raw CSV into a DataFrame and preview its first rows
raw_csv_path = '../data_raw/top_100_saas_companies_2025.csv'
df = pd.read_csv(raw_csv_path)

df.head()

Unnamed: 0,Company Name,Founded Year,HQ,Industry,Total Funding,ARR,Valuation,Employees,Top Investors,Product,G2 Rating
0,Microsoft,1975,"Redmond, WA, USA",Enterprise Software,$1B,$270B,$3T,221000,"Bill Gates, Paul Allen","Azure, Office 365, Teams",4.4
1,Salesforce,1999,"San Francisco, CA, USA",CRM,$65.4M,$37.9B,$227.8B,75000,"Halsey Minor, Larry Ellison","Sales Cloud, Service Cloud",4.3
2,Adobe,1982,"San Jose, CA, USA",Creative Software,$2.5M,$19.4B,$240B,29945,Hambrecht & Quist,"Creative Cloud, Document Cloud",4.5
3,Oracle,1977,"Austin, TX, USA",Database & Enterprise,$2K,$52.9B,$350B,143000,"Larry Ellison, Bob Miner","Oracle Cloud, NetSuite",4.0
4,SAP,1972,"Walldorf, Germany",Enterprise Software,,$32.5B,$215B,107415,"Dietmar Hopp, Klaus Tschira","S/4HANA, SuccessFactors",4.1


In [3]:
# Inspect the dtype pandas inferred for each column, to spot columns that
# were read as object (string) and still need to be parsed into numbers.
df.dtypes

Company Name      object
Founded Year       int64
HQ                object
Industry          object
Total Funding     object
ARR               object
Valuation         object
Employees         object
Top Investors     object
Product           object
G2 Rating        float64
dtype: object

In [4]:
# Parse all columns that contain currency values
import numpy as np

def parse_currency(value):
    """Convert a currency string such as '$65.4M' into a float.

    A leading '$', thousands separators (',') and surrounding whitespace
    are removed. A trailing K, M, B or T (case-insensitive) scales the
    number by 1e3, 1e6, 1e9 or 1e12 respectively.

    Parameters
    ----------
    value : str, number, or missing value
        The raw cell value. Non-string input is passed through ``float()``
        so that already-parsed numbers survive a re-run of the cell.

    Returns
    -------
    float
        The parsed amount, or ``np.nan`` when the value is missing, empty,
        or cannot be interpreted as a number.
    """
    multipliers = {'K': 1e3, 'M': 1e6, 'B': 1e9, 'T': 1e12}

    if not isinstance(value, str):
        # Already numeric (e.g. the column was parsed before) or missing
        # (None / NaN / pd.NA): float() either succeeds or raises.
        try:
            return float(value)
        except (TypeError, ValueError):
            return np.nan

    text = value.replace('$', '').replace(',', '').strip()
    if not text:
        # Nothing left after stripping, e.g. '' or '$'; indexing text[-1]
        # would raise IndexError.
        return np.nan

    factor = multipliers.get(text[-1].upper())
    number_part = text if factor is None else text[:-1]

    # One try block covers both the suffixed and the plain form, so a
    # malformed number such as '$M' yields NaN instead of raising.
    try:
        number = float(number_part)
    except ValueError:
        return np.nan

    return number if factor is None else number * factor

# Convert each currency column from text to a float amount
for money_col in ('Total Funding', 'ARR', 'Valuation'):
    df[money_col] = df[money_col].apply(parse_currency)

# Employees: remove thousands separators, then cast to float
employees_text = df['Employees'].str.replace(',', '')
df['Employees'] = employees_text.astype(float)

# Treat infinities as missing, then drop every row with any missing value
non_finite = [np.inf, -np.inf]
df.replace(non_finite, np.nan, inplace=True)
df.dropna(inplace=True)

In [6]:
# Report the (rows, columns) of the cleaned frame, then preview ten rows
cleaned_shape = df.shape
print("Data Shape: ", cleaned_shape)
df.head(n=10)

Data Shape:  (87, 11)


Unnamed: 0,Company Name,Founded Year,HQ,Industry,Total Funding,ARR,Valuation,Employees,Top Investors,Product,G2 Rating
0,Microsoft,1975,"Redmond, WA, USA",Enterprise Software,1000000000.0,270000000000.0,3000000000000.0,221000.0,"Bill Gates, Paul Allen","Azure, Office 365, Teams",4.4
1,Salesforce,1999,"San Francisco, CA, USA",CRM,65400000.0,37900000000.0,227800000000.0,75000.0,"Halsey Minor, Larry Ellison","Sales Cloud, Service Cloud",4.3
2,Adobe,1982,"San Jose, CA, USA",Creative Software,2500000.0,19400000000.0,240000000000.0,29945.0,Hambrecht & Quist,"Creative Cloud, Document Cloud",4.5
3,Oracle,1977,"Austin, TX, USA",Database & Enterprise,2000.0,52900000000.0,350000000000.0,143000.0,"Larry Ellison, Bob Miner","Oracle Cloud, NetSuite",4.0
5,Intuit,1983,"Mountain View, CA, USA",Financial Software,273000000.0,14400000000.0,180000000000.0,18200.0,"Sierra Ventures, Kleiner Perkins","QuickBooks, TurboTax",4.4
6,ServiceNow,2004,"Santa Clara, CA, USA",IT Service Management,82500000.0,8900000000.0,147000000000.0,20000.0,"JMI Equity, Sequoia Capital",IT Service Management Platform,4.4
7,Workday,2005,"Pleasanton, CA, USA",HR & Finance,249900000.0,7300000000.0,65000000000.0,18800.0,"Greylock Partners, NEA","HCM, Financial Management",4.2
8,Zoom,2011,"San Jose, CA, USA",Video Communications,145500000.0,4500000000.0,85000000000.0,7388.0,"Sequoia Capital, Emergence",Video Conferencing Platform,4.5
9,Shopify,2006,"Ottawa, Canada",E-commerce,122300000.0,7100000000.0,95000000000.0,11600.0,"Bessemer, FirstMark, Felicis",E-commerce Platform,4.4
10,Atlassian,2002,"Sydney, Australia",Collaboration Software,60000000.0,3500000000.0,55000000000.0,11800.0,Accel Partners,"Jira, Confluence, Trello",4.3


In [8]:
# Save the cleaned data to the data_cleaned folder.
# to_csv does not create missing directories, so make sure the target
# folder exists first; otherwise this cell fails when ../data_cleaned
# has not been created yet.
cleaned_csv_path = Path('../data_cleaned/top_100_saas_companies_2025_cleaned.csv')
cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the DataFrame index out of the written file
df.to_csv(cleaned_csv_path, index=False)