In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter


In [2]:
# Wikipedia page listing the largest US companies by revenue
url = 'https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue'

# Send a GET request; the timeout keeps the cell from hanging on a stalled connection
page = requests.get(url, timeout=30)

# Stop here on an HTTP error status instead of parsing an error page further down
page.raise_for_status()

# Parse the HTML. The parser is named explicitly so the resulting tree does not
# depend on which optional parsers happen to be installed.
soup = BeautifulSoup(page.text, 'html.parser')

In [None]:
# Preview only the beginning of the parsed document; printing the whole page
# would flood the notebook output
print(str(soup)[:1000])

In [4]:
# Find the relevant table containing the data
table = soup.find('table', class_='wikitable sortable')

# soup.find returns None when nothing matches; fail here with a clear message
# rather than later with an AttributeError on None
if table is None:
    raise ValueError("No table with class 'wikitable sortable' was found on the page")

In [None]:
# Extract table headers (column names) from the first row only, so that any <th>
# cells appearing inside body rows are not mistaken for column names
header_row = table.find('tr')
world_titles = header_row.find_all('th')
world_table_titles = [title.text.strip() for title in world_titles]
print(world_table_titles)

In [6]:
# Create an empty DataFrame whose columns are the header texts collected in
# world_table_titles; it holds no rows yet
df = pd.DataFrame(columns = world_table_titles)

In [7]:
# Extract table rows and build the DataFrame in one step.
# Collecting the rows in a list first avoids growing the frame row by row and
# makes the cell idempotent: re-running it rebuilds `df` instead of appending
# the same rows a second time.
column_data = table.find_all('tr')

rows = []
for row in column_data[1:]:  # the first <tr> is skipped, as it holds the headers
    row_data = row.find_all('td')
    if not row_data:
        # Rows without any <td> cells carry no data to store
        continue
    rows.append([data.text.strip() for data in row_data])

df = pd.DataFrame(rows, columns=world_table_titles)

In [None]:
# Display the scraped DataFrame as the cell output
df

In [9]:
# Save the DataFrame to a CSV file (without the index column).
# A relative path is used so the notebook runs on any machine; the file is
# written to the notebook's current working directory.
df.to_csv('wikidata.csv', index=False)

In [None]:
# Inspect the scraped DataFrame: df.info() prints its summary, and
# df.describe() is shown as the cell output.
# Note: the CSV file is not read back here; `df` is still the in-memory frame.
df.info()
df.describe()

In [11]:
# Work on a real copy. Plain assignment (`df1 = df`) would only create a second
# name for the same object, so every conversion below would also rewrite the
# raw scraped frame and make this cell unsafe to re-run.
df1 = df.copy()

# Convert revenue to string, remove thousands separators, then cast to nullable Int64
df1['Revenue (USD millions)'] = df1['Revenue (USD millions)'].astype(str).str.replace(",", "").astype('Int64')

# Convert 'Industry' to categorical
df1['Industry'] = df1['Industry'].astype('category')

# Convert revenue growth to float after removing '%'
# NOTE(review): only the cell text is parsed here; if the page marks a decline
# with an icon instead of a minus sign, the sign is lost — confirm against the page.
df1["Revenue growth"] = df1["Revenue growth"].astype(str).str.replace("%", "").astype(float)

# Convert employees count to nullable integer after removing thousands separators
df1["Employees"] = df1["Employees"].astype(str).str.replace(",", "").astype('Int64')

# Split 'Headquarters' at the first comma only (n=1) into 'City' and 'State'
df1[["City", "State"]] = df1["Headquarters"].astype(str).str.split(",", n=1, expand=True)

# Strip surrounding spaces and convert to category
df1["City"] = df1["City"].str.strip().astype("category")
df1["State"] = df1["State"].str.strip().astype("category")

# Drop columns that are no longer needed ('Headquarters' is replaced by City/State)
df1 = df1.drop(columns=['Headquarters', 'Rank'])


In [None]:
# Inspect the cleaned DataFrame: df1.info() prints its summary, and
# df1.describe() is shown as the cell output.
df1.info()
df1.describe()

In [None]:
# Ten companies with the highest revenue
df_top10 = df1.nlargest(10, 'Revenue (USD millions)')

# One "husl" colour per selected company
custom_palette = sns.color_palette("husl", len(df_top10))

# Bar chart: one bar per company, coloured by company name
plt.figure(figsize=(19, 7))
ax = sns.barplot(
    data=df_top10,
    x='Name',
    y='Revenue (USD millions)',
    hue='Name',
    palette=custom_palette,
    legend=False,
)

# Write each bar's value as bold black text, centred horizontally on the bar
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2
    ax.text(
        bar_center,
        bar_height - 5,          # text baseline sits 5 data units under the bar top
        f'{int(bar_height)}',
        ha='center',
        va='baseline',
        fontsize=12,
        color='black',
        fontweight='bold',
    )

# Axis labels and title
ax.set_xlabel("Companies")
ax.set_ylabel("Revenue (USD Millions)")
ax.set_title("Top 10 Companies - Revenue(USD Millions)")

In [14]:
# Per-industry aggregates: total revenue and mean revenue growth
industry_groups = df1.groupby('Industry', observed=False)
Ind_total_rev = industry_groups['Revenue (USD millions)'].sum().reset_index()
Avg_Rev_growth = industry_groups['Revenue growth'].mean().reset_index()

# Combine both aggregates into a single table keyed by industry
Industry_data = Ind_total_rev.merge(Avg_Rev_growth, on='Industry')

In [None]:
# Select top 10 industries by revenue
Industry_data['Revenue (USD millions)'] = Industry_data['Revenue (USD millions)'].astype(int)
Industry_data['Industry'] = Industry_data['Industry'].astype('object')
Top_industries = Industry_data.sort_values(by='Revenue (USD millions)', ascending=False).head(10)

# One "tab20" colour per plotted industry (palette length matches the bar count)
custom_palette = sns.color_palette("tab20", len(Top_industries))

# Create bar plot for top industries.
# `hue` is set to the x variable, matching the companies chart above: a palette
# only takes effect through a hue variable.
plt.figure(figsize=(19, 7))  # Adjust figure size
ax1 = sns.barplot(
    x='Industry',
    y='Revenue (USD millions)',
    data=Top_industries,
    hue='Industry',
    palette=custom_palette,
    legend=False,
)

# Tick formatter used with matplotlib's FuncFormatter
def format_large_numbers(x, pos):
    """Return ``x`` as a string with thousands separators and no decimals.

    ``pos`` is accepted because FuncFormatter calls the function with two
    arguments; it is not used.
    """
    return f"{x:,.0f}"

# Apply the custom formatter to y-axis
plt.gca().yaxis.set_major_formatter(FuncFormatter(format_large_numbers))

# Annotate bars with values.
# The loop must walk ax1.patches (this chart's bars); iterating `ax.patches`
# would place the values of an earlier chart's bars onto this one.
for p in ax1.patches:
    ax1.text(p.get_x() + p.get_width() / 2,   # X-coordinate (center of bar)
             p.get_height() - 5,              # Y-coordinate (5 data units under the bar top)
             f'{int(p.get_height())}',        # Text (value)
             ha='center',                     # Horizontal alignment
             va='baseline',                   # Vertical alignment
             fontsize=12,
             color='black',                   # Black text
             fontweight='bold')

# Labels and title
plt.xticks(rotation=45, ha='right')
plt.xlabel("Industries")
plt.ylabel("Revenue (USD Millions)")
plt.title("Top 10 Industries - Revenue(USD Millions)")


In [None]:
# One "tab20" colour per industry. The palette is sized to the number of hue
# levels instead of a fixed 50, so the list length always matches what the
# plot needs.
palette = sns.color_palette("tab20", Industry_data['Industry'].nunique())

# Create a scatter plot for Revenue vs Revenue Growth;
# marker size and colour both encode the industry's revenue / identity
plt.figure(figsize=(12, 6))
ax = sns.scatterplot(data=Industry_data, x='Revenue (USD millions)', y='Revenue growth',
                     size='Revenue (USD millions)', hue='Industry',
                     sizes=(50, 2000), legend=False, palette=palette)

# Tick-label formatter: plain digits with thousands separators (no scientific notation)
def format_large_numbers(x, pos):
    """Format ``x`` with comma thousands separators and zero decimal places.

    ``pos`` is the second argument FuncFormatter passes in; it is ignored.
    """
    return format(x, ',.0f')

# Show x tick labels as comma-grouped integers
ax.xaxis.set_major_formatter(FuncFormatter(format_large_numbers))

# Dashed, fully opaque grid on both major and minor ticks
ax.grid(True, which='both', linestyle='--', linewidth=1, alpha=1)

# Title and axis labels
ax.set_title('Revenue vs Revenue Growth by Industry')
ax.set_xlabel('Revenue (USD millions)')
ax.set_ylabel('Revenue Growth (%)')

# Render the figure
plt.show()

In [None]:
# Count number of companies by state
No_of_companies = df1.groupby('State', observed=False)['Name'].count().reset_index()
No_of_companies['State'] = No_of_companies['State'].astype('object')
top_10_states = No_of_companies.sort_values(by='Name', ascending=False).head(10)

# Create bar plot for state-wise company count.
# `hue` names the 'State' column of the plotted frame (top_10_states); passing a
# Series taken from the unfiltered No_of_companies frame would feed the plot
# hue values for states that are not among the ten bars.
plt.figure(figsize=(20, 6))
ax = sns.barplot(x='State', y='Name', data=top_10_states, hue='State', palette="coolwarm", legend=False)

# Annotate bars with values
for p in ax.patches:
    ax.annotate(f'{int(p.get_height())}',
                (p.get_x() + p.get_width() / 2., p.get_height()),
                ha='center', va='center',
                xytext=(0, 5),  # Shift the text 5 points above the bar top
                textcoords='offset points')

# Add Labels and title
plt.xticks(rotation=45)
plt.title('State vs No. of Companies')
plt.xlabel('States')
plt.ylabel('Number of Fortune 100 companies')

In [None]:
# Total employee count per industry
Employee_count = df1.groupby('Industry', observed=False)['Employees'].sum().reset_index()

# Keep the ten industries with the most employees
top_10 = Employee_count.sort_values(by='Employees', ascending=False).head(10)

# Seaborn "pastel" colours and the "whitegrid" style
colors = sns.color_palette("pastel")
sns.set_style("whitegrid")

# Pie chart: one wedge per industry, labelled with its percentage share
plt.figure(figsize=(6, 6))
plt.pie(
    top_10['Employees'],
    labels=top_10['Industry'],
    colors=colors,
    autopct='%1.1f%%',
    startangle=140,
    wedgeprops=dict(edgecolor='black'),
)

# Title, then render
plt.title("Top 10 Industries by Employee Count")
plt.show()

In [None]:
# Pairwise correlations between the int64/float64 columns of df1
num_df = df1.select_dtypes(include=['int64', 'float64'])
corr_df = num_df.corr()

# Heatmap of the correlation matrix with the coefficient written in every cell
sns.heatmap(data=corr_df, annot=True, cmap='coolwarm')

# Title, then render
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Employees vs revenue: scatter with a fitted regression line
RR_df = df1[["Employees", "Revenue (USD millions)"]]

plt.figure(figsize=(10, 5))
sns.regplot(
    data=RR_df,
    x='Revenue (USD millions)',
    y='Employees',
    scatter_kws=dict(alpha=0.5),   # semi-transparent points
    line_kws=dict(color='red'),    # red regression line
)

# Show y tick labels (employee counts) as comma-grouped integers
plt.gca().yaxis.set_major_formatter(FuncFormatter(format_large_numbers))

# Title and axis labels, then render
plt.title('No.of Employees vs Revenue (USD millions)')
plt.xlabel('Revenue (USD millions)')
plt.ylabel('No.of Employees')
plt.show()