In [103]:
# import necessary packages
import numpy as np
import pandas as pd

Function definition

In [104]:
# Calculate measures of central tendency
def central_tendency(col):
    print(f"\tMean: {col.mean():.2f}")
    print(f"\tMedian: {col.median():.2f}")
    print(f"\tMode: {col.mode().values[0]:.2f}")
    return

# Calculate measures of spread
def measures_of_spread(col):
    print(f"\tRange: {col.max() - col.min():.2f}")
    print(f"\tVariance: {col.var():.2f}")
    print(f"\tStandard Deviation: {col.std():.2f}")
    return

# Get outliers lower bound
def get_cutoff_value_lower(col):
    IQR = col.quantile(0.75) - col.quantile(0.25)
    lower_bound_total_funding = col.quantile(0.25) - 1.5 * IQR
    return lower_bound_total_funding

# Get outliers upper bound
def get_cutoff_value_upper(col):
    IQR = col.quantile(0.75) - col.quantile(0.25)
    upper_bound_total_funding = col.quantile(0.75) + 1.5 * IQR
    return upper_bound_total_funding

# Drop outliers
def drop_outliers(col):
    df_filtered = df[(col >= get_cutoff_value_lower(col)) & (col <= get_cutoff_value_upper(col))]
    return df_filtered

# Impute outliers
def impute_outliers(col):
    df_filtered = df[(col >= get_cutoff_value_lower(col)) & (col <= get_cutoff_value_upper(col))]
    return df_filtered

# Replace outliers
def replace_outliers(col):
    df_filtered = df[(col >= get_cutoff_value_lower(col)) & (col <= get_cutoff_value_upper(col))]
    return df_filtered


Cleaning data import values for nulls, errors and column types

In [105]:
# import dataset
df = pd.read_csv('https://raw.githubusercontent.com/notpeter/crunchbase-data/master/companies.csv')
# df.head()


In [106]:
df['funding_total_usd'] = df['funding_total_usd'].replace('-', np.nan)
df['funding_total_usd'].fillna(0, inplace=True)
df['funding_total_usd'] = df['funding_total_usd'].astype(float)
df.loc[:, 'founded_at'] = df['first_funding_at']
df.dropna(subset=['name'], inplace=True)
# Remove comment to view null data from a specific column
# df[df['country_code'].isnull()]
# Remove comment to review total errors per column
# df.isnull().sum()

### Measuring Central Tendency and Spread

Calculate measures of central tendency and spread for a startup

In [107]:
print("Total Funding Raised:")
central_tendency(df['funding_total_usd'])
measures_of_spread(df['funding_total_usd'])

print("\nNumber of Funding Rounds:")
central_tendency(df['funding_rounds'])
measures_of_spread(df['funding_rounds'])

Total Funding Raised:
	Mean: 14919146.79
	Median: 1000000.00
	Mode: 0.00
	Range: 30079503000.00
	Variance: 28592863519409792.00
	Standard Deviation: 169094244.49

Number of Funding Rounds:
	Mean: 1.73
	Median: 1.00
	Mode: 1.00
	Range: 18.00
	Variance: 1.85
	Standard Deviation: 1.36


For "Total Funding Raised" the average funding raised by startups is $14,919,146 USD with an average of 1.7 rounds of funding per startup but it is highly likely that some startups have received exceptionally large funding amounts, which significantly skews the distribution. The mean is much higher than the median, indicating the presence of high-value outliers because probably few startups have received exceptionally large amounts of funding. The mode proves this too because there are many startups with 0 funding (but can also be due to missing data). The variance and range are very large which indicates a presence of outliers at the upper end of the distribution.

There is a lot less variation in the "Funding rounds" and when observing the difference of measures of central tendency which suggests that the data may be less affected by outliers. The measure of spread also contain moderate levels of variance which indicate that the outliers are not as noticeable.


### Outlier analysis

In [108]:
print("Funding USD Cutoff Values:")
print("\tLower Bound:", get_cutoff_value_lower(df['funding_total_usd'])) 
print("\tUpper Bound:", get_cutoff_value_upper(df['funding_total_usd'])) 

print("Funding Rounds Cutoff Values:")
print("\tLower Bound:", get_cutoff_value_lower(df['funding_rounds'])) 
print("\tUpper Bound:", get_cutoff_value_upper(df['funding_rounds'])) 

Funding USD Cutoff Values:
	Lower Bound: -10100968.25
	Upper Bound: 16941613.75
Funding Rounds Cutoff Values:
	Lower Bound: -0.5
	Upper Bound: 3.5


Dropping outliers...

In [109]:
print("Funding USD Central Tendency & Spread Values:")
a = drop_outliers(df['funding_total_usd'])
central_tendency(a['funding_total_usd'])
measures_of_spread(a['funding_total_usd'])

print("Funding Rounds Central Tendency & Spread Values:")
a = drop_outliers(df['funding_rounds'])
central_tendency(a['funding_rounds'])
measures_of_spread(a['funding_rounds'])

Funding USD Central Tendency & Spread Values:
	Mean: 2295979.82
	Median: 500000.00
	Mode: 0.00
	Range: 16940000.00
	Variance: 13652942299664.25
	Standard Deviation: 3694988.81
Funding Rounds Central Tendency & Spread Values:
	Mean: 1.39
	Median: 1.00
	Mode: 1.00
	Range: 2.00
	Variance: 0.42
	Standard Deviation: 0.65


Imputing the outliers...

In [110]:
print("IMPUTE:")
#a = impute_outliers(df['funding_total_usd'])
#central_tendency(a['funding_total_usd'])
#measures_of_spread(a['funding_total_usd'])

# Calculate the IQR for Total Funding Raised
Q1_total_funding = df['funding_total_usd'].quantile(0.25)
Q3_total_funding = df['funding_total_usd'].quantile(0.75)
IQR_total_funding = Q3_total_funding - Q1_total_funding

# Calculate the lower and upper bounds for outliers
lower_bound_total_funding = Q1_total_funding - 1.5 * IQR_total_funding
upper_bound_total_funding = Q3_total_funding + 1.5 * IQR_total_funding

# Impute outliers in Total Funding Raised with the mean
df['funding_total_usd'] = df['funding_total_usd'].apply(lambda x: df['funding_total_usd'].mean() if x < lower_bound_total_funding or x > upper_bound_total_funding else x)
central_tendency(df['funding_total_usd'] )
measures_of_spread(df['funding_total_usd'] )
# print(df['funding_total_usd'].apply(lambda x: df['funding_total_usd'].mean() if x < get_cutoff_value_lower(df['funding_rounds']) or x > get_cutoff_value_upper(df['funding_rounds']) else x))


#print("Funding Rounds Central Tendency & Spread Values:")
#a = impute_outliers(df['funding_rounds'])
#central_tendency(a['funding_rounds'])
#measures_of_spread(a['funding_rounds'])

IMPUTE:
	Mean: 4134667.04
	Median: 1000000.00
	Mode: 0.00
	Range: 16940000.00
	Variance: 31493812244873.62
	Standard Deviation: 5611934.80


Replacing the outliers...

In [111]:
print("Funding USD Central Tendency & Spread Values:")
a = replace_outliers(df['funding_total_usd'])
central_tendency(a['funding_total_usd'])
measures_of_spread(a['funding_total_usd'])

print("Funding Rounds Central Tendency & Spread Values:")
a = replace_outliers(df['funding_rounds'])
central_tendency(a['funding_rounds'])
measures_of_spread(a['funding_rounds'])

Funding USD Central Tendency & Spread Values:
	Mean: 4134667.04
	Median: 1000000.00
	Mode: 0.00
	Range: 16940000.00
	Variance: 31493812244873.62
	Standard Deviation: 5611934.80
Funding Rounds Central Tendency & Spread Values:
	Mean: 1.39
	Median: 1.00
	Mode: 1.00
	Range: 2.00
	Variance: 0.42
	Standard Deviation: 0.65
