In [374]:
# import necessary packages
import numpy as np
import pandas as pd

Function definition

In [375]:
# Calculate measures of central tendency
def central_tendency(col):
    print(f"\tMean: {col.mean():.2f}")
    print(f"\tMedian: {col.median():.2f}")
    print(f"\tMode: {col.mode().values[0]:.2f}")
    return

# Calculate measures of spread
def measures_of_spread(col):
    print(f"\tRange: {col.max() - col.min():.2f}")
    print(f"\tVariance: {col.var():.2f}")
    print(f"\tStandard Deviation: {col.std():.2f}")
    return

# Get outliers lower bound
def get_cutoff_value_lower(col):
    IQR = col.quantile(0.75) - col.quantile(0.25)
    lower_bound_total_funding = col.quantile(0.25) - 1.5 * IQR
    return float(lower_bound_total_funding)

# Get outliers upper bound
def get_cutoff_value_upper(col):
    IQR = col.quantile(0.75) - col.quantile(0.25)
    upper_bound_total_funding = col.quantile(0.75) + 1.5 * IQR
    return float(upper_bound_total_funding)

# Drop outliers
def drop_outliers(col):
    df_filtered = df[(col >= get_cutoff_value_lower(col)) & (col <= get_cutoff_value_upper(col))]
    return df_filtered

# Impute outliers
def impute_outliers(col):
    lower_bound_funding_rounds = get_cutoff_value_lower(col)
    upper_bound_funding_rounds = get_cutoff_value_upper(col)
    mean_funding_rounds = col.mean()
    return col.apply(lambda x: mean_funding_rounds if x < lower_bound_funding_rounds or x > upper_bound_funding_rounds else x)

# Replace outliers
def capping_method(col):
    max_value = col.mean() + (3 * col.std())
    min_value = mean_funding_rounds - (3 * col.std())
    return col.clip(lower=min_value, upper=max_value)


Cleaning data import values for nulls, errors and column types

In [376]:
# import dataset
df = pd.read_csv('https://raw.githubusercontent.com/notpeter/crunchbase-data/master/companies.csv')
# df.head()


In [377]:
df['funding_total_usd'] = df['funding_total_usd'].replace('-', np.nan)
df['funding_total_usd'].fillna(0, inplace=True)
df['funding_total_usd'] = df['funding_total_usd'].astype(float)
df.loc[:, 'founded_at'] = df['first_funding_at']
df.dropna(subset=['name'], inplace=True)
# Remove comment to view null data from a specific column
# df[df['country_code'].isnull()]
# Remove comment to review total errors per column
# df.isnull().sum()

### Measuring Central Tendency and Spread

Calculate measures of central tendency and spread for a startup

In [378]:
print("Total Funding Raised:")
central_tendency(df['funding_total_usd'])
measures_of_spread(df['funding_total_usd'])

print("\nNumber of Funding Rounds:")
central_tendency(df['funding_rounds'])
measures_of_spread(df['funding_rounds'])

Total Funding Raised:
	Mean: 14919146.79
	Median: 1000000.00
	Mode: 0.00
	Range: 30079503000.00
	Variance: 28592863519409792.00
	Standard Deviation: 169094244.49

Number of Funding Rounds:
	Mean: 1.73
	Median: 1.00
	Mode: 1.00
	Range: 18.00
	Variance: 1.85
	Standard Deviation: 1.36


For "Total Funding Raised" the average funding raised by startups is $14,919,146 USD with an average of 1.7 rounds of funding per startup but it is highly likely that some startups have received exceptionally large funding amounts, which significantly skews the distribution. The mean is much higher than the median, indicating the presence of high-value outliers because probably few startups have received exceptionally large amounts of funding. The mode proves this too because there are many startups with 0 funding (but can also be due to missing data). The variance and range are very large which indicates a presence of outliers at the upper end of the distribution.

There is a lot less variation in the "Funding rounds" and when observing the difference of measures of central tendency which suggests that the data may be less affected by outliers. The measure of spread also contain moderate levels of variance which indicate that the outliers are not as noticeable.


### Outlier analysis

In [379]:
print("Funding USD Cutoff Values:")
print("\tLower Bound:", get_cutoff_value_lower(df['funding_total_usd'])) 
print("\tUpper Bound:", get_cutoff_value_upper(df['funding_total_usd'])) 

print("Funding Rounds Cutoff Values:")
print("\tLower Bound:", get_cutoff_value_lower(df['funding_rounds'])) 
print("\tUpper Bound:", get_cutoff_value_upper(df['funding_rounds'])) 

Funding USD Cutoff Values:
	Lower Bound: -10100968.25
	Upper Bound: 16941613.75
Funding Rounds Cutoff Values:
	Lower Bound: -0.5
	Upper Bound: 3.5


Dropping outliers...

In [380]:
print("Funding USD Central Tendency & Spread Values:")
a = drop_outliers(df['funding_total_usd'])
central_tendency(a['funding_total_usd'])
measures_of_spread(a['funding_total_usd'])

print("Funding Rounds Central Tendency & Spread Values:")
a = drop_outliers(df['funding_rounds'])
central_tendency(a['funding_rounds'])
measures_of_spread(a['funding_rounds'])

Funding USD Central Tendency & Spread Values:
	Mean: 2295979.82
	Median: 500000.00
	Mode: 0.00
	Range: 16940000.00
	Variance: 13652942299664.25
	Standard Deviation: 3694988.81
Funding Rounds Central Tendency & Spread Values:
	Mean: 1.39
	Median: 1.00
	Mode: 1.00
	Range: 2.00
	Variance: 0.42
	Standard Deviation: 0.65


Imputing the outliers...

In [381]:
print("Funding USD Central Tendency & Spread Values:")
df_impute = impute_outliers(df['funding_total_usd'])
central_tendency(df_impute)
measures_of_spread(df_impute)

print("Funding Rounds Central Tendency & Spread Values:")
df_impute = impute_outliers(df['funding_rounds'])
central_tendency(df_impute)
measures_of_spread(df_impute)

Funding USD Central Tendency & Spread Values:
	Mean: 4134667.04
	Median: 1000000.00
	Mode: 0.00
	Range: 16940000.00
	Variance: 31493812244873.62
	Standard Deviation: 5611934.80
Funding Rounds Central Tendency & Spread Values:
	Mean: 1.42
	Median: 1.00
	Mode: 1.00
	Range: 2.00
	Variance: 0.40
	Standard Deviation: 0.63


Replacing the outliers...

In [382]:
print("Funding USD Central Tendency & Spread Values:")
capping_method_usd = capping_method(df['funding_total_usd'])
central_tendency(capping_method_usd)
measures_of_spread(capping_method_usd)

print("Funding Rounds Central Tendency & Spread Values:")
capping_method_rounds = capping_method(df['funding_rounds'])
central_tendency(capping_method_rounds)
measures_of_spread(capping_method_rounds)

Funding USD Central Tendency & Spread Values:
	Mean: 12259954.54
	Median: 1000000.00
	Mode: 0.00
	Range: 522201880.26
	Variance: 1779142869819118.00
	Standard Deviation: 42179887.03
Funding Rounds Central Tendency & Spread Values:
	Mean: 1.69
	Median: 1.00
	Mode: 1.00
	Range: 4.81
	Variance: 1.37
	Standard Deviation: 1.17


We can see that each method provides different values of central tendency and measures of spread but there is little difference between different methods once we have dealt with outliers, no matter in which any way. It seems what is important is to first notice that there are outliers and then deal them in whichever way is best depending on the dataset and variance observed.

### Sampling Techniques
Let's drop all outliers and draw a random sample of 1000 for each variable and see the results.

In [383]:
random_sample_df = drop_outliers(df['funding_total_usd']).sample(n=1000) #random_state=42
central_tendency(random_sample_df['funding_total_usd'])
measures_of_spread(random_sample_df['funding_total_usd'])

random_sample_df = drop_outliers(df['funding_rounds']).sample(n=1000) #random_state=42
central_tendency(random_sample_df['funding_rounds'])
measures_of_spread(random_sample_df['funding_rounds'])

	Mean: 2181356.48
	Median: 456754.00
	Mode: 0.00
	Range: 16236400.00
	Variance: 12271093505899.47
	Standard Deviation: 3503012.06
	Mean: 1.40
	Median: 1.00
	Mode: 1.00
	Range: 2.00
	Variance: 0.44
	Standard Deviation: 0.66


We can see that depending on the sample selected we will get different results  but the results dont vary more than roughly 5%.

### Subsetting and Sampling
 Segment your dataset from Question 3 to get companies based only in United States and this time draw a series of samples randomly of a sufficient size, say, 10, 50, 100, 500, and 1000. Next, analyse the total funding amount for these startups based in United States using various methods of central tendency and measures of spread and report your observations.

In [391]:
random_sample_df = drop_outliers(df['funding_total_usd']).sample(n=1000)
usa_mask = df['country_code'] == 'USA'
print(random_sample_df[usa_mask & ~random_sample_df])

#central_tendency(random_sample_df['funding_total_usd'])
#measures_of_spread(random_sample_df['funding_total_usd'])

#random_sample_df = drop_outliers(df['funding_rounds']).sample(n=500)
#central_tendency(random_sample_df['funding_rounds'])
#measures_of_spread(random_sample_df['funding_rounds'])

TypeError: bad operand type for unary ~: 'str'