# Data Analysis Commands

In [1]:
import pandas as pd

# Load the startup data set from its CSV file into a DataFrame.
# Missing values are not removed by this step; the frame is read as-is.
csv_path = '50_Startups.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [2]:
# Handle categorical vars by turning them into dummy indicator vars.
# drop_first = True omits the indicator for the first category of each
# encoded column, so the remaining indicators are not redundant.
# dtype = int keeps the indicators as 0/1 integers: pandas >= 2.0 would
# otherwise emit True/False booleans, which no longer matches the 0/1 table
# shown for this cell.
df = pd.get_dummies(df, drop_first = True, dtype = int)
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


In [3]:
# Find mean, std, median, min, max of column.
# Select the column as a Series (single brackets) so every reduction returns
# a scalar directly. Reducing a one-column DataFrame and then taking [0]
# performs a positional lookup on a label-indexed Series, which is deprecated
# in pandas >= 2.1.
administration = df['Administration']

df_mean = administration.mean()
df_std = administration.std()  # pandas default: sample std (ddof=1)
df_median = administration.median()
df_min = administration.min()
df_max = administration.max()

print("Mean = ", df_mean)
print("Standard Deviation = ", df_std)
print("Median = ", df_median)
print("Min = ", df_min)
print("Max = ", df_max)

Mean =  120879.51020408157
Standard Deviation =  28112.430588104202
Median =  122616.84
Min =  51283.14
Max =  182645.56


In [4]:
# Find 4th 5-quantile (q = 4/5, i.e. the 80th percentile) of column.
# Calling quantile on the Series yields the scalar directly; the previous
# DataFrame form needed a positional [0] lookup on a label-indexed Series,
# which is deprecated in pandas >= 2.1.
df['Profit'].quantile(q = 4/5)

146849.55200000003

In [5]:
# Keep only the rows where both conditions hold at once:
# Administration above 100000 AND R&D Spend above 130000.
admin_above_100k = df['Administration'] > 100000
rd_above_130k = df['R&D Spend'] > 130000
sub_df = df.loc[admin_above_100k & rd_above_130k]
sub_df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
6,134615.46,147198.87,127716.82,156122.51,0,0


In [6]:
# Order the filtered rows from highest to lowest Profit.
# sort_values sorts ascending unless told otherwise, hence ascending=False.
profit_descending = sub_df.sort_values('Profit', ascending=False)
sub_df = profit_descending
sub_df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
6,134615.46,147198.87,127716.82,156122.51,0,0


In [7]:
# Count number of rows.
# len() counts every row of the frame. Series.count() only counts non-NaN
# entries of a single column, so it can under-report rows when that column
# has missing values.
len(sub_df)

6

In [8]:
# Discard every row that contains at least one missing (NaN) value.
df = df.dropna(axis=0, how='any')
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0
