In [189]:
import pandas as pd
import numpy as np

In [190]:
startup_df = pd.read_csv("startup_data.csv")
display(startup_df.head(10))

Unnamed: 0,state_code,latitude,longitude,founded_at,age_first_funding_year,age_last_funding_year,funding_rounds,funding_total_usd,category_code,has_VC,has_angel,has_seriesA,has_seriesB,has_seriesC,has_seriesD,avg_participants,is_top500,status
0,CA,42.35888,-71.05682,1/1/2007,2.2493,3.0027,3.0,375000,music,No,Yes,No,No,No,No,1.0,No,acquired
1,CA,37.238916,-121.973718,1/1/2000,5.126,9.9973,4.0,40100000,enterprise,Yes,No,No,Yes,Yes,Yes,4.75,Yes,acquired
2,CA,32.901049,-117.192656,3/18/2009,1.0329,1.0329,1.0,-2600000,web,No,No,Yes,No,No,No,4.0,Yes,acquired
3,CA,37.320309,-122.05004,1/1/2002,3.1315,5.3151,3.0,40000000,software,No,No,No,Yes,Yes,Yes,3.3333,Yes,acquired
4,CA,37.779281,-122.419236,8/1/2010,,1.6685,2.0,1300000,games_video,Yes,Yes,No,No,No,No,1.0,Yes,closed
5,CA,37.406914,-122.09037,1/1/2002,,4.5452,1.0,7500000,network_hosting,No,No,No,Yes,No,No,3.0,Yes,closed
6,CA,37.391559,-122.070264,1/1/2005,1.7205,5.211,3.0,26000000000,software,Yes,No,Yes,Yes,No,No,1.6667,Yes,acquired
7,CA,38.057107,-122.513742,1/1/2004,1.6466,6.7616,3.0,34100000,,No,No,Yes,Yes,No,Yes,3.5,Yes,acquired
8,MA,42.712207,-73.203599,1/1/2002,3.5863,11.1123,3.0,9650000,,Yes,No,Yes,No,No,Yes,,Yes,acquired
9,CA,37.427235,-122.145783,6/1/2005,1.6712,4.6849,3.0,5750000,,Yes,Yes,Yes,No,No,No,,Yes,acquired


## Data Description

- state_code: kode state startup
- latitude: posisi latitude startup
- longitude: posisi longitude startup
- founded_at: tanggal ketika startup tersebut didirikan
- age_first_funding_year: umur startup dalam tahun ketika pertama kali mendapatkan funding
- age_last_funding_year: umur startup dalam tahun ketika terakhir kali mendapatkan funding
- funding_rounds: banyaknya funding yang diterima oleh startup 
- funding_total_usd: jumlah funding yang diterima oleh startup dalam USD
- category_code: bidang yang menjadi fokus dari startup
- has_VC: apakah startup tersebut memiliki venture capital
- has_angel: apakah startup tersebut memiliki angel investor
- has_seriesA: apakah startup tersebut mendapatkan funding series A
- has_seriesB: apakah startup tersebut mendapatkan funding series B
- has_seriesC: apakah startup tersebut mendapatkan funding series C
- has_seriesD: apakah startup tersebut mendapatkan funding series D
- avg_participants: rata-rata banyak pengguna dari startup tersebut dalam juta
- is_top500: apakah startup tersebut pernah masuk ke dalam 500 startup dengan peringkat teratas di Amerika
- status (target): status dari startup tersebut sekarang, acquired berarti startup tersebut berhasil karena diakuisisi oleh organisasi lain, sebaliknya, closed berarti startup tersebut sudah berhenti beroperasi dan gagal

In [191]:
startup_df.dtypes

state_code                 object
latitude                  float64
longitude                 float64
founded_at                 object
age_first_funding_year    float64
age_last_funding_year     float64
funding_rounds            float64
funding_total_usd           int64
category_code              object
has_VC                     object
has_angel                  object
has_seriesA                object
has_seriesB                object
has_seriesC                object
has_seriesD                object
avg_participants          float64
is_top500                  object
status                     object
dtype: object

In [192]:
startup_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 949 entries, 0 to 948
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   state_code              949 non-null    object 
 1   latitude                949 non-null    float64
 2   longitude               949 non-null    float64
 3   founded_at              949 non-null    object 
 4   age_first_funding_year  885 non-null    float64
 5   age_last_funding_year   894 non-null    float64
 6   funding_rounds          946 non-null    float64
 7   funding_total_usd       949 non-null    int64  
 8   category_code           897 non-null    object 
 9   has_VC                  949 non-null    object 
 10  has_angel               949 non-null    object 
 11  has_seriesA             949 non-null    object 
 12  has_seriesB             949 non-null    object 
 13  has_seriesC             949 non-null    object 
 14  has_seriesD             949 non-null    ob

In [193]:
startup_df.describe()

Unnamed: 0,latitude,longitude,age_first_funding_year,age_last_funding_year,funding_rounds,funding_total_usd,avg_participants
count,949.0,949.0,885.0,894.0,946.0,949.0,901.0
mean,38.530398,-103.465916,2.322209,3.987377,2.313953,1544590000.0,2.857545
std,3.75615,22.382011,2.941926,3.313172,1.402098,24466760000.0,1.875341
min,25.752358,-122.756956,-9.0466,-9.0466,1.0,-9500000.0,1.0
25%,37.388869,-122.200914,0.5178,1.690375,1.0,2700000.0,1.5
50%,37.779281,-118.354605,1.4247,3.4945,2.0,10000000.0,2.5
75%,40.730646,-77.212493,3.6027,5.54315,3.0,24900000.0,4.0
max,59.335232,18.057121,34.4904,34.4904,10.0,642310000000.0,16.0


In [194]:
startup_df.shape

(949, 18)

## Data Preprocessing

* Redundant Data

In [195]:
print(f"Jumlah data redundan {startup_df.duplicated().sum()}")

Jumlah data redundan 26


In [196]:
startup_df.drop_duplicates(inplace=True)

In [197]:
startup_df.shape

(923, 18)

In [198]:
print(f"Jumlah data redundan {startup_df.duplicated().sum()}")

Jumlah data redundan 0


* Missing values

In [199]:
startup_df.isnull().sum()

state_code                 0
latitude                   0
longitude                  0
founded_at                 0
age_first_funding_year    60
age_last_funding_year     55
funding_rounds             3
funding_total_usd          0
category_code             48
has_VC                     0
has_angel                  0
has_seriesA                0
has_seriesB                0
has_seriesC                0
has_seriesD                0
avg_participants          48
is_top500                  0
status                     0
dtype: int64

Since the sum of null values are much enough, we can't drop the rows. Hence, we fill the null values with mean value as not disrupt the analysis. 

In [200]:
mean_affy = startup_df['age_first_funding_year'].mean()
startup_df['age_first_funding_year'].fillna(mean_affy, inplace=True)

In [201]:
mean_alfy = startup_df['age_last_funding_year'].mean()
startup_df['age_last_funding_year'].fillna(mean_alfy, inplace=True)

In [202]:
startup_df['category_code'].value_counts().head(1)
startup_df['category_code'].fillna('web', inplace=True)

In [203]:
mean_ap = startup_df['avg_participants'].mean()
startup_df['avg_participants'].fillna(mean_ap, inplace=True)

In [204]:
startup_df.dropna(inplace=True)

In [205]:
startup_df.isnull().sum()

state_code                0
latitude                  0
longitude                 0
founded_at                0
age_first_funding_year    0
age_last_funding_year     0
funding_rounds            0
funding_total_usd         0
category_code             0
has_VC                    0
has_angel                 0
has_seriesA               0
has_seriesB               0
has_seriesC               0
has_seriesD               0
avg_participants          0
is_top500                 0
status                    0
dtype: int64

## Exploratory Data Analysis

In [206]:
df2 = startup_df[['age_first_funding_year', 'age_last_funding_year', 'funding_total_usd']]
df2.describe()


Unnamed: 0,age_first_funding_year,age_last_funding_year,funding_total_usd
count,920.0,920.0,920.0
mean,2.32642,4.000726,1592146000.0
std,2.847659,3.216783,24848300000.0
min,-9.0466,-9.0466,-9500000.0
25%,0.5863,1.86985,2737500.0
50%,1.6712,3.74245,10000000.0
75%,3.38835,5.332875,24925000.0
max,34.4904,34.4904,642310000000.0


There are anomalies data where the minimum value of age_first_funding_year and age_last_funding_year is minus. Let's investigate it

In [211]:
startup_df['age_first_funding_year'].dtype

dtype('float64')

In [218]:
startup_df.loc(startup_df['age_first_funding_year'] == -9.046600)

TypeError: unhashable type: 'Series'