In [None]:
import numpy as np
import pandas as pd
# Raise pandas' display limits so wide or long frames are not truncated.
for _display_option in ("display.max_colwidth", "display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, 1000)
import seaborn as sns
import scipy.stats as stats
import matplotlib.pyplot as plt

%matplotlib inline

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Read the startup records from the CSV file (path is relative to the working directory)
data = pd.read_csv(filepath_or_buffer='startups.csv')

In [None]:
# Quick look at the loaded data: its dimensions, then the first five rows
print(f'data shape is {data.shape}')
data.head(n=5)

#### Defining the failure/success event as a 0/1 outcome y variable:

In [None]:
# Distinct raw values found in the 'Company Status' column
status_column = data['Company Status']
status_column.unique()

In [None]:
# Number of rows per raw 'Company Status' value
status_column = data['Company Status']
status_column.value_counts()

In [None]:
# Raw statuses labelled as a successful outcome (1).
success = {
    'Acquisition',
    'Went Public',
    'Merger',
    'LBO',
    'Pending Acquisition',
}
# Raw statuses labelled as a failed outcome (0).
failure = {
    'Defunct',
    'Bankruptcy - Chapter 11',
    'Bankruptcy - Chapter 7',
}


def get_status(x):
    """Translate a raw company status into a binary outcome label.

    Returns 1 when ``x`` is a member of ``success``, 0 when it is a member
    of ``failure``, and ``np.nan`` for any other value.
    """
    if x in success:
        return 1
    return 0 if x in failure else np.nan
# Label each company via get_status: 1 = success, 0 = failure, NaN = neither
data['Company_Status_Label'] = data['Company Status'].apply(get_status)

# Keep only the rows that received a label; the shape is shown before and after
data.shape
data = data.loc[data['Company_Status_Label'].notna()]
data.shape


## Applying supervised learning methods:

In [None]:
# Subsetting the numerical features, minus the columns listed in columns_to_drop
columns_to_drop = ['startfund', 'endfund']
data_numerical = (
    data
    .select_dtypes(include=['number'])
    .drop(columns=columns_to_drop)
)
data_numerical.dtypes;

In [None]:
# Discard every row that has at least one missing value; shape shown before and after
data_numerical.shape
data_numerical = data_numerical.dropna(axis=0, how="any")
data_numerical.shape

#### Train test split, and feature scaling:

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# The label column is the target; every other numeric column is a feature
y = data_numerical['Company_Status_Label']
X = data_numerical.drop(columns=['Company_Status_Label'])

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=11
)

# Fit the scaler on the training rows only, then apply that same scaling to both sets
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#### Logistic regression:

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings, then report train/test accuracy
logreg = LogisticRegression().fit(X_train, y_train)
print(f'Accuracy of Logistic regression classifier on training set: {logreg.score(X_train, y_train):.2f}')
print(f'Accuracy of Logistic regression classifier on testing set: {logreg.score(X_test, y_test):.2f}')

#### Decision tree:

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so its reported accuracies are reproducible across
# Restart & Run All; the value matches the seed used for the train/test split.
dectree = DecisionTreeClassifier(random_state=11)
dectree.fit(X_train, y_train)
print(f'Accuracy of Decision tree classifier on training set: {dectree.score(X_train, y_train):.2f}')
print(f'Accuracy of Decision tree classifier on testing set: {dectree.score(X_test, y_test):.2f}')

#### KNN:

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier voting over the 5 closest training points
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
# The training-set label previously read "K classifier"; it now matches the testing-set line.
print(f'Accuracy of KNN classifier on training set: {knn.score(X_train, y_train):.2f}')
print(f'Accuracy of KNN classifier on testing set: {knn.score(X_test, y_test):.2f}')

In [None]:
from sklearn import svm

# Support vector classifier fitted on the scaled training features
clf = svm.SVC(decision_function_shape='ovo')
clf.fit(X_train, y_train)
# Score the SVM itself (`clf`); the original scored `knn`, so this cell
# reported the KNN accuracies under the SVM label.
print(f'Accuracy of SVM classifier on training set: {clf.score(X_train, y_train):.2f}')
print(f'Accuracy of SVM classifier on testing set: {clf.score(X_test, y_test):.2f}')