In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from collections import Counter

# Add company info

In [None]:
# Load the raw company table; the relative path is resolved against the notebook's working directory.
company_original = pd.read_csv("companies.csv")

In [None]:
# Preview the first rows of the raw table.
company_original.head()

In [None]:

# Column dtypes and non-null counts of the raw table.
company_original.info()

In [None]:
# Work on a copy of the raw table without the name, homepage and location-detail columns.
unused_columns = ['name', 'homepage_url', 'state_code', 'region', 'city']
final_df = company_original.drop(columns=unused_columns)

In [None]:
# Check which columns remain and how many non-null values each one has.
final_df.info()

In [None]:
# Preview the reduced table.
final_df.head()

# Convert funding_total_usd from string to numeric

In [None]:
# errors='coerce' turns every value that cannot be parsed as a number into NaN.
final_df['funding_total_usd'] = pd.to_numeric(final_df['funding_total_usd'], errors='coerce')

# Fill null/NA values with the column mode for every feature that has missing values

In [None]:
# Impute each listed column with its most frequent value (mode).
# The filled column is assigned back instead of calling fillna(..., inplace=True)
# on `final_df[col]`: that chained in-place form is deprecated and, with pandas
# Copy-on-Write enabled, only changes a temporary object and leaves final_df untouched.
mode_filled_columns = [
    'category_list',
    'country_code',
    'funding_rounds',
    'founded_at',
    'funding_total_usd',
]
for column in mode_filled_columns:
    final_df[column] = final_df[column].fillna(final_df[column].mode()[0])

In [None]:
# List the distinct values of the status column.
final_df['status'].unique()

In [None]:
# Re-check non-null counts after the mode imputation.
final_df.info()

# Remove operating status from the final dataset

In [None]:
# Drop every row whose status is 'operating'.
# .copy() makes the filtered result an independent frame, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
final_df = final_df[final_df.status != 'operating'].copy()
final_df.info()

In [None]:
# NOTE(review): the filled frame is only displayed, never assigned back, so
# `final_df` still holds its missing values after this cell runs.
final_df.fillna(0)

# Replace null first-funding dates with the date of the last funding round

In [None]:
# Show the rows whose first_funding_at is missing.
final_df[final_df['first_funding_at'].isnull()]

In [None]:
# Hand-set first_funding_at for two specific companies, identified by permalink.
first_funding_patches = {
    "/organization/motionmetrics": "2014-09-01",
    "/organization/topicmarks": "2011-03-18",
}
for company_permalink, funding_date in first_funding_patches.items():
    final_df.loc[final_df['permalink'] == company_permalink, 'first_funding_at'] = funding_date

# Replace abnormal founded-at dates with the first funding date

In [None]:
# Hand-set founded_at for two specific companies, identified by permalink.
founded_at_patches = {
    "/organization/rent2cash-com": "2014-01-01",
    "/organization/livamp-2": "2014-09-21",
}
for company_permalink, founded_date in founded_at_patches.items():
    final_df.loc[final_df['permalink'] == company_permalink, 'founded_at'] = founded_date

# Scale total funding to thousands and millions of USD

In [None]:
# Coerce total funding to numeric once more, then add copies scaled to
# thousands and millions.
funding_usd = pd.to_numeric(final_df['funding_total_usd'], errors='coerce')
final_df['funding_total_usd'] = funding_usd

# NOTE(review): 'fundind_total_k$' is a misspelling of 'funding'. The key is kept
# unchanged because renaming it would alter a feature name seen by later cells.
final_df['fundind_total_k$'] = funding_usd / 1000
final_df['funding_total_m$'] = funding_usd / 1000000

final_df.info()
final_df.head()




## Create labels

In [None]:
# Binary target: 1 when status is 'ipo' or 'acquired', 0 for every other status.
positive_statuses = ['ipo', 'acquired']
final_df['label'] = final_df['status'].isin(positive_statuses).astype('int64')

final_df.info()

## Split the data into train and test set 

In [None]:
import numpy as np
from sklearn.model_selection import KFold

X = final_df.drop(columns=['label'])
y = final_df['label']
kf = KFold(n_splits=5)

# The previous loop overwrote X_train / X_test on every iteration, so only the
# last of the five folds was ever kept (an 80/20 split), and it printed the full
# index arrays along the way. Take that last fold directly: same rows, no noise.
train_index, test_index = list(kf.split(X))[-1]

# .copy() gives independent frames; the feature-engineering cells below add
# columns to X_train / X_test and would otherwise hit SettingWithCopyWarning.
X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

In [None]:
# Display the training features produced by the split above.
X_train

In [None]:
# Display the held-out features produced by the split above.
X_test

# Adding new features to the train and test dataset

In [None]:
# Funding-duration features for the training rows. Unparseable dates become NaT
# (errors='coerce'), which propagates as missing values into the derived columns.
first_funding = pd.to_datetime(X_train['first_funding_at'], errors='coerce')
last_funding = pd.to_datetime(X_train['last_funding_at'], errors='coerce')

X_train['funding_duration'] = last_funding - first_funding

# Whole days as a plain number (drops the timedelta representation).
duration_days = X_train['funding_duration'].dt.days
X_train['funding_duration_days'] = duration_days

# Fractional years and months.
# NOTE(review): the year divisor is 364.0, not 365 — confirm this is intended.
X_train['funding_duration_year'] = duration_days / 364.0
X_train['funding_duration_month'] = duration_days / 30.0

In [None]:
# Funding-duration features for the test rows. Unparseable dates become NaT
# (errors='coerce'), which propagates as missing values into the derived columns.
first_funding = pd.to_datetime(X_test['first_funding_at'], errors='coerce')
last_funding = pd.to_datetime(X_test['last_funding_at'], errors='coerce')

X_test['funding_duration'] = last_funding - first_funding

# Whole days as a plain number (drops the timedelta representation).
duration_days = X_test['funding_duration'].dt.days
X_test['funding_duration_days'] = duration_days

# Fractional years and months.
# NOTE(review): the year divisor is 364.0, not 365 — confirm this is intended.
X_test['funding_duration_year'] = duration_days / 364.0
X_test['funding_duration_month'] = duration_days / 30.0


In [None]:
# Calendar-year gap between the first and last funding dates, and half of that gap.
first_year = pd.DatetimeIndex(X_train['first_funding_at']).year
last_year = pd.DatetimeIndex(X_train['last_funding_at']).year
X_train['Avg_duration_of_funding'] = last_year - first_year
X_train['Avg_funding_in_year'] = X_train['Avg_duration_of_funding'] / 2

X_train.head()

In [None]:
# Calendar-year gap between the first and last funding dates, and half of that gap.
first_year = pd.DatetimeIndex(X_test['first_funding_at']).year
last_year = pd.DatetimeIndex(X_test['last_funding_at']).year
X_test['Avg_duration_of_funding'] = last_year - first_year
X_test['Avg_funding_in_year'] = X_test['Avg_duration_of_funding'] / 2

X_test.head()

In [None]:
# Split each dash-separated date string into three text columns (year, month, day).
date_part_columns = {
    "founded_at": ["founded_at_year", "founded_at_month", "founded_at_day"],
    "first_funding_at": ["first_funding_year", "first_funding_month", "first_funding_day"],
    "last_funding_at": ["last_funding_year", "last_funding_month", "last_funding_day"],
}
for source_column, part_columns in date_part_columns.items():
    X_train[part_columns] = X_train[source_column].str.split("-", expand=True)

X_train.head()

In [None]:
# Split each dash-separated date string into three text columns (year, month, day).
date_part_columns = {
    "founded_at": ["founded_at_year", "founded_at_month", "founded_at_day"],
    "first_funding_at": ["first_funding_year", "first_funding_month", "first_funding_day"],
    "last_funding_at": ["last_funding_year", "last_funding_month", "last_funding_day"],
}
for source_column, part_columns in date_part_columns.items():
    X_test[part_columns] = X_test[source_column].str.split("-", expand=True)

X_test.head()

In [None]:
# The split date parts are text; convert each one to a number.
# Anything that cannot be parsed becomes NaN (errors='coerce').
date_part_names = [
    'founded_at_day', 'founded_at_month', 'founded_at_year',
    'first_funding_day', 'first_funding_month', 'first_funding_year',
    'last_funding_day', 'last_funding_month', 'last_funding_year',
]
for part_name in date_part_names:
    X_train[part_name] = pd.to_numeric(X_train[part_name], errors='coerce')

In [None]:
# The split date parts are text; convert each one to a number.
# Anything that cannot be parsed becomes NaN (errors='coerce').
date_part_names = [
    'founded_at_day', 'founded_at_month', 'founded_at_year',
    'first_funding_day', 'first_funding_month', 'first_funding_year',
    'last_funding_day', 'last_funding_month', 'last_funding_year',
]
for part_name in date_part_names:
    X_test[part_name] = pd.to_numeric(X_test[part_name], errors='coerce')

In [None]:
# category_list is '|'-separated; keep only the text before the first '|'.
category_split = X_train["category_list"].str.partition("|")
X_train["category_1"] = category_split[0]

X_train.head()
X_train.info()

In [None]:
# category_list is '|'-separated; keep only the text before the first '|'.
category_split = X_test["category_list"].str.partition("|")
X_test["category_1"] = category_split[0]

X_test.head()
X_test.info()

In [None]:
# Remove the identifier, the raw status, and the raw date / duration / category
# columns that the engineered features above replace.
raw_columns = [
    'permalink', 'status',
    'founded_at', 'first_funding_at', 'last_funding_at',
    'funding_duration', 'category_list',
]
X_train_new = X_train.drop(columns=raw_columns)

X_train_new.info()  #10668


In [None]:
# Remove the identifier, the raw status, and the raw date / duration / category
# columns that the engineered features above replace.
raw_columns = [
    'permalink', 'status',
    'founded_at', 'first_funding_at', 'last_funding_at',
    'funding_duration', 'category_list',
]
X_test_new = X_test.drop(columns=raw_columns)

X_test_new.info()  #2666

# Keep the Top 5 Countries and put the rest in Others

In [None]:
# Keep the five most frequent training countries and collapse every other
# country into a single 'other countries' bucket.
country = X_train_new['country_code'].astype('category')

# value_counts() sorts by frequency, so everything after position 5 is "rare".
others = country.value_counts().index[5:]
label1 = 'other countries'

# Series.replace on a categorical column is deprecated, so the swap is done on
# plain object values with mask() and the result is converted back to category.
X_train_new['country_code'] = (
    country.astype(object)
    .mask(country.isin(others), label1)
    .astype('category')
)


In [None]:
# Group test-set countries with the SAME scheme as the training set.
# Previously the top 5 was recomputed from the test rows and the bucket was
# spelled 'Other countries' (capital O), which the encoder fitted on the training
# column never sees; every grouped test row was therefore encoded as '<unknown>'.
labels1 = 'other countries'

# Values present in the (already grouped) training column: its kept countries
# plus the 'other countries' bucket itself.
train_countries = X_train_new['country_code'].dropna().astype(object).unique().tolist()

test_country = X_test_new['country_code'].astype(object)
keep_as_is = test_country.isin(train_countries) | test_country.isna()
X_test_new['country_code'] = test_country.where(keep_as_is, labels1).astype('category')

In [None]:
 # Distinct country values in the training set after the grouping above.
 X_train_new.country_code.unique()

In [None]:
# Distinct country values in the test set after the grouping above.
X_test_new.country_code.unique()

In [None]:
# Preview the test features after country grouping.
X_test_new.head()

# Label encode funding round type and code

In [None]:
from sklearn import preprocessing
# train, test = ... # SEPARATE YOUR DATA AS YOU WANT
# le = preprocessing.LabelEncoder()
# trained_le = le.fit(train)
# train = trained_le.transform(train)
# test = trained_le.transform(test)

# from sklearn.preprocessing import OrdinalEncoder
# encoder = OrdinalEncoder(handle_unknown='ignore', unknown_value=np.nan)

# NOTE(review): this encoder is never fitted in this cell; `le` is created again
# in a later cell before it is first used.
le = preprocessing.LabelEncoder()
#encode_cols = ['funding_round_type']

# trained_le = le.fit(X_train_new[encode_cols])
# X_train_new = trained_le.transform(X_train_new[encode_cols])
# X_test_new = trained_le.transform(X_test_new[encode_cols])

In [None]:
 # NOTE(review): no visible cell creates a funding_round_type column — confirm it
 # exists in companies.csv; if it does not, this attribute access raises AttributeError.
 X_train_new.funding_round_type.unique()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the training countries only.
le = LabelEncoder()
le.fit(X_train_new['country_code'])


def _known_or_placeholder(code):
    """Return `code` if the fitted encoder knows it, else the '<unknown>' sentinel."""
    return code if code in le.classes_ else '<unknown>'


# Test values the encoder has not seen are swapped for the sentinel first; the
# sentinel is then registered as an extra class so transform() accepts it.
X_test_new['country_code'] = X_test_new['country_code'].map(_known_or_placeholder)
le.classes_ = np.append(le.classes_, '<unknown>')

X_train_new['country_code'] = le.transform(X_train_new['country_code'])
X_test_new['country_code'] = le.transform(X_test_new['country_code'])

In [None]:
# Preview the test features after country_code has been label-encoded.
X_test_new.head()

In [None]:
# Dtypes and non-null counts of the encoded test features.
X_test_new.info()

In [None]:
# Dtypes and non-null counts of the encoded training features.
X_train_new.info()

# Tokenizing country code, company domain and category features using bag of words

# Tokenizing category and counting the occurrences ###########

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words counts for the first category of each company.
vectorizer = CountVectorizer(stop_words='english', analyzer='word')

# The vocabulary is learned from the training rows only and reused for the test rows.
train_1 = vectorizer.fit_transform(X_train_new['category_1'])
test_1 = vectorizer.transform(X_test_new['category_1'])

# get_feature_names() was removed in scikit-learn 1.2. Prefer
# get_feature_names_out() (available since 1.0) and fall back on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One column per vocabulary token; both frames share the same column order.
train_x = pd.DataFrame(train_1.toarray(), columns=feature_names)
test_x = pd.DataFrame(test_1.toarray(), columns=feature_names)

In [None]:
# Summary of the training bag-of-words matrix.
train_x.info()

In [None]:
# Summary of the test bag-of-words matrix.
test_x.info()

In [None]:
# train_x has a fresh 0..n-1 index, so the feature frame is re-indexed the same
# way before the two are placed side by side.
X_train_new = X_train_new.reset_index(drop=True)
res = pd.concat((X_train_new, train_x), axis="columns")
res.head()

# The raw category text is no longer needed once its token counts are attached.
X_train_res = res.drop(columns='category_1')

In [None]:
# Preview the final training feature matrix.
X_train_res.head()

In [None]:
# test_x has a fresh 0..n-1 index, so the feature frame is re-indexed the same
# way before the two are placed side by side.
X_test_new = X_test_new.reset_index(drop=True)
reset = pd.concat((X_test_new, test_x), axis="columns")

# The raw category text is no longer needed once its token counts are attached.
X_test_res = reset.drop(columns='category_1')

In [None]:
# Summary of the final test feature matrix.
X_test_res.info()

In [None]:
# Stack the processed training rows on top of the processed test rows and renumber the index.
X = pd.concat([X_train_res, X_test_res], axis=0).reset_index(drop=True)

In [None]:
# Stack the labels in the same train-then-test order as X so rows stay aligned.
y = pd.concat([y_train, y_test], axis=0).reset_index(drop=True)

In [None]:
# (rows, columns) of the recombined feature matrix.
X.shape

In [None]:
# Number of labels; should match the row count of X.
y.shape

In [None]:
# Drop columns whose name repeats an earlier column, keeping the first occurrence.
# NOTE(review): presumably guards against a bag-of-words token matching an existing
# feature name — the later duplicate is silently discarded; confirm that is acceptable.
X = X.loc[:,~X.columns.duplicated()].copy()
X

In [None]:
# Sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.metrics import roc_auc_score, plot_confusion_matrix, plot_roc_curve, roc_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
##import eli5
#from eli5.sklearn import PermutationImportance
from sklearn.utils import resample

# Models
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Train-validation split
# Stratified 80/20 split with a fixed seed.
# NOTE(review): the cross-validation cells further down reassign X_train / X_test /
# y_train / y_test inside their fold loops, so this split is overwritten there.
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y,train_size=0.8,test_size=0.2,random_state=0)

In [None]:
from sklearn.model_selection import RepeatedKFold

In [None]:
# prepare configuration for cross validation test harness
# Baseline comparison: score every candidate classifier with the same repeated
# 5-fold cross-validation and compare the resulting accuracy distributions.
seed = 7  # NOTE(review): not referenced by any model or splitter in this cell

models = [
    ('LR', LogisticRegression(random_state=42, max_iter=10**6)),
    # NOTE(review): n_neighbors=91 here, but the dedicated KNN cell uses 9 — confirm which is intended.
    ('KNN', KNeighborsClassifier(n_neighbors=91)),
    ('DT', DecisionTreeClassifier(random_state=42)),
    ('XGB', XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')),
    ('Adaboost', AdaBoostClassifier(n_estimators=100, random_state=0)),
    ('RFC-100', RandomForestClassifier(n_estimators=100, random_state=521)),
    ('RFC-200', RandomForestClassifier(n_estimators=200, random_state=521)),
]

scoring = 'accuracy'
# Build the splitter once: its fixed random_state yields the same 25 folds on
# every split() call, so each model is scored on identical partitions.
kfold = model_selection.RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)

# Evaluate each model in turn.
results = []
names = []
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

# Box plot of the per-fold accuracies, one box per model.
fig, ax = plt.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.grid()
ax.set_xticklabels(names)
plt.show()

In [None]:
# Save the comparison box plot (`fig` from the previous cell) as a PNG in the working directory.
fig.savefig('Algorithm_comparison_baseline.png')

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
#from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics 
from sklearn import ensemble
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, roc_auc_score, auc, f1_score
from scipy import interp
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
%matplotlib inline 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics 
from sklearn import ensemble
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, roc_auc_score, auc, f1_score
from scipy import interp
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
%matplotlib inline 

## ************ Different way of testing model accuracy and average AUC *****************************

In [None]:
############################LR model ##########################################

import matplotlib.patches as patches
from sklearn.model_selection import KFold 
# Logistic regression under 5x5 repeated K-fold CV: one ROC curve per fold,
# the averaged ROC curve, and the accuracy of every fold.
kf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)
model_a = LogisticRegression(random_state=42, max_iter=10**6)
accuracy_model_a = []

# Square canvas with two translucent arrows pointing away from the chance
# diagonal (green towards the upper left, red towards the lower right).
fig1 = plt.figure(figsize=[12, 12])
ax1 = fig1.add_subplot(111, aspect='equal')
for (x0, y0, dx, dy), arrow_color in [((0.45, 0.5, -0.25, 0.25), 'green'),
                                      ((0.5, 0.45, 0.25, -0.25), 'red')]:
    ax1.add_patch(patches.Arrow(x0, y0, dx, dy, width=0.3, color=arrow_color, alpha=0.5))

tprs, aucs = [], []
mean_fpr = np.linspace(0, 1, 100)
for i, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    model_a.fit(X_train, y_train)
    prediction = model_a.predict_proba(X_test)
    fpr, tpr, t = roc_curve(y_test, prediction[:, 1])
    roc_auc = auc(fpr, tpr)

    # Resample this fold's curve onto the shared FPR grid so the curves can be averaged.
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    aucs.append(roc_auc)

    ya_pred = model_a.predict(X_test)
    accuracy_model_a.append(accuracy_score(y_test, ya_pred))
    plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

# Chance diagonal and the mean curve across all folds.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue', lw=2, alpha=1,
         label=r'Mean ROC (AUC = %0.2f )' % (mean_auc))

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.text(0.32, 0.7, 'More accurate area', fontsize=12)
plt.text(0.63, 0.4, 'Less accurate area', fontsize=12)
plt.show()

print(accuracy_model_a)
print ("Avg accuracy for LR:  ", np.array(accuracy_model_a).mean())

# y_test / ya_pred still hold the final fold here, so the confusion matrix and
# the classification report describe that last fold only.
cm = confusion_matrix(y_test, ya_pred)
sns.heatmap(cm, annot=True, fmt='g')
plt.show()

print(classification_report(y_test, ya_pred))

In [None]:
from sklearn.model_selection import KFold 
# Decision tree under 5x5 repeated K-fold CV: one ROC curve per fold,
# the averaged ROC curve, and the accuracy of every fold.
kf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)
model_d = DecisionTreeClassifier(random_state=42)
accuracy_model_d = []

# Square canvas with two translucent arrows pointing away from the chance
# diagonal (green towards the upper left, red towards the lower right).
fig1 = plt.figure(figsize=[12, 12])
ax1 = fig1.add_subplot(111, aspect='equal')
for (x0, y0, dx, dy), arrow_color in [((0.45, 0.5, -0.25, 0.25), 'green'),
                                      ((0.5, 0.45, 0.25, -0.25), 'red')]:
    ax1.add_patch(patches.Arrow(x0, y0, dx, dy, width=0.3, color=arrow_color, alpha=0.5))

tprs, aucs = [], []
mean_fpr = np.linspace(0, 1, 100)
for i, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    model_d.fit(X_train, y_train)
    prediction = model_d.predict_proba(X_test)
    fpr, tpr, t = roc_curve(y_test, prediction[:, 1])
    roc_auc = auc(fpr, tpr)

    # Resample this fold's curve onto the shared FPR grid so the curves can be averaged.
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    aucs.append(roc_auc)

    ye_pred = model_d.predict(X_test)
    accuracy_model_d.append(accuracy_score(y_test, ye_pred))
    plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

# Chance diagonal and the mean curve across all folds.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue', lw=2, alpha=1,
         label=r'Mean ROC (AUC = %0.2f )' % (mean_auc))

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.text(0.32, 0.7, 'More accurate area', fontsize=12)
plt.text(0.63, 0.4, 'Less accurate area', fontsize=12)
plt.show()

print(accuracy_model_d)
print ("Avg accuracy for Decision Tree :  ", np.array(accuracy_model_d).mean())

# y_test / ye_pred still hold the final fold here, so the confusion matrix and
# the classification report describe that last fold only.
cm = confusion_matrix(y_test, ye_pred)
sns.heatmap(cm, annot=True, fmt='g')
plt.show()

print(classification_report(y_test, ye_pred))

In [None]:
import matplotlib.patches as patches
from sklearn.model_selection import KFold 
# Random forest (100 trees) under 5x5 repeated K-fold CV: one ROC curve per fold,
# the averaged ROC curve, and the accuracy of every fold.
kf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)
model_b = RandomForestClassifier(n_estimators=100, random_state=521)
accuracy_model_b = []

# Square canvas with two translucent arrows pointing away from the chance
# diagonal (green towards the upper left, red towards the lower right).
fig1 = plt.figure(figsize=[12, 12])
ax1 = fig1.add_subplot(111, aspect='equal')
for (x0, y0, dx, dy), arrow_color in [((0.45, 0.5, -0.25, 0.25), 'green'),
                                      ((0.5, 0.45, 0.25, -0.25), 'red')]:
    ax1.add_patch(patches.Arrow(x0, y0, dx, dy, width=0.3, color=arrow_color, alpha=0.5))

tprs, aucs = [], []
mean_fpr = np.linspace(0, 1, 100)
for i, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    model_b.fit(X_train, y_train)
    prediction = model_b.predict_proba(X_test)
    fpr, tpr, t = roc_curve(y_test, prediction[:, 1])
    roc_auc = auc(fpr, tpr)

    # Resample this fold's curve onto the shared FPR grid so the curves can be averaged.
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    aucs.append(roc_auc)

    yb_pred = model_b.predict(X_test)
    accuracy_model_b.append(accuracy_score(y_test, yb_pred))
    plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

# Chance diagonal and the mean curve across all folds.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue', lw=2, alpha=1,
         label=r'Mean ROC (AUC = %0.2f )' % (mean_auc))

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.text(0.32, 0.7, 'More accurate area', fontsize=12)
plt.text(0.63, 0.4, 'Less accurate area', fontsize=12)
plt.show()

print(accuracy_model_b)
print ("Avg accuracy for RF with 100 trees : ", np.array(accuracy_model_b).mean())

# y_test / yb_pred still hold the final fold here, so the confusion matrix and
# the classification report describe that last fold only.
cm = confusion_matrix(y_test, yb_pred)
sns.heatmap(cm, annot=True, fmt='g')
plt.show()

print(classification_report(y_test, yb_pred))

In [None]:
############################ KNN model ##########################################

from sklearn.model_selection import KFold 
# K-nearest neighbours (k=9) under 5x5 repeated K-fold CV: one ROC curve per fold,
# the averaged ROC curve, and the accuracy of every fold.
kf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)
model_c = KNeighborsClassifier(n_neighbors=9)
accuracy_model_c = []

# Square canvas with two translucent arrows pointing away from the chance
# diagonal (green towards the upper left, red towards the lower right).
fig1 = plt.figure(figsize=[12, 12])
ax1 = fig1.add_subplot(111, aspect='equal')
for (x0, y0, dx, dy), arrow_color in [((0.45, 0.5, -0.25, 0.25), 'green'),
                                      ((0.5, 0.45, 0.25, -0.25), 'red')]:
    ax1.add_patch(patches.Arrow(x0, y0, dx, dy, width=0.3, color=arrow_color, alpha=0.5))

tprs, aucs = [], []
mean_fpr = np.linspace(0, 1, 100)
for i, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    model_c.fit(X_train, y_train)
    prediction = model_c.predict_proba(X_test)
    fpr, tpr, t = roc_curve(y_test, prediction[:, 1])
    roc_auc = auc(fpr, tpr)

    # Resample this fold's curve onto the shared FPR grid so the curves can be averaged.
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    aucs.append(roc_auc)

    yc_pred = model_c.predict(X_test)
    accuracy_model_c.append(accuracy_score(y_test, yc_pred))
    plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

# Chance diagonal and the mean curve across all folds.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue', lw=2, alpha=1,
         label=r'Mean ROC (AUC = %0.2f )' % (mean_auc))

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.text(0.32, 0.7, 'More accurate area', fontsize=12)
plt.text(0.63, 0.4, 'Less accurate area', fontsize=12)
plt.show()

print(accuracy_model_c)
print ("Avg accuracy for KNN : ", np.array(accuracy_model_c).mean())

# y_test / yc_pred still hold the final fold here, so the confusion matrix and
# the classification report describe that last fold only.
cm = confusion_matrix(y_test, yc_pred)
sns.heatmap(cm, annot=True, fmt='g')
plt.show()

print(classification_report(y_test, yc_pred))

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import KFold 
# AdaBoost under 5x5 repeated K-fold CV: one ROC curve per fold,
# the averaged ROC curve, and the accuracy of every fold.
kf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)
model_d = AdaBoostClassifier(random_state=42)
accuracy_model_d = []

# Square canvas with two translucent arrows pointing away from the chance
# diagonal (green towards the upper left, red towards the lower right).
fig1 = plt.figure(figsize=[12, 12])
ax1 = fig1.add_subplot(111, aspect='equal')
for (x0, y0, dx, dy), arrow_color in [((0.45, 0.5, -0.25, 0.25), 'green'),
                                      ((0.5, 0.45, 0.25, -0.25), 'red')]:
    ax1.add_patch(patches.Arrow(x0, y0, dx, dy, width=0.3, color=arrow_color, alpha=0.5))

tprs, aucs = [], []
mean_fpr = np.linspace(0, 1, 100)
for i, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    model_d.fit(X_train, y_train)
    prediction = model_d.predict_proba(X_test)
    fpr, tpr, t = roc_curve(y_test, prediction[:, 1])
    roc_auc = auc(fpr, tpr)

    # Resample this fold's curve onto the shared FPR grid so the curves can be averaged.
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    aucs.append(roc_auc)

    yd_pred = model_d.predict(X_test)
    accuracy_model_d.append(accuracy_score(y_test, yd_pred))
    plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

# Chance diagonal and the mean curve across all folds.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue', lw=2, alpha=1,
         label=r'Mean ROC (AUC = %0.2f )' % (mean_auc))

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.text(0.32, 0.7, 'More accurate area', fontsize=12)
plt.text(0.63, 0.4, 'Less accurate area', fontsize=12)
plt.show()

print(accuracy_model_d)
print ("Avg accuracy for Adaboost ", np.array(accuracy_model_d).mean())

# y_test / yd_pred still hold the final fold here, so the confusion matrix and
# the classification report describe that last fold only.
cm = confusion_matrix(y_test, yd_pred)
sns.heatmap(cm, annot=True, fmt='g')
plt.show()

print(classification_report(y_test, yd_pred))

In [None]:
from sklearn.model_selection import KFold 
# XGBoost under 5x5 repeated K-fold CV: one ROC curve per fold,
# the averaged ROC curve, and the accuracy of every fold.
kf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)
model_e = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
accuracy_model_e = []

# Square canvas with two translucent arrows pointing away from the chance
# diagonal (green towards the upper left, red towards the lower right).
fig1 = plt.figure(figsize=[12, 12])
ax1 = fig1.add_subplot(111, aspect='equal')
for (x0, y0, dx, dy), arrow_color in [((0.45, 0.5, -0.25, 0.25), 'green'),
                                      ((0.5, 0.45, 0.25, -0.25), 'red')]:
    ax1.add_patch(patches.Arrow(x0, y0, dx, dy, width=0.3, color=arrow_color, alpha=0.5))

tprs, aucs = [], []
mean_fpr = np.linspace(0, 1, 100)
for i, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    model_e.fit(X_train, y_train)
    prediction = model_e.predict_proba(X_test)
    fpr, tpr, t = roc_curve(y_test, prediction[:, 1])
    roc_auc = auc(fpr, tpr)

    # Resample this fold's curve onto the shared FPR grid so the curves can be averaged.
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    aucs.append(roc_auc)

    ye_pred = model_e.predict(X_test)
    accuracy_model_e.append(accuracy_score(y_test, ye_pred))
    plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

# Chance diagonal and the mean curve across all folds.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue', lw=2, alpha=1,
         label=r'Mean ROC (AUC = %0.2f )' % (mean_auc))

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.text(0.32, 0.7, 'More accurate area', fontsize=12)
plt.text(0.63, 0.4, 'Less accurate area', fontsize=12)
plt.show()

print(accuracy_model_e)
print ("Avg accuracy for XGBoost :  ", np.array(accuracy_model_e).mean())

# y_test / ye_pred still hold the final fold here, so the confusion matrix and
# the classification report describe that last fold only.
cm = confusion_matrix(y_test, ye_pred)
sns.heatmap(cm, annot=True, fmt='g')
plt.show()

print(classification_report(y_test, ye_pred))

In [None]:
import matplotlib.patches as patches
from sklearn.model_selection import KFold 
# Random forest (200 trees) under 5x5 repeated K-fold CV: one ROC curve per fold,
# the averaged ROC curve, and the accuracy of every fold.
kf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)
model_b = RandomForestClassifier(n_estimators=200, random_state=521)
accuracy_model_b = []

# Square canvas with two translucent arrows pointing away from the chance
# diagonal (green towards the upper left, red towards the lower right).
fig1 = plt.figure(figsize=[12, 12])
ax1 = fig1.add_subplot(111, aspect='equal')
for (x0, y0, dx, dy), arrow_color in [((0.45, 0.5, -0.25, 0.25), 'green'),
                                      ((0.5, 0.45, 0.25, -0.25), 'red')]:
    ax1.add_patch(patches.Arrow(x0, y0, dx, dy, width=0.3, color=arrow_color, alpha=0.5))

tprs, aucs = [], []
mean_fpr = np.linspace(0, 1, 100)
for i, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    model_b.fit(X_train, y_train)
    prediction = model_b.predict_proba(X_test)
    fpr, tpr, t = roc_curve(y_test, prediction[:, 1])
    roc_auc = auc(fpr, tpr)

    # Resample this fold's curve onto the shared FPR grid so the curves can be averaged.
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    aucs.append(roc_auc)

    yb_pred = model_b.predict(X_test)
    accuracy_model_b.append(accuracy_score(y_test, yb_pred))
    plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

# Chance diagonal and the mean curve across all folds.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='black')
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue', lw=2, alpha=1,
         label=r'Mean ROC (AUC = %0.2f )' % (mean_auc))

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.text(0.32, 0.7, 'More accurate area', fontsize=12)
plt.text(0.63, 0.4, 'Less accurate area', fontsize=12)
plt.show()

print(accuracy_model_b)
print ("Avg accuracy for RF with 200 trees : ", np.array(accuracy_model_b).mean())

# y_test / yb_pred still hold the final fold here, so the confusion matrix and
# the classification report describe that last fold only.
cm = confusion_matrix(y_test, yb_pred)
sns.heatmap(cm, annot=True, fmt='g')
plt.show()

print(classification_report(y_test, yb_pred))

In [None]:
from sklearn.model_selection import RepeatedKFold 
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

# Compare several classifiers on one figure: each model is cross-validated
# with the same seeded 5x5 repeated k-fold splitter and its mean ROC curve
# (averaged over all folds) is plotted.
kf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)

accuracy_model_e = []
model_e = xgb.XGBClassifier(use_label_encoder =False, eval_metric='mlogloss', )
accuracy_model_d = []
model_d = AdaBoostClassifier(random_state=42)
accuracy_model_c = []
model_c = KNeighborsClassifier(n_neighbors=9)
accuracy_model_b = []
model_b = RandomForestClassifier(n_estimators=100, random_state=521)
accuracy_model_b1 = []
model_b1 = RandomForestClassifier(n_estimators=200, random_state=521)
accuracy_model_a = []
model_a = LogisticRegression(random_state=42, max_iter= 10**6)
accuracy_model_f = []
model_f = DecisionTreeClassifier(random_state=42)

# One row per curve, in plotting order:
# (key, estimator, list receiving per-fold accuracy, legend format, line colour)
roc_runs = [
    ('d',  model_d,  accuracy_model_d,  r'Mean ROC AdaBoost(AUC = %0.2f )', 'blue'),
    ('e',  model_e,  accuracy_model_e,  r'Mean ROC XGBoost (AUC = %0.2f )', 'black'),
    ('a',  model_a,  accuracy_model_a,  r'Mean ROC LR, (AUC = %0.2f )',     'green'),
    ('b',  model_b,  accuracy_model_b,  r'Mean ROC RF 100, (AUC = %0.2f )', 'yellow'),
    ('b1', model_b1, accuracy_model_b1, r'Mean ROC RF 200, (AUC = %0.2f )', 'purple'),
    ('c',  model_c,  accuracy_model_c,  r'Mean ROC KNN, (AUC = %0.2f )',    'orange'),
    ('f',  model_f,  accuracy_model_f,  r'Mean ROC DT, (AUC = %0.2f )',     'magenta'),
]
last_fold_pred = {}   # key -> hard-label predictions from the final fold

fig1 = plt.figure(figsize=[8,6])

# Diagonal reference line from (0, 0) to (1, 1); drawn once for the figure.
plt.plot([0,1],[0,1],linestyle = '--',lw = 2,color = 'black')

# Common FPR grid so per-fold curves can be averaged point-wise.
mean_fpr = np.linspace(0,1,100)

for key, model, fold_accuracies, label_fmt, colour in roc_runs:
    tprs = []   # per-fold TPR resampled onto mean_fpr
    aucs = []   # per-fold AUC
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Column 1 of predict_proba is used as the score passed to roc_curve.
        prediction = model.fit(X_train, y_train).predict_proba(X_test)
        fpr, tpr, t = roc_curve(y_test, prediction[:, 1])
        tprs.append(np.interp(mean_fpr, fpr, tpr))
        roc_auc = auc(fpr, tpr)
        aucs.append(roc_auc)

        # Accuracy must come from the estimator that was just fitted on this
        # fold, never from a different model.
        fold_pred = model.predict(X_test)
        fold_accuracies.append(accuracy_score(y_test, fold_pred))

    last_fold_pred[key] = fold_pred
    mean_tpr = np.mean(tprs, axis=0)
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, color=colour,
             label=label_fmt % (mean_auc),lw=2, alpha=1)

# Expose the final-fold predictions under the per-model names used elsewhere
# in the notebook.
yd_pred = last_fold_pred['d']
ye_pred = last_fold_pred['e']
ya_pred = last_fold_pred['a']
yb_pred = last_fold_pred['b']
yb1_pred = last_fold_pred['b1']
yc_pred = last_fold_pred['c']
yf_pred = last_fold_pred['f']

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=14)
plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel('True Positive Rate', fontsize=14)
plt.title('ROC')
plt.legend(loc="lower right")

# Save the figure created in this cell (fig1) before showing it, so the file
# is written from the fully drawn figure regardless of the active backend.
fig1.savefig('roc_curve_baseline.png')

plt.show()

In [None]:
# Write the multi-model ROC figure to disk. The figure built by the comparison
# cell is bound to `fig1`, so that is the object to save.
fig1.savefig('multiple_roc_curve_baseline.png')