In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from collections import Counter

# Add company info

In [None]:
company_original = pd.read_csv("companies.csv")

In [None]:
company_original.head()

In [None]:
company_original.loc[company_original['permalink'] == '/organization/goot']

In [None]:
#read 
company_domain = pd.read_csv("Companies_domain.csv")

In [None]:
#get Domain file to join on PERMALINK to get DOMAIN COLUMN FROM company_domain dataframe

company = pd.merge(company_original,company_domain[['permalink','Domain']],on='permalink', how='left')

In [None]:

company.info()

# Add investment data

In [None]:
investment_feature = pd.read_csv("investments.csv")
investment_feature.head()

In [None]:
investment_feature.info()

In [None]:
# NOTE(review): the de-duplicated frame is only displayed; the result is never
# assigned, so `investment_feature` still holds every investment row.
# The per-company row count computed further down is built from those rows, so
# re-check that feature before turning this into an assignment.
investment_feature.drop_duplicates(subset = ['company_permalink'])

In [None]:
company_investment = pd.merge(company,investment_feature, left_on = 'permalink', right_on = 'company_permalink', how = 'left') 

In [None]:
company_investment.info()

# New feature creation - number of investors

In [None]:
# Number of investment rows per company; the row count for each
# `company_permalink` is stored as 'Num_of_investors'.
num_of_invest = (
    company_investment
    .groupby('company_permalink')
    .size()
    .reset_index(name='Num_of_investors')
)

In [None]:
company_investment1 = pd.merge(company_investment,num_of_invest, left_on = 'permalink', right_on = 'company_permalink', how = 'left')# 66368 total row count


company_investment1.info()

In [None]:
final_company = company_investment1.drop_duplicates(subset = ['permalink'])

In [None]:
final_company.info()

# Add rounds dataset

In [None]:
rounds_feature = pd.read_csv("rounds.csv")

In [None]:
rounds_feature.info()

In [None]:
rounddf = rounds_feature.drop_duplicates(subset = ['company_permalink']) #66368

In [None]:
rounddf.info()

In [None]:
company_inv_round = pd.merge(final_company,rounddf, left_on = 'permalink', right_on = 'company_permalink', how = 'left')# 66368 total row count


company_inv_round.info()

In [None]:
pd. set_option('display.max_columns', 500)
company_inv_round.head()

In [None]:
# Columns removed from the merged frame: the `_x` / `_y` copies produced by the
# successive merges, the leftover join key, and the investor location fields.
investment_side_columns = [
    'company_permalink_x', 'company_name_x', 'company_country_code_x', 'company_state_code_x',
    'company_region_x', 'company_city_x', 'funding_round_permalink_x',
    'funding_round_type_x', 'funded_at_x', 'raised_amount_usd_x',
]
rounds_side_columns = [
    'company_permalink_y', 'company_name_y',
    'company_category_list_y', 'company_country_code_y', 'company_state_code_y', 'company_region_y',
    'company_city_y', 'funding_round_permalink_y', 'funding_round_code_y',
]
other_unused_columns = [
    'company_permalink',
    'investor_state_code', 'investor_region', 'investor_city', 'company_category_list_x',
]
final_df = company_inv_round.drop(
    columns=investment_side_columns + rounds_side_columns + other_unused_columns
)

In [None]:
final_df.info()

In [None]:
final_df.head(20)

# Convert funding_total_usd from string to numeric

In [None]:
final_df['funding_total_usd'] = pd.to_numeric(final_df['funding_total_usd'], errors='coerce')

# Fill null/NA values with the column mode for every feature that has missing values

In [None]:
# Impute missing values with each column's most frequent value.
# The filled column is assigned back instead of calling
# `final_df[col].fillna(..., inplace=True)`: that chained in-place call works on
# an intermediate Series and is not guaranteed to update `final_df` (it has no
# effect when pandas Copy-on-Write is enabled).
mode_imputed_columns = [
    'category_list',
    'country_code',
    'funding_rounds',
    'founded_at',
    'funding_round_type_y',
    'funding_round_code_x',
    'raised_amount_usd_y',
    'funding_total_usd',
    'Num_of_investors',
]
for column in mode_imputed_columns:
    final_df[column] = final_df[column].fillna(final_df[column].mode()[0])



In [None]:
final_df['status'].unique()

In [None]:
final_df.info()

# Remove operating status from the final dataset

In [None]:
# Remove the companies whose status is 'operating'.
# `.copy()` makes the filtered frame independent of the frame it was sliced
# from, so the column assignments in later cells modify `final_df` itself and
# do not trigger SettingWithCopyWarning.
final_df = final_df[final_df.status != 'operating'].copy()
final_df.info()

In [None]:
# NOTE(review): the zero-filled frame is only displayed; the result is not
# assigned, so `final_df` keeps whatever missing values it still has.
final_df.fillna(0)

In [None]:
final_df.loc[final_df['permalink'] == '/organization/goot']

In [None]:
final_df['Num_of_investors'].unique()

## ADD label

In [None]:
# Binary target: 1 when the status is 'ipo' or 'acquired', otherwise 0.
final_df['label'] = 0
final_df.loc[final_df.status.isin(['ipo', 'acquired']), 'label'] = 1

final_df.info()

## split data into train and test 

In [None]:
import numpy as np
from sklearn.model_selection import KFold

# Hold out the last of five contiguous (unshuffled) folds as the test set,
# i.e. roughly an 80/20 split.
# The previous loop walked through all five folds but overwrote the split
# variables on every pass, so only the final fold was ever used (and every
# index array was printed). Selecting that fold directly yields the same rows.
X = final_df.drop(columns=['label'])
y = final_df['label']
kf = KFold(n_splits=5)

train_index, test_index = list(kf.split(X))[-1]

# `.copy()` gives independent objects, so the feature columns added in later
# cells are written to X_train / X_test themselves rather than to slices of X.
X_train, X_test = X.iloc[train_index].copy(), X.iloc[test_index].copy()
y_train, y_test = y.iloc[train_index].copy(), y.iloc[test_index].copy()

## Work on train test and add new features, statistical analysis etc

# Replace null first-funding dates with the date of the last funding round

In [None]:
X_train[X_train['first_funding_at'].isnull()]

In [None]:
X_test[X_test['first_funding_at'].isnull()]

In [None]:
# Overwrite the first-funding date of one specific company in each split
# (presumably the rows found with a null date in the two cells above).
train_row_to_patch = X_train['permalink'] == "/organization/motionmetrics"
X_train['first_funding_at'] = X_train['first_funding_at'].mask(train_row_to_patch, "2014-09-01")

test_row_to_patch = X_test['permalink'] == "/organization/topicmarks"
X_test['first_funding_at'] = X_test['first_funding_at'].mask(test_row_to_patch, "2011-03-18")

# Replace the founded-at date with the first funding date where the founded-at date is abnormal

In [None]:
# Hand-corrected founding dates, keyed by company permalink.
founded_at_corrections = {
    "/organization/rent2cash-com": "2014-01-01",
    "/organization/livamp-2": "2014-09-21",
}
for company_permalink, corrected_date in founded_at_corrections.items():
    X_train['founded_at'] = X_train['founded_at'].mask(
        X_train['permalink'] == company_permalink, corrected_date
    )

# Scale total funding to thousands, millions and billions of USD

In [None]:
# Make sure the funding total is numeric (unparseable values become NaN), then
# express it in thousands, millions and billions of USD.
X_train['funding_total_usd'] = pd.to_numeric(X_train['funding_total_usd'], errors='coerce')

# The 'fundind_total_k$' spelling is kept on purpose: the rename cell further
# down uses the same spelling.
funding_total_scales = (
    ('fundind_total_k$', 1000),
    ('funding_total_m$', 1000000),
    ('funding_total_b$', 1000000000),
)
for scaled_column, divisor in funding_total_scales:
    X_train[scaled_column] = X_train['funding_total_usd'] / divisor

X_train.info()
X_train.head()

In [None]:
# Funding total of the test set in thousands, millions and billions of USD.
# The 'fundind_total_k$' spelling is kept on purpose (same name as in training).
funding_total_scales = (
    ('fundind_total_k$', 1000),
    ('funding_total_m$', 1000000),
    ('funding_total_b$', 1000000000),
)
for scaled_column, divisor in funding_total_scales:
    X_test[scaled_column] = X_test['funding_total_usd'] / divisor
#X_train.info()
#X_train.head()

# Adding new features to the dataset train and test

In [None]:
# Funding duration = time between the first and the last funding round.
# Dates that cannot be parsed become NaT, which yields NaN in the derived columns.
t1 = pd.to_datetime(X_train['first_funding_at'], errors='coerce')
t2 = pd.to_datetime(X_train['last_funding_at'], errors='coerce')

X_train['funding_duration'] = t2 - t1

# Whole days as a plain number (drops the "days" unit of the timedelta).
duration_in_days = X_train['funding_duration'].dt.days
X_train['funding_duration_days'] = duration_in_days

# Fractional years and months, so durations below one year / one month are
# not truncated to 0.
# NOTE(review): a year is taken as 364 days and a month as 30 days — confirm
# these approximations are intended.
X_train['funding_duration_year'] = duration_in_days / 364.0
X_train['funding_duration_month'] = duration_in_days / 30.0

In [None]:
# Same funding-duration features for the test set.
# Dates that cannot be parsed become NaT, which yields NaN in the derived columns.
t1 = pd.to_datetime(X_test['first_funding_at'], errors='coerce')
t2 = pd.to_datetime(X_test['last_funding_at'], errors='coerce')

X_test['funding_duration'] = t2 - t1

# Whole days as a plain number (drops the "days" unit of the timedelta).
duration_in_days = X_test['funding_duration'].dt.days
X_test['funding_duration_days'] = duration_in_days

# NOTE(review): a year is taken as 364 days and a month as 30 days — confirm
# these approximations are intended.
X_test['funding_duration_year'] = duration_in_days / 364.0
X_test['funding_duration_month'] = duration_in_days / 30.0

In [None]:
# Calendar years of the first and last funding rounds.
p1 = pd.DatetimeIndex(X_train['first_funding_at']).year
p2 = pd.DatetimeIndex(X_train['last_funding_at']).year

# 'Avg_duration_of_funding' is the difference between the two calendar years;
# 'Avg_funding_in_year' is half of that difference.
X_train['Avg_duration_of_funding'] = p2 - p1
temp = X_train['Avg_duration_of_funding'] / 2
X_train['Avg_funding_in_year'] = temp

X_train.head()

In [None]:
# Calendar years of the first and last funding rounds (test set).
p1 = pd.DatetimeIndex(X_test['first_funding_at']).year
p2 = pd.DatetimeIndex(X_test['last_funding_at']).year

# 'Avg_duration_of_funding' is the difference between the two calendar years;
# 'Avg_funding_in_year' is half of that difference.
X_test['Avg_duration_of_funding'] = p2 - p1
temp = X_test['Avg_duration_of_funding'] / 2
X_test['Avg_funding_in_year'] = temp

#df['funding_duration'] = df['first_funding_at'] - df['last_funding_at']
#df['first_funding_at_UTC'] = t1.astype(int)
#df['last_funding_at_UTC'] = t2.astype(int)
#filter_data.info()
#X_train.head()

In [None]:
# Split each '-'-separated date string (e.g. "2014-09-01") into three columns:
# year, month and day.
date_part_columns = {
    "founded_at": ["founded_at_year", "founded_at_month", "founded_at_day"],
    "first_funding_at": ["first_funding_year", "first_funding_month", "first_funding_day"],
    "last_funding_at": ["last_funding_year", "last_funding_month", "last_funding_day"],
    "funded_at_y": ["funded_year", "funded_month", "funded_day"],
}
for date_column, part_columns in date_part_columns.items():
    X_train[part_columns] = X_train[date_column].str.split("-", expand = True)

X_train.head()

In [None]:
# Split each '-'-separated date string (e.g. "2011-03-18") into three columns:
# year, month and day.
date_part_columns = {
    "founded_at": ["founded_at_year", "founded_at_month", "founded_at_day"],
    "first_funding_at": ["first_funding_year", "first_funding_month", "first_funding_day"],
    "last_funding_at": ["last_funding_year", "last_funding_month", "last_funding_day"],
    "funded_at_y": ["funded_year", "funded_month", "funded_day"],
}
for date_column, part_columns in date_part_columns.items():
    X_test[part_columns] = X_test[date_column].str.split("-", expand = True)

#X_train.head()

In [None]:
# The split date parts are strings; convert them to numbers.
# Anything that is not a valid number becomes NaN.
date_part_names = [
    'founded_at_day', 'founded_at_month', 'founded_at_year',
    'first_funding_day', 'first_funding_month', 'first_funding_year',
    'last_funding_day', 'last_funding_month', 'last_funding_year',
    'funded_year', 'funded_month', 'funded_day',
]
for part_name in date_part_names:
    X_train[part_name] = pd.to_numeric(X_train[part_name], errors='coerce')

In [None]:
# The split date parts are strings; convert them to numbers.
# Anything that is not a valid number becomes NaN.
date_part_names = [
    'founded_at_day', 'founded_at_month', 'founded_at_year',
    'first_funding_day', 'first_funding_month', 'first_funding_year',
    'last_funding_day', 'last_funding_month', 'last_funding_year',
    'funded_year', 'funded_month', 'funded_day',
]
for part_name in date_part_names:
    X_test[part_name] = pd.to_numeric(X_test[part_name], errors='coerce')

In [None]:
from datetime import date

today = date.today()
a1 =X_train['founded_at_year']

X_train['Age of Company'] = today.year - a1

X_train.head()

In [None]:
today = date.today()
a1 =X_test['founded_at_year']

X_test['Age of Company'] = today.year - a1

In [None]:
# Amount raised ('raised_amount_usd_y') in thousands, millions and billions of USD.
# Fix: 'raised_amount_k$' used to be derived from 'funding_total_usd', which
# made it a duplicate of 'fundind_total_k$'; it now uses the same source
# column as the other two raised-amount columns.
X_train['raised_amount_k$'] = X_train['raised_amount_usd_y']/1000
X_train['raised_amount_m$'] = X_train['raised_amount_usd_y']/1000000
X_train['raised_amount_b$'] = X_train['raised_amount_usd_y']/1000000000
X_train.info()
X_train.head()

In [None]:
# Amount raised ('raised_amount_usd_y') in thousands, millions and billions of USD.
# Fix: 'raised_amount_k$' used to be derived from 'funding_total_usd', which
# made it a duplicate of 'fundind_total_k$'; it now uses the same source
# column as the other two raised-amount columns.
X_test['raised_amount_k$'] = X_test['raised_amount_usd_y']/1000
X_test['raised_amount_m$'] = X_test['raised_amount_usd_y']/1000000
X_test['raised_amount_b$'] = X_test['raised_amount_usd_y']/1000000000

In [None]:
X_train[X_train['investor_name'].isnull()]


In [None]:
from datetime import datetime

# Company age in months, measured from the founding date to the day the
# notebook is executed (so the values change from run to run).
todaydate=datetime.today().date().strftime('%Y-%m-%d')
X_train['todaydate_conv'] = pd.to_datetime(todaydate)

# Founding dates that cannot be parsed become NaT and give a NaN age.
X_train['founded_at_conv'] = pd.to_datetime(X_train["founded_at"], errors='coerce')

# Current pandas refuses to divide by np.timedelta64(1, 'M') because a month is
# not a fixed-length unit. The age is therefore converted to days and divided
# by the mean Gregorian month length (365.2425 / 12 = 30.436875 days), which is
# the length numpy assigns to the 'M' unit.
DAYS_PER_MONTH = 365.2425 / 12
age_in_days = (X_train['todaydate_conv'] - X_train['founded_at_conv']) / np.timedelta64(1, 'D')
X_train['Age_of_company_month'] = age_in_days / DAYS_PER_MONTH

X_train.isnull().sum()  # shows how many values are missing per column, including Age_of_company_month


# create labels temporary to add new features based on label

In [None]:
combine = pd.concat([X_train, X_test], axis=0).reset_index(drop=True)

In [None]:
# Temporary binary label on the training features:
# 1 when the status is 'ipo' or 'acquired', otherwise 0.
X_train['label'] = 0
X_train.loc[X_train.status.isin(['ipo', 'acquired']), 'label'] = 1

X_train.info()

In [None]:
# Temporary binary label on the test features:
# 1 when the status is 'ipo' or 'acquired', otherwise 0.
X_test['label'] = 0
X_test.loc[X_test.status.isin(['ipo', 'acquired']), 'label'] = 1

In [None]:
category_split = X_train["category_list"].str.split("|", n = 1, expand = True)

#category_split = X_test["category_list"].str.split("|", n = 1, expand = True)
#print(category_split)
#print(category_split)
X_train["category_1"] = category_split[0]

#X_test["category_1"] = category_split[0]

X_train.head()
X_train.info()

In [None]:
category_split1 = X_test["category_list"].str.split("|", n = 1, expand = True)
X_test["category_1"] = category_split1[0]

In [None]:
X_train = X_train.rename(columns={'Domain': 'company domain', 'funding_round_code_x': 'funding_round_code',
                                    'funding_round_type_y': 'funding_round_type','raised_amount_usd_y': 'raised_amount_usd'})

In [None]:
X_test = X_test.rename(columns={'Domain': 'company domain', 'funding_round_code_x': 'funding_round_code',
                                    'funding_round_type_y': 'funding_round_type','raised_amount_usd_y': 'raised_amount_usd'})

In [None]:
X_train.head()

In [None]:
X_train['past_year'] = X_train['founded_at_year']-3

In [None]:
X_train.head(30)


### EDA on X_train

In [None]:
import seaborn as sns

# Distribution of company age in months, cropped to the 0-300 range.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale is its replacement.
ax = sns.histplot(X_train['Age_of_company_month'], kde=True, stat='density')
ax.set_xlim(0, 300);

In [None]:
import seaborn as sns

# Distribution of the funding duration in days, cropped to -1000..4000.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay on a density
# scale is its replacement.
ax = sns.histplot(X_train['funding_duration_days'], kde=True, stat='density')
ax.set_xlim(-1000, 4000);

In [None]:
variable=["first_funding_year","last_funding_year","founded_at_year"]

# One panel per year column. `sns.distplot` is deprecated, so each panel is
# drawn with `histplot` (KDE overlay, density scale) on an explicit Axes.
fig, axes = plt.subplots(1, 3, figsize=(17,3), dpi=100)
for ax, year_column in zip(axes, variable):
    ax.set_title("{}". format(year_column))
    sns.histplot(X_train[year_column], color="orange", kde=True, stat="density", ax=ax);

In [None]:
# Figure size
plt.figure(figsize=(6,6))

# Pie plot
X_train['label'].value_counts().plot.pie(explode=[0.1,0.1], autopct='%1.1f%%', shadow=True, textprops={'fontsize':16}).set_title("Status Distribution")

In [None]:
# Figure size
plt.figure(figsize=(10,4))
plt.xlim(0, 800)
# Histogram
sns.histplot(data=X_train, x='Age_of_company_month', hue='label', binwidth=10, kde=True)

# Aesthetics
plt.title('Age distribution')
plt.xlabel('Age (in months)')

In [None]:
#Age of Company
# Figure size
plt.figure(figsize=(10,4))
# plt.xlim(0, 800)
# Histogram
sns.histplot(data=X_train, x='Age of Company', hue='label', binwidth=10, kde=True)

# Aesthetics
plt.title('Age distribution')
plt.xlabel('Age (in years)')

In [None]:
plt.figure(figsize=(20,3),dpi=100)

# Panel 1: first vs. last funding year.
plt.subplot(1,3,1)
# seaborn >= 0.12 accepts the data vectors only as keywords (x=, y=).
# `palette` is dropped because it has no effect without a `hue` variable.
sns.scatterplot(x=X_train["first_funding_year"], y=X_train["last_funding_year"], label="first&last funding")
plt.legend()

# Panel 2: overlaid distributions of the three year columns.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay replaces it.
plt.subplot(1,3,2)
plt.xlim(1900, 2100)
year_series = (
    ("first_funding_year", "first_funding"),
    ("last_funding_year", "last_funding"),
    # Fix: this series used to be labelled "first_milestone" although it shows
    # the founding year.
    ("founded_at_year", "founded_at"),
)
for year_column, legend_label in year_series:
    sns.histplot(X_train[year_column], kde=True, stat="density", label=legend_label)
plt.xlabel("first_funding, last_funding, founded_at")
plt.legend()


plt.show()

In [None]:
!pip install plotly


In [None]:
import seaborn as sns
import plotly.graph_objs as go
# visualization styling
plt.style.use('ggplot')

# Number of training rows per state code, most frequent first.
state_counts = X_train.state_code.value_counts()

# US choropleth: one colour per state, scaled by the row count.
choropleth_trace = {
    'type': 'choropleth',
    'locationmode': 'USA-states',
    'locations': list(state_counts.index),
    'colorscale': 'Reds',
    'z': list(state_counts.values),
}
choropleth_map = go.Figure(
    data=choropleth_trace,
    layout={'geo': {'scope': 'usa'}},
)
choropleth_map.update_layout(title_text='1. State Code')
choropleth_map

In [None]:
plt.rcParams['mathtext.fontset'] = 'stix'
plt.rcParams['font.serif'] = ['Times New Roman']
plt.rcParams["figure.figsize"] = (14,5) 
plt.rcParams["figure.dpi"] = 130  

plt.title("8. category_code",{'fontsize':16,
        'fontweight':"bold"})

category = X_train['category_1'].value_counts()

plt.xticks(rotation=90)
sns.barplot(x=category.index,y=category.values,palette = 'ocean')

In [None]:
X_train.to_csv("X_train_datafile_generation_new_features_08.28.22.csv")

In [None]:
X_test.to_csv("X_test_datafile_generation_new_features_08.28.22.csv")

In [None]:
X_train1 = pd.read_csv("x_train_past_current_future_11.25.22.csv")

In [None]:
X_test1 = pd.read_csv("x_test_with_current_future_past_11.15.22.csv")

In [None]:
X_train1.info()

In [None]:
X_test1.info()

In [None]:
# Column renames for the re-loaded training file: the scaled-amount columns get
# their '$' suffix and the investor statistics get descriptive names.
# Keys that are not present in the frame are ignored by `rename`.
train_column_renames = {
    'fundind_total_k': 'fundind_total_k$',
    'funding_total_m': 'funding_total_m$',
    'funding_total_b': 'funding_total_b$',
    'Sum_RaisedAmt_Label1': 'sum_investor_raisedAmt_sucess',
    'Sum_RaisedAmt_Label0': 'sum_investor_raisedAmt_failure',
    'TotalLabel1': 'no_of_successful_comp',
    'TotalLabel0': 'no_of_failed_comp',
    'TotalRecords': 'total_companies',
    'SuccessRate_by_company': 'percentage_of_success_rate_by_comp',
    'FailureRate': 'percentage_of_failure_rate_by_comp',
    'SuccessByAmount': 'percentage_of_success_rate_by_USD',
    'raised_amount_k': 'raised_amount_k$',
    'raised_amount_m': 'raised_amount_m$',
    'raised_amount_b': 'raised_amount_b$',
}
X_train1.rename(columns=train_column_renames, inplace=True)

In [None]:
# Column renames for the re-loaded test file: the scaled-amount columns get
# their '$' suffix and the investor statistics get descriptive names.
# Keys that are not present in the frame are ignored by `rename`.
test_column_renames = {
    'fundind_total_k': 'fundind_total_k$',
    'funding_total_m': 'funding_total_m$',
    'funding_total_b': 'funding_total_b$',
    'Sum_RaisedAmt_Label1': 'sum_investor_raisedAmt_sucess',
    'Sum_RaisedAmt_Label0': 'sum_investor_raisedAmt_failure',
    'TotalLabel1': 'no_of_successful_comp',
    'TotalLabel0': 'no_of_failed_comp',
    'TotalRecords': 'total_companies',
    'SuccessRate_by_company': 'percentage_of_success_rate_by_comp',
    'FailureRate': 'percentage_of_failure_rate_by_comp',
    'SuccessByAmount': 'percentage_of_success_rate_by_USD',
    'raised_amount_k': 'raised_amount_k$',
    'raised_amount_m': 'raised_amount_m$',
    'raised_amount_b': 'raised_amount_b$',
}
X_test1.rename(columns=test_column_renames, inplace=True)

In [None]:
X_train1.info()

In [None]:
# Columns removed from the re-loaded training frame before the category-count
# features are built. ('state_code' appeared twice in the earlier list; listing
# it once drops the same column.)
# NOTE(review): this list removes 'StartYear' whereas the test-set cell removes
# 'EndYear' — confirm both frames end up with the same columns.
train_columns_to_drop = [
    'name', 'homepage_url', 'status', 'state_code',
    'funded_at_y', 'first_funding_at', 'last_funding_at', 'funding_duration',
    'investor_country_code', 'region', 'city', 'FlagForCat1_Past',
    'FlagForCat1_Current', 'FlagForCat1_Future', 'CountofCompanies', 'category_list',
    'investor_permalink', 'founded_at', 'Previous_2Year', 'Next_2Year', 'Past_3Year', 'Future_2Year', 'StartYear',
    'Current_CountOfTop15', 'Future_CountOfTop15', 'Past_CountOfTop15',
]
X_train_new = X_train1.drop(columns=train_columns_to_drop)

X_train_new.info()  #10668

In [None]:
# Columns removed from the re-loaded test frame before the category-count
# features are built. ('state_code' appeared twice in the earlier list; listing
# it once drops the same column.)
# NOTE(review): this list removes 'EndYear' whereas the training-set cell
# removes 'StartYear' — confirm both frames end up with the same columns.
test_columns_to_drop = [
    'name', 'homepage_url', 'status', 'state_code',
    'funded_at_y', 'first_funding_at', 'last_funding_at', 'funding_duration',
    'investor_country_code', 'region', 'city', 'FlagForCat1_Past',
    'FlagForCat1_Current', 'FlagForCat1_Future', 'CountofCompanies', 'category_list',
    'investor_permalink', 'founded_at', 'Previous_2Year', 'Next_2Year', 'Past_3Year', 'Future_2Year', 'EndYear',
    'Current_CountOfTop15', 'Future_CountOfTop15', 'Past_CountOfTop15',
]
X_test_new = X_test1.drop(columns=test_columns_to_drop)

X_test_new.info()  #2666

## Creating past current and future count of categories

In [None]:
## Past count of categories
# For every training row: number of training rows with the same 'category_1'
# whose 'founded_at_year' is at least three years earlier.
#
# The previous version filtered the whole frame once per row (O(n^2)). Here the
# founding years of each category are sorted once and every row is answered
# with a binary search. Rows with a missing category or a missing founding
# year get 0, which is what the element-wise comparisons produced before.
sp = X_train_new.shape[0]

# category -> ascending array of the known founding years in that category
years_by_category = {
    category: np.sort(years.dropna().to_numpy())
    for category, years in X_train_new.groupby('category_1')['founded_at_year']
}

arr = [
    # side='right' counts the years that are <= (year - 3)
    int(np.searchsorted(years_by_category[category], year - 3, side='right'))
    if category in years_by_category and pd.notna(year)
    else 0
    for category, year in zip(X_train_new['category_1'], X_train_new['founded_at_year'])
]

In [None]:
len(arr)


In [None]:
df = pd.DataFrame(arr, columns =['past_count'])
df

In [None]:
df.reset_index(drop=True)
X_train_new.reset_index(drop=True)

In [None]:
X_train_new.index = df.index
#df2 =pd.concat([X_train, df], axis = 1)
train_set = pd.concat([X_train_new.reset_index(drop=True), df.reset_index(drop=True)], axis=1)

In [None]:
train_set

## Create Future Count of category for train set

In [None]:
## Future count of categories
# For every training row: number of training rows with the same 'category_1'
# whose 'founded_at_year' is at least three years later.
#
# The previous version filtered the whole frame once per row (O(n^2)). Here the
# founding years of each category are sorted once and every row is answered
# with a binary search. Rows with a missing category or a missing founding
# year get 0, which is what the element-wise comparisons produced before.
sf1 = train_set.shape[0]

# category -> ascending array of the known founding years in that category
years_by_category = {
    category: np.sort(years.dropna().to_numpy())
    for category, years in train_set.groupby('category_1')['founded_at_year']
}

arr2 = [
    # side='left' gives the number of years < (year + 3); the rest are >= it
    int(len(years_by_category[category])
        - np.searchsorted(years_by_category[category], year + 3, side='left'))
    if category in years_by_category and pd.notna(year)
    else 0
    for category, year in zip(train_set['category_1'], train_set['founded_at_year'])
]

In [None]:
len(arr2)

In [None]:
train_df_future = pd.DataFrame(arr2, columns =['future_count'])
train_df_future

In [None]:
pd.set_option('display.max_columns', None)
train_df_future.reset_index(drop=True)
train_set.reset_index(drop=True)

In [None]:
pd.set_option('display.max_columns', None)
train_df_future.index = train_set.index
#df2 =pd.concat([X_train, df], axis = 1)
train_past_future_current = pd.concat([train_set.reset_index(drop=True), train_df_future.reset_index(drop=True)], axis=1)
train_past_future_current

## Past and Future test count

In [None]:
## Future count of categories
# For every test row: number of test rows with the same 'category_1' whose
# 'founded_at_year' is at least three years later.
#
# The previous version filtered the whole frame once per row (O(n^2)). Here the
# founding years of each category are sorted once and every row is answered
# with a binary search. Rows with a missing category or a missing founding
# year get 0, which is what the element-wise comparisons produced before.
sf1 = X_test_new.shape[0]

# category -> ascending array of the known founding years in that category
years_by_category = {
    category: np.sort(years.dropna().to_numpy())
    for category, years in X_test_new.groupby('category_1')['founded_at_year']
}

arr3 = [
    # side='left' gives the number of years < (year + 3); the rest are >= it
    int(len(years_by_category[category])
        - np.searchsorted(years_by_category[category], year + 3, side='left'))
    if category in years_by_category and pd.notna(year)
    else 0
    for category, year in zip(X_test_new['category_1'], X_test_new['founded_at_year'])
]

In [None]:
len(arr3)

In [None]:
test_df = pd.DataFrame(arr3, columns =['future_count'])
test_df

In [None]:
pd.set_option('display.max_columns', None)
test_df.reset_index(drop=True)
X_test_new.reset_index(drop=True)

In [None]:
X_test_new.index = test_df.index
#df2 =pd.concat([X_train, df], axis = 1)
test_set = pd.concat([X_test_new.reset_index(drop=True), test_df.reset_index(drop=True)], axis=1)

In [None]:
pd.set_option('display.max_columns', None)
test_set

## Past count for test set

In [None]:
## past count of categories
# For every test row: number of test rows with the same 'category_1' whose
# 'founded_at_year' is at least three years earlier.
#
# The previous version filtered the whole frame once per row (O(n^2)). Here the
# founding years of each category are sorted once and every row is answered
# with a binary search. Rows with a missing category or a missing founding
# year get 0, which is what the element-wise comparisons produced before.
spast = test_set.shape[0]

# category -> ascending array of the known founding years in that category
years_by_category = {
    category: np.sort(years.dropna().to_numpy())
    for category, years in test_set.groupby('category_1')['founded_at_year']
}

arr_p = [
    # side='right' counts the years that are <= (year - 3)
    int(np.searchsorted(years_by_category[category], year - 3, side='right'))
    if category in years_by_category and pd.notna(year)
    else 0
    for category, year in zip(test_set['category_1'], test_set['founded_at_year'])
]

In [None]:
len(arr_p)

In [None]:
test_past = pd.DataFrame(arr_p, columns =['past_count'])
test_past

In [None]:
pd.set_option('display.max_columns', None)
test_past.reset_index(drop=True)
test_set.reset_index(drop=True)

In [None]:
test_past.index = test_set.index
#df2 =pd.concat([X_train, df], axis = 1)
test_past_future_current = pd.concat([test_set.reset_index(drop=True), test_past.reset_index(drop=True)], axis=1)
test_past_future_current

In [None]:
# Keep the six most frequent country codes of the training set and bucket all
# remaining codes under 'other countries'.
train_past_future_current['country_code'] = train_past_future_current['country_code'].astype('category')

# `others` (the codes ranked 7th and lower) and `label1` are reused by the
# test-set cell below, so both names are kept.
others = train_past_future_current['country_code'].value_counts().index[6:]
label1 = 'other countries'

# The bucketing is done on plain values and converted back to a categorical:
# `Series.replace` on a categorical column is deprecated in recent pandas.
# Missing codes are left untouched.
train_country = train_past_future_current['country_code'].astype(object)
train_past_future_current['country_code'] = (
    train_country.where(~train_country.isin(others), label1).astype('category')
)



In [None]:
# Apply the training-set bucketing to the test set's own country codes.
# Fix: the test column used to be overwritten with the (index-aligned)
# training column, so the test rows carried training countries.
#
# Any code that does not occur in the bucketed training column, including
# codes never seen in training, is mapped to `label1`; missing codes stay
# missing.
kept_countries = list(train_past_future_current['country_code'].dropna().unique())
test_country = test_past_future_current['country_code'].astype(object)
test_past_future_current['country_code'] = (
    test_country
    .where(test_country.isin(kept_countries) | test_country.isna(), label1)
    .astype('category')
)


In [None]:
 train_past_future_current.country_code.unique()

In [None]:
 test_past_future_current.country_code.unique()

In [None]:
#X_test['country_code'] = X_train['country_code'].cat.add_categories([label1])
#X_test['country_code'] = X_train['country_code'].replace(others, label1)

In [None]:
#X_test_new.country_code.unique()

#  Keep the Top 4 domain and put the rest in Others

In [None]:
# Keep the four most frequent company domains of the training set and bucket
# all remaining domains under 'Others'.
train_past_future_current['company_domain'] = train_past_future_current['company_domain'].astype('category')

# `others1` holds the domains ranked 5th and lower.
others1 = train_past_future_current['company_domain'].value_counts().index[4:]
label3 = 'Others'

# The bucketing is done on plain values and converted back to a categorical:
# `Series.replace` on a categorical column is deprecated in recent pandas.
# Missing domains are left untouched.
train_domain = train_past_future_current['company_domain'].astype(object)
train_past_future_current['company_domain'] = (
    train_domain.where(~train_domain.isin(others1), label3).astype('category')
)

In [None]:
# Apply the training-set bucketing to the test set's own company domains.
# Fixes:
#   * the test column used to be overwritten with the (index-aligned) training
#     column;
#   * the replacement used the country variables (`others`, `label1`) instead
#     of the domain ones.
#
# Any domain that does not occur in the bucketed training column is mapped to
# `label3`; missing domains stay missing.
kept_domains = list(train_past_future_current['company_domain'].dropna().unique())
test_domain = test_past_future_current['company_domain'].astype(object)
test_past_future_current['company_domain'] = (
    test_domain
    .where(test_domain.isin(kept_domains) | test_domain.isna(), label3)
    .astype('category')
)

In [None]:
train_past_future_current.company_domain.unique()

In [None]:
test_past_future_current.company_domain.unique()

## Investor Names information ****************

## Get the most frequently occurring investor names and convert them to one-hot encoded values

In [None]:
# Cast the investor names to str while keeping missing entries missing
# (`astype(str)` alone would turn them into the literal string 'nan').
# `np.nan` is used instead of the `np.NaN` alias, which NumPy 2.0 removed.
null_cells = train_past_future_current["investor_name"].isnull()
train_past_future_current["investor_name"] = train_past_future_current["investor_name"].astype(str).mask(null_cells, np.nan)

In [None]:
# Placeholder for companies without an investor name. The filled column is
# assigned back: `df[col].fillna(..., inplace=True)` works on an intermediate
# Series and is not guaranteed to update the frame.
train_past_future_current["investor_name"] = train_past_future_current["investor_name"].fillna('no_investor')

In [None]:
most_common_investor = Counter(" ".join(train_past_future_current["investor_name"].str.lower()).split()).most_common(25)
most_common_investor

## create a list of most common investor names and add the rest to 'others'

In [None]:
toxx = ['Ventures','Capital','Partners','Fund','Venture','Group','Investment','Management','Technology','Equity','Angel','Startups','Bank']

In [None]:
# Keep an investor name only when it contains one of the keywords in `toxx`
# (case-sensitive substring match); every other name becomes 'other_investors'.
#
# This replaces the former row loop, which hard-coded the number of rows
# (10668), looked rows up by index label, and wrote through chained indexing
# (`df[col][i] = ...`), a form pandas does not guarantee to write back.
has_keyword = train_past_future_current['investor_name'].map(
    lambda investor: any(ext in investor for ext in toxx)
)
train_past_future_current['investor_name'] = (
    train_past_future_current['investor_name'].where(has_keyword, 'other_investors')
)

In [None]:
# Cast the TEST investor names to str while keeping missing entries missing.
# Fix: this cell used to process the training frame a second time (copy-paste),
# so the test column never received the cast, although the cells that follow
# work on the test frame.
# `np.nan` is used instead of the `np.NaN` alias, which NumPy 2.0 removed.
null_cells = test_past_future_current["investor_name"].isnull()
test_past_future_current["investor_name"] = test_past_future_current["investor_name"].astype(str).mask(null_cells, np.nan)

In [None]:
# Placeholder for companies without an investor name. The filled column is
# assigned back: `df[col].fillna(..., inplace=True)` works on an intermediate
# Series and is not guaranteed to update the frame.
test_past_future_current["investor_name"] = test_past_future_current["investor_name"].fillna('no_investor')

In [None]:
# Keep an investor name only when it contains one of the keywords in `toxx`
# (case-sensitive substring match); every other name becomes 'other_investors'.
#
# This replaces the former row loop, which hard-coded the number of rows
# (2666), looked rows up by index label, and wrote through chained indexing
# (`df[col][i] = ...`), a form pandas does not guarantee to write back.
has_keyword = test_past_future_current['investor_name'].map(
    lambda investor: any(ext in investor for ext in toxx)
)
test_past_future_current['investor_name'] = (
    test_past_future_current['investor_name'].where(has_keyword, 'other_investors')
)

In [None]:
train_past_future_current['investor_name'].head(45)

In [None]:
train_past_future_current['investor_name'].unique()

In [None]:
train_past_future_current.info()

In [None]:
test_past_future_current.info()

## one hot encoded investor names

In [None]:

# ############################# to convert most common investor names into one hot encoded columns ########################



# Lower-case investor keywords plus the 'other_investors' bucket; one indicator
# column is created per entry. `toxic` is reused by the test-set cell.
toxic = ['ventures','capital','partners','fund','venture','group','investment','management','technology','equity','angel','startups','bank','other_investors']

#df = pd.DataFrame({'text':['You look horrible','You are good','you are bad and disguisting']})

# Concatenating with an empty frame whose columns are the keywords adds those
# columns (all NaN) to the data; ignore_index=True renumbers the rows 0..n-1.
train = pd.concat([train_past_future_current,pd.DataFrame(columns=toxic)],ignore_index=True, sort=False)
# NOTE(review): the result of this call is discarded, so it has no effect.
train.replace(np.nan, 0.0)

samp_col = train['investor_name'].str.lower()

# For every row: the keywords that occur as whole whitespace-separated tokens
# of the lower-cased investor name.
samp = samp_col.str.split().apply(lambda x : [i for i in toxic if i in x])

# Set the indicator to 1.0 for each matched keyword. The position `i` from
# enumerate is used as the row label, which matches because the index was
# renumbered above.
for i,j in enumerate(samp):
    for k in j:
        train.loc[i,k] = 1.0


# NOTE(review): not the last expression of the cell, so nothing is displayed.
train.head(40)

# main[['ventures','capital','partners','fund','venture','group','investment','management','technology','equity','angel','startups','bank','other_investors']] = main[['ventures','capital','partners','fund','venture','group','investment','management','technology','equity','angel','startups','bank','other_investors']].cat.add_categories('Null')
# main['categorical_column'].fillna('Null', inplace=True)

# Replaces every remaining NaN in the WHOLE frame with 0.0 — the unmatched
# indicator cells, but also missing values in all other columns.
train.replace(np.nan, 0.0, inplace = True)

In [None]:
# pd. set_option('display.max_columns', 500)
# train.head(50)

In [None]:
train.info()

## test set one hot encoded investor names

In [None]:
#toxic = ['ventures','capital','partners','fund','venture','group','investment','management','technology','equity','angel','startups','bank','other_investors']



# Concatenating with an empty frame whose columns are the keywords in `toxic`
# adds those columns (all NaN) to the test data; ignore_index=True renumbers
# the rows 0..n-1.
test = pd.concat([test_past_future_current,pd.DataFrame(columns=toxic)],ignore_index=True, sort=False)
# NOTE(review): the result of this call is discarded, so it has no effect.
test.replace(np.nan, 0.0)

samp_cols = test['investor_name'].str.lower()

# For every row: the keywords that occur as whole whitespace-separated tokens
# of the lower-cased investor name.
samps = samp_cols.str.split().apply(lambda x : [i for i in toxic if i in x])

# Set the indicator to 1.0 for each matched keyword. The position `i` from
# enumerate is used as the row label, which matches because the index was
# renumbered above.
for i,j in enumerate(samps):
    for k in j:
        test.loc[i,k] = 1.0

# Replaces every remaining NaN in the WHOLE frame with 0.0 — the unmatched
# indicator cells, but also missing values in all other columns.
test.replace(np.nan, 0.0, inplace = True)

In [None]:
test.info()

# tokenize category to fit into the model

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
#from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics 
from sklearn import ensemble
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, roc_auc_score, auc, f1_score
from scipy import interp
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
%matplotlib inline 

# X = main.drop(columns = ['label'])
# y = main['label']


# Tokenize category and count occurrences using CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Word-level bag-of-words encoder for `category_1`, with English stop words removed.
vectorizer = CountVectorizer(analyzer='word', stop_words='english')

# The vocabulary is learned from the training rows only; the test rows are
# encoded with that same vocabulary.
train_1 = vectorizer.fit_transform(train['category_1'])
test_1 = vectorizer.transform(test['category_1'])

# Dense count matrices as DataFrames, one column per vocabulary term.
train_x = pd.DataFrame(data=train_1.toarray(), columns=vectorizer.get_feature_names())
test_x = pd.DataFrame(data=test_1.toarray(), columns=vectorizer.get_feature_names())

In [None]:
# Print the column / non-null count / dtype summary of the training token counts.
train_x.info()

In [None]:
# Print the column / non-null count / dtype summary of the test token counts.
test_x.info()

In [None]:
# Renumber `train` rows 0..n-1 so they pair up with the freshly built
# (0..n-1 indexed) token-count frame when joined column-wise.
train.reset_index(inplace=True, drop=True)
res = pd.concat([train, train_x], axis=1)

# Identifier / raw-text columns are not used as model features.
X_train_res = res.drop(['permalink', 'investor_name', 'category_1'], axis=1)

In [None]:
# Show the first rows of the assembled training frame.
X_train_res.head()

In [None]:
# Renumber `test` rows 0..n-1 so they pair up with the token-count frame
# when joined column-wise.
test.reset_index(inplace=True, drop=True)
reset = pd.concat([test, test_x], axis=1)

# Drop identifier / raw-text columns and the target column from the test features.
X_test_res = reset.drop(['permalink', 'investor_name', 'category_1', 'label'], axis=1)

In [None]:
# Print the column / non-null count / dtype summary of the test feature frame.
X_test_res.info()

In [None]:
# Print the column / non-null count / dtype summary of the training frame (still includes `label`).
X_train_res.info()

In [None]:
# Target: the `label` column cast to int, as an independent copy.
y = X_train_res['label'].astype(int).copy()
# Features: everything except `label`, copied so later in-place edits to X
# do not touch X_train_res.
X = X_train_res.drop(columns='label').copy()

## One-hot encode country_code, company_domain, funding_round_code and funding_round_type

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Categorical columns that are replaced by one-hot indicator columns.
categorical_cols = ['country_code','company_domain','funding_round_code','funding_round_type']

# handle_unknown='ignore': a category that appears only in the test rows is
# encoded as an all-zero row for that feature instead of making transform()
# raise, since the encoder is fitted on the training rows only.
encoder=OneHotEncoder(sparse=False, handle_unknown='ignore')

# Fit on the training features, then reuse the fitted encoder for the test features
# so both frames get the same indicator columns in the same order.
train_X_encoded = pd.DataFrame(encoder.fit_transform(X[categorical_cols]))
train_X_encoded.columns = encoder.get_feature_names(categorical_cols)

test_X_encoded = pd.DataFrame(encoder.transform(X_test_res[categorical_cols]))
test_X_encoded.columns = encoder.get_feature_names(categorical_cols)

# Swap the raw categorical columns for their indicator columns. The encoded
# frames carry a fresh 0..n-1 index, so the column-wise concat relies on X and
# X_test_res also being indexed 0..n-1.
X_test_res.drop(categorical_cols, axis=1, inplace=True)
OH_X_test = pd.concat([X_test_res, test_X_encoded], axis=1)

X.drop(categorical_cols, axis=1, inplace=True)
OH_X_train = pd.concat([X, train_X_encoded], axis=1)

In [None]:
# Print the column / non-null count / dtype summary of the one-hot-encoded test features.
OH_X_test.info()

In [None]:
# Print the column / non-null count / dtype summary of the one-hot-encoded training features.
OH_X_train.info()

In [None]:
# From here on `X` refers to the one-hot-encoded training features.
X = OH_X_train

In [None]:
# From here on `X_test` refers to the one-hot-encoded test features.
X_test = OH_X_test

In [None]:
# Keep only the first occurrence of every column name in X; later columns
# that repeat an existing name are dropped.
X = X.loc[:,~X.columns.duplicated()].copy()

In [None]:
# Keep only the first occurrence of every column name in X_test; later columns
# that repeat an existing name are dropped.
X_test = X_test.loc[:,~X_test.columns.duplicated()].copy()

In [None]:
# Stack the training rows on top of the test rows and renumber rows 0..n-1.
# Columns present in only one of the two frames are filled with NaN by concat.
combine_X = pd.concat([X, X_test], axis=0).reset_index(drop=True)
#df = df.loc[~df.index.duplicated(keep='first')]

In [None]:
# Print the column / non-null count / dtype summary of the combined feature frame.
combine_X.info()

In [None]:
# Stack the training labels `y` on top of `y_test` and renumber rows 0..n-1.
# NOTE(review): `y_test` is not assigned in the nearby cells before this one, and the
# cross-validation cells further down rebind it to a single fold's labels — confirm
# that this cell runs while `y_test` still holds the full test-set labels.
combine_y = pd.concat([y, y_test], axis=0).reset_index(drop=True)

In [None]:
# Model on the combined (train + test) rows from here on.
X = combine_X
# Keep y as the label Series itself. Wrapping it in a list (`[combine_y]`) gives a
# one-element list, which breaks train_test_split (length mismatch with X) and
# every later `y.iloc[...]` lookup.
y = combine_y

## Sklearn model selection

In [None]:
# Sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score
from sklearn.metrics import roc_auc_score, plot_confusion_matrix, plot_roc_curve, roc_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
##import eli5
#from eli5.sklearn import PermutationImportance
from sklearn.utils import resample

# Models
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier

## Train test split 

In [None]:
# Train-validation split: 80% / 20%, stratified on y so both parts keep the
# same class proportions; random_state fixes the shuffle for reproducibility.
X_train, X_valid, y_train, y_valid = train_test_split(X,y,stratify=y,train_size=0.8,test_size=0.2,random_state=0)

## Prepare configuration for cross validation test harness models

In [None]:
# prepare configuration for cross validation test harness
seed = 7  # NOTE(review): not read anywhere in this cell; kept in case another cell uses it

# Candidate classifiers as (display name, unfitted estimator) pairs.
models = [
    ('LR', LogisticRegression(random_state = 42, max_iter = 10**6)),
    #('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier(n_neighbors = 91)),
    ('DT', DecisionTreeClassifier(random_state=42)),
    #('NB', GaussianNB()),
    ('XGBoost', XGBClassifier(use_label_encoder =False, eval_metric='mlogloss')),
    ('AdaBoost', AdaBoostClassifier(n_estimators=100, random_state=0)),
    ('RFC-100',RandomForestClassifier(n_estimators = 100,random_state=521)),
    ('RFC-200',RandomForestClassifier(n_estimators = 200,random_state=521)),
]

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'

# The splitter does not depend on the model, so it is built once. With an
# integer random_state every model is scored on the same 5x5 = 25 splits.
kfold = model_selection.RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # name: mean accuracy (standard deviation) over the 25 splits
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# boxplot algorithm comparison: one box of per-split accuracies per model
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.grid()
ax.set_xticklabels(names)
plt.show()

In [None]:
# Write the comparison figure held in `fig` to a PNG in the working directory.
fig.savefig('Algorithm_comparison.png')

## Method 1 for model testing - using confusion matrix, accuracy and ROC curve

## Method 2 for modeling and building predictions

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics 
from sklearn import ensemble
import matplotlib.pyplot as plt
import seaborn as sns
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, roc_auc_score, auc, f1_score
from scipy import interp
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
%matplotlib inline 

In [None]:
############################LR model ##########################################

from sklearn.model_selection import KFold 
# Repeated k-fold: 10 folds x 5 repeats = 50 train/validation splits.
kf = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)
accuracy_model_1=[]

for train_index, test_index in kf.split(X):
    # kf.split yields positional row numbers, so rows must be taken with .iloc;
    # X[train_index] on a DataFrame is a column-label lookup.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # A fresh model is fitted for every split.
    model_1 = LogisticRegression(random_state=42, max_iter= 10**4)
    model_1.fit(X_train, y_train)
    y1_pred = model_1.predict(X_test)
    accuracy_model_1.append(accuracy_score(y_test, y1_pred))

print(accuracy_model_1)
print ("Avg accuracy for LR ", np.array(accuracy_model_1).mean())

# Everything below uses y_test / y1_pred as left by the loop, i.e. the
# validation rows and predictions of the LAST split only.
cm = confusion_matrix(y_test, y1_pred)
sns.heatmap(cm, annot = True, fmt = 'g')
plt.show()

print(classification_report(y_test,y1_pred))
# NOTE: roc_curve receives hard class predictions here, not probability scores.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test,y1_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("roc_auc",roc_auc)
 

In [None]:
############################ RF model 100 ##########################################

from sklearn.model_selection import KFold 
# Repeated k-fold: 10 folds x 5 repeats = 50 train/validation splits.
kf = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)
accuracy_model_2=[]

for train_index, test_index in kf.split(X):
    # kf.split yields positional row numbers, so rows must be taken with .iloc;
    # X[train_index] on a DataFrame is a column-label lookup.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # A fresh 100-tree forest is fitted for every split.
    model_2 = RandomForestClassifier(n_estimators=100, random_state=521)
    model_2.fit(X_train, y_train)
    y2_pred = model_2.predict(X_test)
    accuracy_model_2.append(accuracy_score(y_test, y2_pred))

print(accuracy_model_2)

# Everything below uses y_test / y2_pred as left by the loop, i.e. the
# validation rows and predictions of the LAST split only.
cm = confusion_matrix(y_test, y2_pred)
sns.heatmap(cm, annot = True, fmt = 'g')
plt.show()

print(classification_report(y_test,y2_pred))
print ("Avg accuracy for RF ", np.array(accuracy_model_2).mean())
# NOTE: roc_curve receives hard class predictions here, not probability scores.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test,y2_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("roc_auc",roc_auc)

In [None]:
from sklearn.model_selection import KFold 
kf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)
accuracy_model_ft=[]

# Only the forest fitted on the final split survives for the feature-importance
# cells, and nothing is scored per split. Fitting once on that final split gives
# the same model (same rows, same random_state) without training the 24 forests
# that were previously discarded.
train_index, test_index = list(kf.split(X))[-1]
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = y.iloc[train_index], y.iloc[test_index]

model_ft = RandomForestClassifier(n_estimators=200, random_state=521).fit(X_train, y_train)
    
    

# Feature Importance Score

In [None]:
import numpy as np

# Importance of each feature according to the fitted forest `model_ft`.
importances = model_ft.feature_importances_

# Feature positions ordered from most to least important.
sorted_indices = np.argsort(importances)[::-1]

# One label per feature, in the same order as the importances. The earlier
# `combine_X.columns[1:]` skipped the first column, which shifted every label
# by one and made the last position run past the end of the labels.
feat_labels = combine_X.columns

# Ranked listing: rank) feature name (padded to 30 chars) importance
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30,
                            feat_labels[sorted_indices[f]],
                            importances[sorted_indices[f]]))

In [None]:
# Table of (feature name, importance) taken from the fitted forest `model_ft`,
# sorted from most to least important.
importances = pd.DataFrame(data={
    'Attribute': combine_X.columns,
    'Importance': model_ft.feature_importances_
})
importances = importances.sort_values(by='Importance', ascending=False)

# One bar per feature. The values come from the forest's feature_importances_,
# not from model coefficients, so the title says so.
plt.bar(x=importances['Attribute'], height=importances['Importance'], color='#087E8B')
plt.title('Feature importances obtained from the random forest', size=20)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Importances of `model_ft`, labelled with the feature (column) names.
feat_importances = pd.Series(data=model_ft.feature_importances_, index=combine_X.columns)

# The 40 highest-importance features, largest first.
feature_n = feat_importances.nlargest(n=40)

In [None]:
# Turn the Series in `feature_n` into a one-column DataFrame (feature names stay as the index).
df_feat =feature_n.to_frame()

In [None]:
# Show the index of df_feat.
df_feat.index

# Random Forest Tree

In [None]:
# A deliberately small forest (10 trees, depth <= 3).
# It is fitted on whatever X_train / y_train currently hold, i.e. the split left
# behind by the most recently executed cell that assigned them.
model_tree = RandomForestClassifier(max_depth = 3, n_estimators=10, random_state=521)
model_tree.fit(X_train, y_train)

In [None]:
# Number of fitted trees in the small forest.
len(model_tree.estimators_)

In [None]:
# Imported here because no nearby cell binds the name `tree`
# (only individual classes are imported from sklearn.tree).
from sklearn import tree

# Draw the first tree of the small forest; feature_names makes each split show
# the column name of the frame the forest was fitted on.
plt.figure(figsize=(20,10))
_ = tree.plot_tree(model_tree.estimators_[0], feature_names=list(X_train.columns),
                   filled=True, fontsize=11)

In [None]:
from sklearn.metrics import f1_score, make_scorer
# Custom scorer for cross validation: macro-averaged F1, where a higher score is better.
scorer = make_scorer(f1_score, greater_is_better=True, average = 'macro')

In [None]:
from sklearn.feature_selection import RFECV

# Create a model for feature selection: a 100-tree forest with depth <= 3,
# using all available cores (n_jobs = -1).
estimator = RandomForestClassifier(random_state = 10, n_estimators = 100, max_depth = 3,  n_jobs = -1)

# Create the object: recursive feature elimination that removes one feature per
# step (step = 1) and scores each candidate subset with 3-fold cross validation
# using `scorer`.
selector = RFECV(estimator, step = 1, cv = 3, scoring= scorer, n_jobs = -1)

In [None]:
# Run the recursive feature elimination on the current X_train / y_train.
selector.fit(X_train, y_train)

In [None]:
# Pair every feature name with its RFECV rank and list the best-ranked first
# (selected features have rank 1).
rankings = pd.DataFrame({'feature': list(X_train.columns), 'rank': list(selector.ranking_)}).sort_values('rank')
rankings.head(10)

In [None]:
# Reduce both frames to the features the selector kept.
train_selected = selector.transform(X_train)
test_selected = selector.transform(X_test)

In [None]:
# Names of the features the selector kept (rank 1).
kept_mask = selector.ranking_ == 1
selected_features = X_train.columns[kept_mask]

# Re-wrap the reduced matrices as DataFrames labelled with the kept feature names.
train_selected = pd.DataFrame(data=train_selected, columns=selected_features)
test_selected = pd.DataFrame(data=test_selected, columns=selected_features)

In [None]:
# Show the names of the selected features.
selected_features

In [None]:
#combine_X.columns[300:]

In [None]:
import pandas as pd

# Hard-coded values (plotted as percentages) for three feature categories,
# one row per top-N cut-off.
plotdata = pd.DataFrame(
    {
        "Investment": [50, 32, 36],
        "Business": [30, 48, 46],
        "Market": [20, 20, 18],
    },
    index=["TOP 10", "TOP 25", "TOP 50"],
)

# Grouped bar chart: one group per cut-off, one bar per category.
category_ax = plotdata.plot.bar(figsize=(14, 7), fontsize=13)
category_ax.set_title("Top Feature Category", fontsize=14)
category_ax.set_ylabel("Percentage")

In [None]:
############################ RF model 200 trees ##########################################

from sklearn.model_selection import KFold 
# Repeated k-fold: 10 folds x 5 repeats = 50 train/validation splits.
kf = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)
accuracy_model_2 = []

for train_index, test_index in kf.split(X):
    # Positional row selection for this split.
    X_train = X.iloc[train_index]
    X_test = X.iloc[test_index]
    y_train = y.iloc[train_index]
    y_test = y.iloc[test_index]

    # Fresh 200-tree forest per split; fit() returns the fitted estimator.
    model_2 = RandomForestClassifier(n_estimators=200, random_state=521)
    y2_pred = model_2.fit(X_train, y_train).predict(X_test)
    accuracy_model_2.append(accuracy_score(y_test, y2_pred))

print(accuracy_model_2)

# Diagnostics below are computed on the last split only (the values the loop left behind).
cm = confusion_matrix(y_test, y2_pred)
sns.heatmap(cm, annot=True, fmt='g')
plt.show()

print(classification_report(y_test, y2_pred))
print ("Avg accuracy for RF ", np.array(accuracy_model_2).mean())
# ROC / AUC computed from the hard class predictions.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y2_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("roc_auc", roc_auc)

In [None]:
################################################# KNN model #################################################################

from sklearn.model_selection import KFold 
# Repeated k-fold: 10 folds x 5 repeats = 50 train/validation splits.
kf = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)
accuracy_model_3=[]

for train_index, test_index in kf.split(X):
    # kf.split yields positional row numbers, so rows must be taken with .iloc;
    # X[train_index] on a DataFrame is a column-label lookup.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # A fresh 91-neighbour classifier is fitted for every split.
    model_3 = KNeighborsClassifier(n_neighbors=91)
    model_3.fit(X_train, y_train)
    y3_pred = model_3.predict(X_test)
    accuracy_model_3.append(accuracy_score(y_test, y3_pred))

print(accuracy_model_3) 
print ("Avg accuracy for KNN ", np.array(accuracy_model_3).mean())

# Everything below uses y_test / y3_pred as left by the loop, i.e. the
# validation rows and predictions of the LAST split only.
cm = confusion_matrix(y_test, y3_pred)
sns.heatmap(cm, annot = True, fmt = 'g')
plt.show()
print(classification_report(y_test,y3_pred))
# NOTE: roc_curve receives hard class predictions here, not probability scores.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test,y3_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("roc_auc",roc_auc)


In [None]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import KFold 
# Repeated k-fold: 10 folds x 5 repeats = 50 train/validation splits.
kf = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)
accuracy_model_4=[]

for train_index, test_index in kf.split(X):
    # kf.split yields positional row numbers, so rows must be taken with .iloc;
    # X[train_index] on a DataFrame is a column-label lookup.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # A fresh AdaBoost model is fitted for every split.
    model_4 = AdaBoostClassifier(random_state=42)
    model_4.fit(X_train, y_train)
    y4_pred = model_4.predict(X_test)
    accuracy_model_4.append(accuracy_score(y_test, y4_pred))

print(accuracy_model_4) 

# Everything below uses y_test / y4_pred as left by the loop, i.e. the
# validation rows and predictions of the LAST split only.
print(classification_report(y_test,y4_pred))
print ("Avg accuracy for AdaBoost ", np.array(accuracy_model_4).mean())
cm = confusion_matrix(y_test, y4_pred)
sns.heatmap(cm, annot = True, fmt = 'g')

# NOTE: roc_curve receives hard class predictions here, not probability scores.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test,y4_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("roc_auc",roc_auc)
plt.show()

In [None]:
from sklearn.model_selection import KFold 
# Repeated k-fold: 10 folds x 5 repeats = 50 train/validation splits.
kf = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)
accuracy_model_5=[]

for train_index, test_index in kf.split(X):
    # kf.split yields positional row numbers, so rows must be taken with .iloc;
    # X[train_index] on a DataFrame is a column-label lookup.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # A fresh XGBoost model is fitted for every split.
    model_5 = xgb.XGBClassifier(use_label_encoder =False, eval_metric='mlogloss')
    model_5.fit(X_train, y_train)
    y5_pred = model_5.predict(X_test)
    accuracy_model_5.append(accuracy_score(y_test, y5_pred))

print(accuracy_model_5) 

# Everything below uses y_test / y5_pred as left by the loop, i.e. the
# validation rows and predictions of the LAST split only.
print(classification_report(y_test,y5_pred))
print ("Avg accuracy for XGBoost ", np.array(accuracy_model_5).mean())
cm = confusion_matrix(y_test, y5_pred)
sns.heatmap(cm, annot = True, fmt = 'g')

# NOTE: roc_curve receives hard class predictions here, not probability scores.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test,y5_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("roc_auc",roc_auc)
plt.show()

In [None]:
from sklearn.model_selection import KFold 
# Repeated k-fold: 10 folds x 5 repeats = 50 train/validation splits.
kf = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)
# NOTE(review): accuracy_model_5 / model_5 / y5_pred are the same names the
# XGBoost cell uses, so running this cell overwrites that cell's results.
accuracy_model_5=[]

for train_index, test_index in kf.split(X):
    # kf.split yields positional row numbers, so rows must be taken with .iloc;
    # X[train_index] on a DataFrame is a column-label lookup.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # A fresh decision tree is fitted for every split.
    model_5 = DecisionTreeClassifier( random_state = 42)
    model_5.fit(X_train, y_train)
    y5_pred = model_5.predict(X_test)
    accuracy_model_5.append(accuracy_score(y_test, y5_pred))

print(accuracy_model_5) 

# Everything below uses y_test / y5_pred as left by the loop, i.e. the
# validation rows and predictions of the LAST split only.
print(classification_report(y_test,y5_pred))
print ("Avg accuracy for Decision tree ", np.array(accuracy_model_5).mean())
cm = confusion_matrix(y_test, y5_pred)
sns.heatmap(cm, annot = True, fmt = 'g')

# NOTE: roc_curve receives hard class predictions here, not probability scores.
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test,y5_pred)
roc_auc = auc(false_positive_rate, true_positive_rate)
print("roc_auc",roc_auc)
plt.show()

## Same models with ROC plots, using 5-fold cross-validation repeated 5 times

In [None]:
import matplotlib.patches as patches

In [None]:
# Decision tree: per-split ROC curves, mean ROC, accuracy and last-split diagnostics.

from sklearn.model_selection import KFold 
# Repeated k-fold: 5 folds x 5 repeats = 25 train/validation splits.
kf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)

accuracy_model_d = []
model_d = DecisionTreeClassifier(random_state=42)

# plot arrows: green points toward the upper-left of the ROC plane, red toward the lower-right
fig1 = plt.figure(figsize=[12,12])
ax1 = fig1.add_subplot(111,aspect = 'equal')
ax1.add_patch(
    patches.Arrow(0.45,0.5,-0.25,0.25,width=0.3,color='green',alpha = 0.5)
    )
ax1.add_patch(
    patches.Arrow(0.5,0.45,0.25,-0.25,width=0.3,color='red',alpha = 0.5)
    )

tprs = []                          # per-split TPR resampled onto mean_fpr
aucs = []                          # per-split AUC
mean_fpr = np.linspace(0,1,100)    # common FPR grid used for averaging
i = 1
for train_index,test_index in kf.split(X):
    # kf.split yields positional row numbers, so rows must be taken with .iloc;
    # X[train_index] on a DataFrame is a column-label lookup.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # The same estimator object is refitted on every split.
    prediction = model_d.fit(X_train,y_train).predict_proba(X_test)
    # Column 1 holds the predicted probability of the second class in classes_.
    fpr, tpr, t = roc_curve(y_test, prediction[:, 1])
    # np.interp replaces the deprecated scipy.interp alias, which newer SciPy
    # releases no longer provide.
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    ye_pred = model_d.predict(X_test)
    accuracy_model_d.append(accuracy_score(y_test, ye_pred))
    plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i= i+1
# Chance diagonal, then the mean ROC over all splits.
plt.plot([0,1],[0,1],linestyle = '--',lw = 2,color = 'black')
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue',
         label=r'Mean ROC (AUC = %0.2f )' % (mean_auc),lw=2, alpha=1)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.text(0.32,0.7,'More accurate area',fontsize = 12)
plt.text(0.63,0.4,'Less accurate area',fontsize = 12)
plt.show()

print(accuracy_model_d)

print ("Avg accuracy for Decision Tree :  ", np.array(accuracy_model_d).mean())
# Confusion matrix and report use the LAST split's validation rows only.
cm = confusion_matrix(y_test, ye_pred)
sns.heatmap(cm, annot = True, fmt = 'g')
plt.show()

print(classification_report(y_test,ye_pred))

In [None]:
# Random forest: per-split ROC curves, mean ROC, accuracy and last-split diagnostics.

from sklearn.model_selection import KFold 
# Repeated k-fold: 5 folds x 5 repeats = 25 train/validation splits.
kf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)

accuracy_model_r = []
# NOTE(review): this sets max_depth=200 and leaves n_estimators at its default,
# while the printed label below says "Random Forest 200" and other cells use
# n_estimators=200 — confirm which was intended.
model_r = RandomForestClassifier(max_depth=200,random_state=42)

# plot arrows: green points toward the upper-left of the ROC plane, red toward the lower-right
fig1 = plt.figure(figsize=[12,12])
ax1 = fig1.add_subplot(111,aspect = 'equal')
ax1.add_patch(
    patches.Arrow(0.45,0.5,-0.25,0.25,width=0.3,color='green',alpha = 0.5)
    )
ax1.add_patch(
    patches.Arrow(0.5,0.45,0.25,-0.25,width=0.3,color='red',alpha = 0.5)
    )

tprs = []                          # per-split TPR resampled onto mean_fpr
aucs = []                          # per-split AUC
mean_fpr = np.linspace(0,1,100)    # common FPR grid used for averaging
i = 1
for train_index,test_index in kf.split(X):
    # kf.split yields positional row numbers, so rows must be taken with .iloc;
    # X[train_index] on a DataFrame is a column-label lookup.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # The same estimator object is refitted on every split.
    prediction = model_r.fit(X_train,y_train).predict_proba(X_test)
    # Column 1 holds the predicted probability of the second class in classes_.
    fpr, tpr, t = roc_curve(y_test, prediction[:, 1])
    # np.interp replaces the deprecated scipy.interp alias, which newer SciPy
    # releases no longer provide.
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    ye_pred = model_r.predict(X_test)
    accuracy_model_r.append(accuracy_score(y_test, ye_pred))
    plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i= i+1
# Chance diagonal, then the mean ROC over all splits.
plt.plot([0,1],[0,1],linestyle = '--',lw = 2,color = 'black')
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue',
         label=r'Mean ROC (AUC = %0.2f )' % (mean_auc),lw=2, alpha=1)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.text(0.32,0.7,'More accurate area',fontsize = 12)
plt.text(0.63,0.4,'Less accurate area',fontsize = 12)
plt.show()

print(accuracy_model_r)

print ("Avg accuracy for Random Forest 200:  ", np.array(accuracy_model_r).mean())
# Confusion matrix and report use the LAST split's validation rows only.
cm = confusion_matrix(y_test, ye_pred)
sns.heatmap(cm, annot = True, fmt = 'g')
plt.show()

print(classification_report(y_test,ye_pred))

In [None]:
# Export one tree (index 5) of the fitted forest `model_r` to Graphviz and render it.
estimator = model_r.estimators_[5]

from sklearn.tree import export_graphviz
# Export as dot file.
# Feature and class names come from this notebook's own data: the columns of the
# frame the forest was fitted on and the forest's class labels (as strings).
# The earlier `iris.feature_names` / `iris.target_names` referred to an `iris`
# object that belongs to a different example dataset.
export_graphviz(estimator, out_file='tree.dot', 
                feature_names = list(X_train.columns),
                class_names = [str(label) for label in model_r.classes_],
                rounded = True, proportion = False, 
                precision = 2, filled = True)

# Convert to png (requires the Graphviz `dot` executable on PATH);
# the cell output is dot's exit code.
from subprocess import call
call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])

In [None]:
# Imported here because no nearby cell binds the name `tree`.
from sklearn import tree

# plot_tree needs a fitted decision tree; `ye_pred` is an array of predicted
# labels. Draw the forest tree that the export cell also uses (index 5 of
# model_r), limited to the top 3 levels so the figure stays readable.
tree.plot_tree(model_r.estimators_[5], max_depth=3, filled=True);

In [None]:
# XGBoost: per-split ROC curves, mean ROC, accuracy and last-split diagnostics.

from sklearn.model_selection import KFold 
# Repeated k-fold: 5 folds x 5 repeats = 25 train/validation splits.
kf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)

accuracy_model_e = []
model_e = xgb.XGBClassifier(n_estimators = 200, eval_metric='error', use_label_encoder=False)

# plot arrows: green points toward the upper-left of the ROC plane, red toward the lower-right
fig1 = plt.figure(figsize=[12,12])
ax1 = fig1.add_subplot(111,aspect = 'equal')
ax1.add_patch(
    patches.Arrow(0.45,0.5,-0.25,0.25,width=0.3,color='green',alpha = 0.5)
    )
ax1.add_patch(
    patches.Arrow(0.5,0.45,0.25,-0.25,width=0.3,color='red',alpha = 0.5)
    )

tprs = []                          # per-split TPR resampled onto mean_fpr
aucs = []                          # per-split AUC
mean_fpr = np.linspace(0,1,100)    # common FPR grid used for averaging
i = 1
for train_index,test_index in kf.split(X):
    # kf.split yields positional row numbers, so rows must be taken with .iloc;
    # X[train_index] on a DataFrame is a column-label lookup.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # The same estimator object is refitted on every split.
    prediction = model_e.fit(X_train,y_train).predict_proba(X_test)
    # Column 1 holds the predicted probability of the second class in classes_.
    fpr, tpr, t = roc_curve(y_test, prediction[:, 1])
    # np.interp replaces the deprecated scipy.interp alias, which newer SciPy
    # releases no longer provide.
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    ye_pred = model_e.predict(X_test)
    accuracy_model_e.append(accuracy_score(y_test, ye_pred))
    plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i= i+1
# Chance diagonal, then the mean ROC over all splits.
plt.plot([0,1],[0,1],linestyle = '--',lw = 2,color = 'black')
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue',
         label=r'Mean ROC (AUC = %0.2f )' % (mean_auc),lw=2, alpha=1)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.text(0.32,0.7,'More accurate area',fontsize = 12)
plt.text(0.63,0.4,'Less accurate area',fontsize = 12)
plt.show()

print(accuracy_model_e)

print ("Avg accuracy for XGBoost :  ", np.array(accuracy_model_e).mean())
# Confusion matrix and report use the LAST split's validation rows only.
cm = confusion_matrix(y_test, ye_pred)
sns.heatmap(cm, annot = True, fmt = 'g')
plt.show()

print(classification_report(y_test,ye_pred))

In [None]:
# AdaBoost: per-split ROC curves, mean ROC, accuracy and last-split diagnostics.

from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import KFold 
# Repeated k-fold: 5 folds x 5 repeats = 25 train/validation splits.
kf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)

# NOTE(review): accuracy_model_d / model_d are also used by the decision-tree
# ROC cell, so running this cell overwrites that cell's results.
accuracy_model_d = []
model_d = AdaBoostClassifier(random_state=42)

# plot arrows: green points toward the upper-left of the ROC plane, red toward the lower-right
fig1 = plt.figure(figsize=[12,12])
ax1 = fig1.add_subplot(111,aspect = 'equal')
ax1.add_patch(
    patches.Arrow(0.45,0.5,-0.25,0.25,width=0.3,color='green',alpha = 0.5)
    )
ax1.add_patch(
    patches.Arrow(0.5,0.45,0.25,-0.25,width=0.3,color='red',alpha = 0.5)
    )

tprs = []                          # per-split TPR resampled onto mean_fpr
aucs = []                          # per-split AUC
mean_fpr = np.linspace(0,1,100)    # common FPR grid used for averaging
i = 1
for train_index,test_index in kf.split(X):
    # kf.split yields positional row numbers, so rows must be taken with .iloc;
    # X[train_index] on a DataFrame is a column-label lookup.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # The same estimator object is refitted on every split.
    prediction = model_d.fit(X_train,y_train).predict_proba(X_test)
    # Column 1 holds the predicted probability of the second class in classes_.
    fpr, tpr, t = roc_curve(y_test, prediction[:, 1])
    # np.interp replaces the deprecated scipy.interp alias, which newer SciPy
    # releases no longer provide.
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    yd_pred = model_d.predict(X_test)
    accuracy_model_d.append(accuracy_score(y_test, yd_pred))
    plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i= i+1
# Chance diagonal, then the mean ROC over all splits.
plt.plot([0,1],[0,1],linestyle = '--',lw = 2,color = 'black')
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue',
         label=r'Mean ROC (AUC = %0.2f )' % (mean_auc),lw=2, alpha=1)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.text(0.32,0.7,'More accurate area',fontsize = 12)
plt.text(0.63,0.4,'Less accurate area',fontsize = 12)
plt.show()

print(accuracy_model_d)

print ("Avg accuracy for Adaboost ", np.array(accuracy_model_d).mean())
# Confusion matrix and report use the LAST split's validation rows only.
cm = confusion_matrix(y_test, yd_pred)
sns.heatmap(cm, annot = True, fmt = 'g')
plt.show()

print(classification_report(y_test,yd_pred))

In [None]:
############################ KNN model ##########################################
# Per-split ROC curves, mean ROC, accuracy and last-split diagnostics.

from sklearn.model_selection import KFold 
# Repeated k-fold: 5 folds x 5 repeats = 25 train/validation splits.
kf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)

accuracy_model_c = []
model_c = KNeighborsClassifier(n_neighbors=91)

# plot arrows: green points toward the upper-left of the ROC plane, red toward the lower-right
fig1 = plt.figure(figsize=[12,12])
ax1 = fig1.add_subplot(111,aspect = 'equal')
ax1.add_patch(
    patches.Arrow(0.45,0.5,-0.25,0.25,width=0.3,color='green',alpha = 0.5)
    )
ax1.add_patch(
    patches.Arrow(0.5,0.45,0.25,-0.25,width=0.3,color='red',alpha = 0.5)
    )

tprs = []                          # per-split TPR resampled onto mean_fpr
aucs = []                          # per-split AUC
mean_fpr = np.linspace(0,1,100)    # common FPR grid used for averaging
i = 1
for train_index,test_index in kf.split(X):
    # kf.split yields positional row numbers, so rows must be taken with .iloc;
    # X[train_index] on a DataFrame is a column-label lookup.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # The same estimator object is refitted on every split.
    prediction = model_c.fit(X_train,y_train).predict_proba(X_test)
    # Column 1 holds the predicted probability of the second class in classes_.
    fpr, tpr, t = roc_curve(y_test, prediction[:, 1])
    # np.interp replaces the deprecated scipy.interp alias, which newer SciPy
    # releases no longer provide.
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    yc_pred = model_c.predict(X_test)
    accuracy_model_c.append(accuracy_score(y_test, yc_pred))
    plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i= i+1
# Chance diagonal, then the mean ROC over all splits.
plt.plot([0,1],[0,1],linestyle = '--',lw = 2,color = 'black')
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue',
         label=r'Mean ROC (AUC = %0.2f )' % (mean_auc),lw=2, alpha=1)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.text(0.32,0.7,'More accurate area',fontsize = 12)
plt.text(0.63,0.4,'Less accurate area',fontsize = 12)
plt.show()

print(accuracy_model_c)

print ("Avg accuracy for KNN : ", np.array(accuracy_model_c).mean())
# Confusion matrix and report use the LAST split's validation rows only.
cm = confusion_matrix(y_test, yc_pred)
sns.heatmap(cm, annot = True, fmt = 'g')
plt.show()

print(classification_report(y_test,yc_pred))

In [None]:
############################LR model ##########################################
# Per-split ROC curves, mean ROC, accuracy and last-split diagnostics.

import matplotlib.patches as patches
from sklearn.model_selection import KFold 
# Repeated k-fold: 5 folds x 5 repeats = 25 train/validation splits.
kf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)
accuracy_model_a = []
model_a = LogisticRegression(random_state=42, max_iter= 10**4)

# plot arrows: green points toward the upper-left of the ROC plane, red toward the lower-right
fig1 = plt.figure(figsize=[12,12])
ax1 = fig1.add_subplot(111,aspect = 'equal')
ax1.add_patch(
    patches.Arrow(0.45,0.5,-0.25,0.25,width=0.3,color='green',alpha = 0.5)
    )
ax1.add_patch(
    patches.Arrow(0.5,0.45,0.25,-0.25,width=0.3,color='red',alpha = 0.5)
    )

tprs = []                          # per-split TPR resampled onto mean_fpr
aucs = []                          # per-split AUC
mean_fpr = np.linspace(0,1,100)    # common FPR grid used for averaging
i = 1
for train_index,test_index in kf.split(X):
    # kf.split yields positional row numbers, so rows must be taken with .iloc;
    # X[train_index] on a DataFrame is a column-label lookup.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # The same estimator object is refitted on every split.
    prediction = model_a.fit(X_train,y_train).predict_proba(X_test)
    # Column 1 holds the predicted probability of the second class in classes_.
    fpr, tpr, t = roc_curve(y_test, prediction[:, 1])
    # np.interp replaces the deprecated scipy.interp alias, which newer SciPy
    # releases no longer provide.
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    ya_pred = model_a.predict(X_test)
    accuracy_model_a.append(accuracy_score(y_test, ya_pred))
    plt.plot(fpr, tpr, lw=2, alpha=0.3, label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))
    i= i+1
# Chance diagonal, then the mean ROC over all splits.
plt.plot([0,1],[0,1],linestyle = '--',lw = 2,color = 'black')
mean_tpr = np.mean(tprs, axis=0)
mean_auc = auc(mean_fpr, mean_tpr)
plt.plot(mean_fpr, mean_tpr, color='blue',
         label=r'Mean ROC (AUC = %0.2f )' % (mean_auc),lw=2, alpha=1)

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc="lower right")
plt.text(0.32,0.7,'More accurate area',fontsize = 12)
plt.text(0.63,0.4,'Less accurate area',fontsize = 12)
plt.show()

print(accuracy_model_a)

print ("Avg accuracy for LR:  ", np.array(accuracy_model_a).mean())
# Confusion matrix and report use the LAST split's validation rows only.
cm = confusion_matrix(y_test, ya_pred)
sns.heatmap(cm, annot = True, fmt = 'g')
plt.show()

print(classification_report(y_test,ya_pred))

In [None]:
from sklearn.model_selection import RepeatedKFold 
import xgboost as xgb
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier


def _take_rows(labels, positions):
    """Select rows of ``labels`` by integer position.

    Objects exposing ``.iloc`` (pandas) are indexed through it so the selection
    is positional even when the index is not 0..n-1; anything else is indexed
    directly with ``labels[positions]``.
    """
    return labels.iloc[positions] if hasattr(labels, "iloc") else labels[positions]


def cross_validated_roc(model, X, y, cv, fpr_grid, accuracies):
    """Fit ``model`` on every split produced by ``cv`` and collect ROC statistics.

    Parameters
    ----------
    model : estimator exposing ``fit``, ``predict_proba`` and ``predict``.
    X : feature matrix, indexed as ``X[positions]``.
    y : labels, indexed positionally (see ``_take_rows``).
    cv : splitter exposing ``split(X)``.
    fpr_grid : common false-positive-rate grid every fold's TPR is interpolated onto.
    accuracies : list; one accuracy score per fold is appended in place.

    Returns
    -------
    dict with keys
        ``tprs``   - list of per-fold TPR arrays on ``fpr_grid``
        ``aucs``   - list of per-fold AUC values
        ``y_pred`` - class predictions for the last fold
        ``split``  - ``(train_index, test_index)`` of the last fold
    """
    tprs, aucs = [], []
    y_pred, split = None, None
    for train_index, test_index in cv.split(X):
        split = (train_index, test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = _take_rows(y, train_index), _take_rows(y, test_index)

        proba = model.fit(X_train, y_train).predict_proba(X_test)
        fpr, tpr, _ = roc_curve(y_test, proba[:, 1])
        tprs.append(np.interp(fpr_grid, fpr, tpr))
        aucs.append(auc(fpr, tpr))

        # Predictions come from the SAME model that was just fitted.
        y_pred = model.predict(X_test)
        accuracies.append(accuracy_score(y_test, y_pred))
    return {"tprs": tprs, "aucs": aucs, "y_pred": y_pred, "split": split}


kf = RepeatedKFold(n_splits=5, n_repeats=5, random_state=1)

accuracy_model_e = []
model_e = xgb.XGBClassifier(use_label_encoder =False, eval_metric='mlogloss', )
accuracy_model_d = []
model_d = AdaBoostClassifier(random_state=42)
accuracy_model_c = []
model_c = KNeighborsClassifier(n_neighbors=91)
accuracy_model_b = []
model_b = RandomForestClassifier(n_estimators=100, random_state=521)
accuracy_model_b1 = []
model_b1 = RandomForestClassifier(n_estimators=200, random_state=521)
accuracy_model_a = []
model_a = LogisticRegression(random_state=42, max_iter= 10**6)
accuracy_model_f = []
model_f = DecisionTreeClassifier(random_state=42)

# (model, accuracy list, curve colour, legend label format), in plotting order.
roc_specs = [
    (model_d,  accuracy_model_d,  'blue',    r'Mean ROC AdaBoost(AUC = %0.2f )'),
    (model_e,  accuracy_model_e,  'black',   r'Mean ROC XGBoost (AUC = %0.2f )'),
    (model_a,  accuracy_model_a,  'green',   r'Mean ROC LR, (AUC = %0.2f )'),
    (model_b,  accuracy_model_b,  'yellow',  r'Mean ROC RF 100, (AUC = %0.2f )'),
    (model_b1, accuracy_model_b1, 'purple',  r'Mean ROC RF 200, (AUC = %0.2f )'),
    (model_c,  accuracy_model_c,  'orange',  r'Mean ROC KNN, (AUC = %0.2f )'),
    (model_f,  accuracy_model_f,  'magenta', r'Mean ROC DT, (AUC = %0.2f )'),
]

fig1 = plt.figure(figsize=[8,6])
ax_roc = fig1.add_subplot(111)

mean_fpr = np.linspace(0,1,100)

# Diagonal reference line (TPR == FPR); drawn once instead of once per model.
ax_roc.plot([0,1],[0,1],linestyle = '--',lw = 2,color = 'black')

last_fold_predictions = []
for model, accuracies, color, label_fmt in roc_specs:
    cv_result = cross_validated_roc(model, X, y, kf, mean_fpr, accuracies)
    tprs, aucs = cv_result["tprs"], cv_result["aucs"]
    mean_tpr = np.mean(tprs, axis=0)
    mean_auc = auc(mean_fpr, mean_tpr)
    ax_roc.plot(mean_fpr, mean_tpr, color=color,
                label=label_fmt % (mean_auc), lw=2, alpha=1)
    last_fold_predictions.append(cv_result["y_pred"])

# Re-expose the notebook-level names the copy-pasted loops used to leave behind,
# so later cells that read them keep working. kf has a fixed random_state, so
# every model saw the same final split.
yd_pred, ye_pred, ya_pred, yb_pred, yb1_pred, yc_pred, yf_pred = last_fold_predictions
train_index, test_index = cv_result["split"]
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = _take_rows(y, train_index), _take_rows(y, test_index)

ax_roc.set_xticks(np.arange(0.0, 1.1, step=0.1))
ax_roc.set_xlabel("False Positive Rate", fontsize=14)
ax_roc.set_yticks(np.arange(0.0, 1.1, step=0.1))
ax_roc.set_ylabel('True Positive Rate', fontsize=14)
ax_roc.set_title('ROC')
ax_roc.legend(loc="lower right")

plt.show()

In [None]:
# The multi-model ROC comparison figure is bound to `fig1` in the preceding
# cell; save that figure object rather than an unrelated/undefined `fig`.
fig1.savefig('multiple_roc_curve_comparison.png')