In [1]:
import pandas as pd
import datetime
import numpy
import re
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn import decomposition, ensemble
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Function for calculating age of a person given his date of birth
from datetime import date
def calculate_age(born):
    today = date.today()
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))

In [3]:
#Function for calculating number of days between two dates 
def datediff(any_date):
    today = date.today()
    datediff = today-any_date 
    return datediff.days

In [4]:
# Calculating the Tenure in months given the data is in X Years and Y months format
def age_length(TenureAsYearsAndMonths):
    years = int(re.findall(r'(\d+)yrs',TenureAsYearsAndMonths )[0])
    months = int(re.findall(r'(\d+)mon',TenureAsYearsAndMonths )[0])
    total_tenure = (years*12)+months
    return total_tenure

In [5]:
# Function for doing all the data manipulations
def AllDataManipulations(MainDataFrame,Manip_Data = pd.DataFrame,temp1 = pd.DataFrame, temp2 = pd.DataFrame,Manip_Completed = pd.DataFrame):
    Manip_Data = MainDataFrame.copy()
    Manip_Data['Employment.Type'].fillna(Manip_Data['Employment.Type'].mode()[0],inplace = True)
    Manip_Data[['loan_default','branch_id','supplier_id','manufacturer_id','Current_pincode_ID','State_ID','Employee_code_ID']] = Manip_Data[
        ['loan_default','branch_id','supplier_id','manufacturer_id','Current_pincode_ID','State_ID','Employee_code_ID']].apply(lambda x: x.astype('category'))
    Manip_Data['Date.of.Birth'] = pd.to_datetime(Manip_Data['Date.of.Birth'])
    Manip_Data['Age'] = Manip_Data['Date.of.Birth'].apply(lambda x: calculate_age(x))
    Manip_Data['DisbursalDate'] = pd.to_datetime(Manip_Data['DisbursalDate'])
    Manip_Data['HowManyDaysSinceDisburse'] = Manip_Data['DisbursalDate'].dt.date.apply(lambda x: datediff(x))
    Manip_Data['AvgAcctAgeInMonths'] = Manip_Data['AVERAGE.ACCT.AGE'].apply(lambda x: age_length(x))
    Manip_Data['CredHistLenInMonts'] = Manip_Data['CREDIT.HISTORY.LENGTH'].apply(lambda x: age_length(x))
    temp1 = pd.get_dummies(Manip_Data['Employment.Type'],prefix='EmploymentType')
    temp2 = pd.get_dummies(Manip_Data['PERFORM_CNS.SCORE.DESCRIPTION'],prefix='Bureau_score_description')
    Manip_Completed = pd.concat([Manip_Data[['disbursed_amount', 'asset_cost', 'ltv', 'branch_id',
       'supplier_id', 'manufacturer_id', 'Current_pincode_ID','Age',
       'HowManyDaysSinceDisburse','State_ID', 'Employee_code_ID',
       'MobileNo_Avl_Flag', 'Aadhar_flag', 'PAN_flag', 'VoterID_flag',
       'Driving_flag', 'Passport_flag', 'PERFORM_CNS.SCORE',
       'PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS','PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT',
       'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS', 'SEC.ACTIVE.ACCTS',
       'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.SANCTIONED.AMOUNT',
       'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT',
       'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
       'AvgAcctAgeInMonths','CredHistLenInMonts','NO.OF_INQUIRIES',
       'loan_default']],temp1,temp2],axis=1)
    return Manip_Data,Manip_Completed

In [6]:
MasterData = pd.read_csv("train.csv")
Data1 = MasterData.copy()

In [7]:
Train_ManipData_1,Train_ManipCompleted_1 = AllDataManipulations(Data1)

In [8]:
Train_ManipCompleted_1.columns

Index(['disbursed_amount', 'asset_cost', 'ltv', 'branch_id', 'supplier_id',
       'manufacturer_id', 'Current_pincode_ID', 'Age',
       'HowManyDaysSinceDisburse', 'State_ID', 'Employee_code_ID',
       'MobileNo_Avl_Flag', 'Aadhar_flag', 'PAN_flag', 'VoterID_flag',
       'Driving_flag', 'Passport_flag', 'PERFORM_CNS.SCORE', 'PRI.NO.OF.ACCTS',
       'PRI.ACTIVE.ACCTS', 'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE',
       'PRI.SANCTIONED.AMOUNT', 'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS',
       'SEC.ACTIVE.ACCTS', 'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE',
       'SEC.SANCTIONED.AMOUNT', 'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT',
       'SEC.INSTAL.AMT', 'NEW.ACCTS.IN.LAST.SIX.MONTHS',
       'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'AvgAcctAgeInMonths',
       'CredHistLenInMonts', 'NO.OF_INQUIRIES', 'loan_default',
       'EmploymentType_Salaried', 'EmploymentType_Self employed',
       'Bureau_score_description_A-Very Low Risk',
       'Bureau_score_description_B-Very Low Risk',

In [9]:
IndeVectors = Train_ManipCompleted_1[['disbursed_amount', 'asset_cost', 'ltv', 'branch_id', 'supplier_id',
       'manufacturer_id', 'Current_pincode_ID', 'Age',
       'HowManyDaysSinceDisburse', 'State_ID', 'Employee_code_ID',
       'MobileNo_Avl_Flag', 'Aadhar_flag', 'PAN_flag', 'VoterID_flag',
       'Driving_flag', 'Passport_flag', 'PERFORM_CNS.SCORE', 'PRI.NO.OF.ACCTS',
       'PRI.ACTIVE.ACCTS', 'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE',
       'PRI.SANCTIONED.AMOUNT', 'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS',
       'SEC.ACTIVE.ACCTS', 'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE',
       'SEC.SANCTIONED.AMOUNT', 'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT',
       'SEC.INSTAL.AMT', 'NEW.ACCTS.IN.LAST.SIX.MONTHS',
       'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'AvgAcctAgeInMonths',
       'CredHistLenInMonts', 'NO.OF_INQUIRIES', 
       'EmploymentType_Salaried', 'EmploymentType_Self employed',
       'Bureau_score_description_A-Very Low Risk',
       'Bureau_score_description_B-Very Low Risk',
       'Bureau_score_description_C-Very Low Risk',
       'Bureau_score_description_D-Very Low Risk',
       'Bureau_score_description_E-Low Risk',
       'Bureau_score_description_F-Low Risk',
       'Bureau_score_description_G-Low Risk',
       'Bureau_score_description_H-Medium Risk',
       'Bureau_score_description_I-Medium Risk',
       'Bureau_score_description_J-High Risk',
       'Bureau_score_description_K-High Risk',
       'Bureau_score_description_L-Very High Risk',
       'Bureau_score_description_M-Very High Risk',
       'Bureau_score_description_No Bureau History Available',
       'Bureau_score_description_Not Scored: More than 50 active Accounts found',
       'Bureau_score_description_Not Scored: No Activity seen on the customer (Inactive)',
       'Bureau_score_description_Not Scored: No Updates available in last 36 months',
       'Bureau_score_description_Not Scored: Not Enough Info available on the customer',
       'Bureau_score_description_Not Scored: Only a Guarantor',
       'Bureau_score_description_Not Scored: Sufficient History Not Available']]
Labels = Train_ManipCompleted_1['loan_default']

In [10]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(IndeVectors,Labels,random_state = 1234,test_size = 0.10)

# Random forest
rf_model_1 = RandomForestClassifier(n_estimators=300,bootstrap = True,max_features = 'sqrt')
rf_model_1.fit(train_x,list(train_y))
ypred_rf1 = rf_model_1.predict(valid_x)
auc_rf1 = metrics.roc_auc_score(valid_y,ypred_rf1)
print("AUC of Random forest Model_1: ",auc_rf1)
# Naive Bayes
gnb = GaussianNB()
gnb.fit(train_x,train_y)
ypred_gnb = gnb.predict(valid_x)
auc_gnb = metrics.roc_auc_score(valid_y,ypred_gnb)
print("AUC of Naive Bayes Model_1: ",auc_gnb)

In [11]:
# Modifying data manipulations function to work for test data as it doesn't have labels column
# Function for doing all the data manipulations
def AllDataManipulations_forTest(MainDataFrame,Manip_Data = pd.DataFrame,temp1 = pd.DataFrame, temp2 = pd.DataFrame,Manip_Completed = pd.DataFrame):
    Manip_Data = MainDataFrame.copy()
    Manip_Data['Employment.Type'].fillna(Manip_Data['Employment.Type'].mode()[0],inplace = True)
    Manip_Data[['branch_id','supplier_id','manufacturer_id','Current_pincode_ID','State_ID','Employee_code_ID']] = Manip_Data[
        ['branch_id','supplier_id','manufacturer_id','Current_pincode_ID','State_ID','Employee_code_ID']].apply(lambda x: x.astype('category'))
    Manip_Data['Date.of.Birth'] = pd.to_datetime(Manip_Data['Date.of.Birth'])
    Manip_Data['Age'] = Manip_Data['Date.of.Birth'].apply(lambda x: calculate_age(x))
    Manip_Data['DisbursalDate'] = pd.to_datetime(Manip_Data['DisbursalDate'])
    Manip_Data['HowManyDaysSinceDisburse'] = Manip_Data['DisbursalDate'].dt.date.apply(lambda x: datediff(x))
    Manip_Data['AvgAcctAgeInMonths'] = Manip_Data['AVERAGE.ACCT.AGE'].apply(lambda x: age_length(x))
    Manip_Data['CredHistLenInMonts'] = Manip_Data['CREDIT.HISTORY.LENGTH'].apply(lambda x: age_length(x))
    temp1 = pd.get_dummies(Manip_Data['Employment.Type'],prefix='EmploymentType')
    temp2 = pd.get_dummies(Manip_Data['PERFORM_CNS.SCORE.DESCRIPTION'],prefix='Bureau_score_description')
    Manip_Completed = pd.concat([Manip_Data[['disbursed_amount', 'asset_cost', 'ltv', 'branch_id',
       'supplier_id', 'manufacturer_id', 'Current_pincode_ID','Age',
       'HowManyDaysSinceDisburse','State_ID', 'Employee_code_ID',
       'MobileNo_Avl_Flag', 'Aadhar_flag', 'PAN_flag', 'VoterID_flag',
       'Driving_flag', 'Passport_flag', 'PERFORM_CNS.SCORE',
       'PRI.NO.OF.ACCTS', 'PRI.ACTIVE.ACCTS','PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE', 'PRI.SANCTIONED.AMOUNT',
       'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS', 'SEC.ACTIVE.ACCTS',
       'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE', 'SEC.SANCTIONED.AMOUNT',
       'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT', 'SEC.INSTAL.AMT',
       'NEW.ACCTS.IN.LAST.SIX.MONTHS', 'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS',
       'AvgAcctAgeInMonths','CredHistLenInMonts','NO.OF_INQUIRIES']],temp1,temp2],axis=1)
    return Manip_Data,Manip_Completed

In [12]:
Master_test = pd.read_csv("test_bqCt9Pv.csv")
Test_Data = Master_test.copy()
Test_ManipData,Test_ManipCompleted = AllDataManipulations_forTest(Test_Data)

In [13]:
Test_Data.head()

Unnamed: 0,UniqueID,disbursed_amount,asset_cost,ltv,branch_id,supplier_id,manufacturer_id,Current_pincode_ID,Date.of.Birth,Employment.Type,...,SEC.CURRENT.BALANCE,SEC.SANCTIONED.AMOUNT,SEC.DISBURSED.AMOUNT,PRIMARY.INSTAL.AMT,SEC.INSTAL.AMT,NEW.ACCTS.IN.LAST.SIX.MONTHS,DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS,AVERAGE.ACCT.AGE,CREDIT.HISTORY.LENGTH,NO.OF_INQUIRIES
0,655269,53478,63558,86.54,67,22807,45,1497,01-01-74,Salaried,...,0,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0
1,723482,55513,63163,89.45,67,22807,45,1497,20-05-85,Self employed,...,0,0,0,5605,0,1,0,0yrs 8mon,1yrs 0mon,1
2,758529,65282,84320,79.93,78,23135,86,2071,14-10-95,Salaried,...,0,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0
3,763449,46905,63896,76.58,78,17014,45,2070,01-06-73,Self employed,...,0,0,0,0,0,0,0,2yrs 5mon,2yrs 5mon,0
4,708663,51428,63896,86.08,78,17014,45,2069,01-06-72,Salaried,...,0,0,0,0,0,0,0,0yrs 0mon,0yrs 0mon,0


In [14]:
Test_ManipCompleted.columns

Index(['disbursed_amount', 'asset_cost', 'ltv', 'branch_id', 'supplier_id',
       'manufacturer_id', 'Current_pincode_ID', 'Age',
       'HowManyDaysSinceDisburse', 'State_ID', 'Employee_code_ID',
       'MobileNo_Avl_Flag', 'Aadhar_flag', 'PAN_flag', 'VoterID_flag',
       'Driving_flag', 'Passport_flag', 'PERFORM_CNS.SCORE', 'PRI.NO.OF.ACCTS',
       'PRI.ACTIVE.ACCTS', 'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE',
       'PRI.SANCTIONED.AMOUNT', 'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS',
       'SEC.ACTIVE.ACCTS', 'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE',
       'SEC.SANCTIONED.AMOUNT', 'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT',
       'SEC.INSTAL.AMT', 'NEW.ACCTS.IN.LAST.SIX.MONTHS',
       'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'AvgAcctAgeInMonths',
       'CredHistLenInMonts', 'NO.OF_INQUIRIES', 'EmploymentType_Salaried',
       'EmploymentType_Self employed',
       'Bureau_score_description_A-Very Low Risk',
       'Bureau_score_description_B-Very Low Risk',
       'Bureau_

In [15]:
Test_ManipCompleted['Bureau_score_description_Not Scored: More than 50 active Accounts found'] = 0

ActualTestPred_rf1 = rf_model_1.predict(Test_ManipCompleted)
ActualTestPred_nb = gnb.predict(Test_ManipCompleted)
RF1_Submissions = pd.concat([Test_Data['UniqueID'],pd.DataFrame(ActualTestPred_rf1,columns=['loan_default'])],axis=1)
nb_Submissions = pd.concat([Test_Data['UniqueID'],pd.DataFrame(ActualTestPred_nb,columns=['loan_default'])],axis=1)

RF1_Submissions.to_csv('RF1_Submissions.csv',index=False)

# Finding Feature importance
RF_feature_importances = pd.DataFrame(rf_model_1.feature_importances_,
                                   index = train_x.columns,
                                    columns=['importance']).sort_values('importance',ascending=False)

In [278]:
# Function to % defaulted for each supplier
def Supplier_DefaulterFinder(TotalDataFrame,tempDf_1=pd.DataFrame(),tempDf_2 =pd.DataFrame(),tempDf_3 = pd.DataFrame()):
    tempDf_1 = pd.DataFrame(TotalDataFrame['supplier_id'].value_counts())
    tempDf_1.reset_index(level=0, inplace=True)
    tempDf_1.columns = ['supplier_id_1','TotalNoOfVehiclesSold']
    tempDf_2 = pd.DataFrame(TotalDataFrame.groupby('supplier_id')['loan_default'].value_counts())
    tempDf_2.reset_index(level=0, inplace=True)
    tempDf_2.columns = ['supplier_id_2','NoOfDefaulted_Sales']
    tempDf_2.reset_index(level=0, inplace=True)
    tempDf_2 = tempDf_2[(tempDf_2['loan_default'] == 1)]
    tempDf_1['supplier_id_1'] = tempDf_1['supplier_id_1'].astype('int')
    tempDf_2['supplier_id_2'] = tempDf_2['supplier_id_2'].astype('int')
    tempDf3 = pd.merge(tempDf_2, tempDf_1, how='right', on= ['supplier_id_2','supplier_id_1'],
                       left_index=False, right_index=False)
    tempDf3 = tempDf3[['supplier_id','TotalNoOfVehiclesSold','NoOfDefaulted_Sales']]
    tempDf3['Defaults%_DueToSupplier'] = ((tempDf3['NoOfDefaulted_Sales'])/(tempDf3['TotalNoOfVehiclesSold']))*100
    return tempDf3

In [16]:
Supplier_Master = Train_ManipCompleted_1.copy()

In [18]:
print(len(Supplier_Master.columns))
print(Supplier_Master.columns)

60
Index(['disbursed_amount', 'asset_cost', 'ltv', 'branch_id', 'supplier_id',
       'manufacturer_id', 'Current_pincode_ID', 'Age',
       'HowManyDaysSinceDisburse', 'State_ID', 'Employee_code_ID',
       'MobileNo_Avl_Flag', 'Aadhar_flag', 'PAN_flag', 'VoterID_flag',
       'Driving_flag', 'Passport_flag', 'PERFORM_CNS.SCORE', 'PRI.NO.OF.ACCTS',
       'PRI.ACTIVE.ACCTS', 'PRI.OVERDUE.ACCTS', 'PRI.CURRENT.BALANCE',
       'PRI.SANCTIONED.AMOUNT', 'PRI.DISBURSED.AMOUNT', 'SEC.NO.OF.ACCTS',
       'SEC.ACTIVE.ACCTS', 'SEC.OVERDUE.ACCTS', 'SEC.CURRENT.BALANCE',
       'SEC.SANCTIONED.AMOUNT', 'SEC.DISBURSED.AMOUNT', 'PRIMARY.INSTAL.AMT',
       'SEC.INSTAL.AMT', 'NEW.ACCTS.IN.LAST.SIX.MONTHS',
       'DELINQUENT.ACCTS.IN.LAST.SIX.MONTHS', 'AvgAcctAgeInMonths',
       'CredHistLenInMonts', 'NO.OF_INQUIRIES', 'loan_default',
       'EmploymentType_Salaried', 'EmploymentType_Self employed',
       'Bureau_score_description_A-Very Low Risk',
       'Bureau_score_description_B-Very Low Ris

In [36]:
# Forming a dataSet to see Unique set of Vehicle suppliers
ListOfUniqueSuppliers = pd.DataFrame(Supplier_Master['supplier_id'].value_counts())
ListOfUniqueSuppliers.reset_index(level =0, inplace = True)
ListOfUniqueSuppliers.columns = ['supplier_id_1','NoOfVehiclesSold']
ListOfUniqueSuppliers.head()
print("Number of Suppliers available: ",len(ListOfUniqueSuppliers))
print("\nLooking if this data has any Null values: \n",ListOfUniqueSuppliers.isnull().sum())

Number of Suppliers available:  2953

Looking if this data has any Null values: 
 supplier_id_1       0
NoOfVehiclesSold    0
dtype: int64


In [47]:
ListOfUniqueSuppliers.head()

Unnamed: 0,supplier_id_1,NoOfVehiclesSold
0,18317,1432
1,15694,1300
2,15663,1275
3,17980,1268
4,14234,1258


In [37]:
#Following list of manipulations are to get the data of Suppliers, whose customers have defaulted with loan

In [43]:
Temp1 = pd.DataFrame(Supplier_Master.groupby('supplier_id')['loan_default'].value_counts())
Temp1.columns = ['NoOfVehiclesSold']
Temp1.reset_index(level =0, inplace = True)
Temp1.reset_index(level =0, inplace = True)
Temp1.head()

Unnamed: 0,loan_default,supplier_id,NoOfVehiclesSold
0,0,10524,3
1,1,10524,3
2,0,12311,3
3,0,12312,41
4,1,12312,5


In [44]:
temp2 = Temp1[Temp1['loan_default'] == 1]

In [45]:
temp2.head()

Unnamed: 0,loan_default,supplier_id,NoOfVehiclesSold
1,1,10524,3
4,1,12312,5
6,1,12374,14
8,1,12441,10
10,1,12456,16


In [46]:
temp2.columns = ['loan_default','supplier_id_2','NoOfVehiclesSold']

In [82]:
SupplierSideDefaultContrib = pd.DataFrame(columns = ['supplier_id','CalcIndex'])
#columns = ['supplier_id','CalcIndex']
for i in range(len(ListOfUniqueSuppliers)):
    if(ListOfUniqueSuppliers['supplier_id_1'][i] in list(temp2['supplier_id_2'])):
        SupplierSideDefaultContrib['supplier_id'][i] = ListOfUniqueSuppliers['supplier_id_1'][i]
        SupplierSideDefaultContrib.iloc['CalcIndex'][i] = (temp2['NoOfVehiclesSold']/ListOfUniqueSuppliers['NoOfVehiclesSold'])
    else:
        SupplierSideDefaultContrib['supplier_id'][i] = ListOfUniqueSuppliers['supplier_id_1'][i]
        SupplierSideDefaultContrib.iloc['CalcIndex'][i] = 0

IndexError: index 0 is out of bounds for axis 0 with size 0

In [75]:
SupplierSideDefaultContrib = pd.DataFrame(columns = ['supplier_id','CalcIndex'],index = range(len(ListOfUniqueSuppliers)))

In [76]:
SupplierSideDefaultContrib

Unnamed: 0,supplier_id,CalcIndex
0,,
1,,
2,,
3,,
4,,
5,,
6,,
7,,
8,,
9,,


In [67]:
for x in range(len(ListOfUniqueSuppliers)):
    print(ListOfUniqueSuppliers['supplier_id_1'][x])

18317
15694
15663
17980
14234
18166
21980
14375
22727
14145
14115
21124
18532
18473
14347
14178
15142
14770
21043
14791
16633
20470
21556
21003
20292
14078
16565
23137
14293
16694
17223
14710
17783
14834
21347
22350
22994
16277
17014
18471
17094
22045
15897
18062
18129
21308
16605
14823
15097
16120
22703
22993
15884
17921
16445
18130
15196
15685
13890
18310
20333
15217
15271
18077
21251
15899
16556
15919
23854
18643
22417
22998
14975
15805
15077
20520
22637
23026
14214
22928
23173
14004
20675
23799
23311
17315
23000
16309
23277
17139
17530
14189
17742
16092
17413
14143
18651
16846
21635
17916
22447
15996
23643
17408
17038
23550
16646
22004
18332
23770
16760
15905
22934
18180
15733
21264
17906
22842
17383
16166
17694
13971
16461
22857
14292
15357
18207
23360
23170
15798
16199
18486
15893
22936
23824
17705
20335
14698
18732
21773
21013
16487
23213
14151
16686
18125
15777
22775
18449
17113
23512
22764
18348
17684
17142
23146
22639
17901
14305
13948
16596
16803
22056
22856
22971
15460
1631

22322
24006
15503
22847
12842
17267
23090
15888
23002
23600
23196
24149
14136
16985
24163
16252
24096
24632
23730
22729
24200
15616
16239
22457
23127
14443
22515
23116
24052
23574
22963
14628
17116
24046
24014
24130
17023
24022
23786
16499
14697
24424
24077
22852
15706
23062
24168
24043
21221
23436
23053
18838
18410
23549
15636
15484
20787
18154
21669
24338
23455
20972
24332
23482
16805
23042
23942
24140
24612
20640
24526
21966
15693
22736
23890
24367
23984
24457
22952
23709
18722
18396
24577
23057
24169
17995
22599
24296
23566
17468
22913
15879
18072
23183
21908
23878
23234
23149
24208
24117
23343
18730
24425
23120
16350
23364
24333
24227
14606
16234
24283
16587
24089
18469
24053
14795
24732
21824
17627
24173
18658
22354
23812
22912
15272
15312
22633
15078
23212
22869
22961
23227
23594
23895
24421
24312
22922
18257
14804
16249
24506
17318
23244
17364
24273
23426
24610
23032
14005
23276
23293
20795
24325
24770
21429
22441
23868
18702
15212
20391
14610
23718
23798
23972
23634
23975
1787

In [68]:
ListOfUniqueSuppliers.tail()

Unnamed: 0,supplier_id_1,NoOfVehiclesSold
2948,24435,1
2949,24447,1
2950,16270,1
2951,24462,1
2952,24443,1


In [87]:
def SupplierConduct(df1,df2,temp_df=pd.DataFrame()):
    for i in range(len(df1['supplier_id_1'])):
        if(df1['supplier_id_1'][i] in list(df2['supplier_id_2'])):
            temp_df['supplier_id'][i] = df1['supplier_id_1'][i]
            temp_df['SupplierConductIndex'][i] = df2['NoOfVehiclesSold']/df1['NoOfVehiclesSold']
        else:
            temp_df['supplier_id'][i] = ListOfUniqueSuppliers['supplier_id_1'][i]
            temp_df['SupplierConductIndex'][i] = 0
    return temp_df

In [88]:
SupplierConduct_df = SupplierConduct(ListOfUniqueSuppliers,temp2)

KeyError: 'supplier_id'