In [59]:
import pandas as pd
import numpy as np
CreditData = pd.read_excel("CreditData2020.xls") 
print(CreditData.head(10))
print(CreditData.info())


   No  Status  LengthOfEmployment   Home  Age     Sex MaritalStatus Records  \
0   1       0                   5  other   23  female        single      no   
1   2       0                  23   rent   47    male        single      no   
2   3       0                  19  other   55    male       married      no   
3   4       0                  12   rent   37  female       married     yes   
4   5       0                   3  other   21  female        single     yes   
5   6       0                   0  other   31    male        single      no   
6   7       1                  20   rent   47  female       married      no   
7   8       0                  15  owner   42    male        single      no   
8   9       0                   3  owner   27  female       married      no   
9  10       0                  20  owner   41  female       married      no   

         Job  Expenses  Income  Assets  Debt  LoanAmount  DurationOfLoan  
0      fixed        35     120       0     0        130

In [79]:
#DATA PREPARATION
#exam the data for missing values
pd.isna(CreditData).sum()
#create dummy values
dummy1 = pd.get_dummies(CreditData['Home'], prefix='Home')
dummy2 = pd.get_dummies(CreditData['Sex'], prefix='Sex')
dummy3 = pd.get_dummies(CreditData['MaritalStatus'], prefix='MaritalStatus')
dummy4 = pd.get_dummies(CreditData['Records'], prefix='Records')
dummy5 = pd.get_dummies(CreditData['Job'], prefix='Job')

CreditDataDummy = pd.concat([CreditData,dummy1,dummy2,dummy3,dummy4,dummy5], axis=1)\
.drop(['Home', 'Sex', 'MaritalStatus','Records','Job','Home_rent','Sex_male','MaritalStatus_widow',\
       'Records_yes','Job_partime'],axis=1)
print(CreditDataDummy.head())

   No  Status  LengthOfEmployment  Age  Expenses  Income  Assets  Debt  \
0   1       0                   5   23        35     120       0     0   
1   2       0                  23   47        44      86       0     0   
2   3       0                  19   55        45     118    4000     0   
3   4       0                  12   37        86     110       0     0   
4   5       0                   3   21        35      75       0     0   

   LoanAmount  DurationOfLoan  Home_other  Home_owner  Sex_female  \
0        1300             5.0           1           0           1   
1         350             2.0           0           0           0   
2        1300             5.0           1           0           0   
3         700             5.0           0           0           1   
4         500             1.5           1           0           1   

   MaritalStatus_divorced  MaritalStatus_married  MaritalStatus_single  \
0                       0                      0                  

In [80]:
# let’s choose about 2/3 (0.67%) of the data for learning, and the rest
# for testing the model
# split data into training and testing sets
nData = len(CreditDataDummy.index)
nTrain = round(nData*0.67)
CreditDataTrain = CreditDataDummy[:nTrain]
CreditDataTest = CreditDataDummy[nTrain:]
CreditDataTrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2979 entries, 0 to 2978
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   No                      2979 non-null   int64  
 1   Status                  2979 non-null   int64  
 2   LengthOfEmployment      2979 non-null   int64  
 3   Age                     2979 non-null   int64  
 4   Expenses                2979 non-null   int64  
 5   Income                  2979 non-null   int64  
 6   Assets                  2979 non-null   int64  
 7   Debt                    2979 non-null   int64  
 8   LoanAmount              2979 non-null   int64  
 9   DurationOfLoan          2979 non-null   float64
 10  Home_other              2979 non-null   uint8  
 11  Home_owner              2979 non-null   uint8  
 12  Sex_female              2979 non-null   uint8  
 13  MaritalStatus_divorced  2979 non-null   uint8  
 14  MaritalStatus_married   2979 non-null   

In [9]:
# import statistics package
import statsmodels.formula.api as smf


In [84]:
# estimate logistic regression model on the training data
all_columns1 = "+".join(CreditDataTrain.columns.difference(['No','Status']))
print(all_columns1)
formula1 = "Status~" + all_columns1 
print(formula1)
model = smf.logit(formula=formula1, data = CreditDataTrain).fit()

# results of model fit
print(model.summary())
print(model.params)
print(model.pvalues)

Age+Assets+Debt+DurationOfLoan+Expenses+Home_other+Home_owner+Income+Job_fixed+Job_freelance+Job_others+LengthOfEmployment+LoanAmount+MaritalStatus_divorced+MaritalStatus_married+MaritalStatus_single+Records_no+Sex_female
Status~Age+Assets+Debt+DurationOfLoan+Expenses+Home_other+Home_owner+Income+Job_fixed+Job_freelance+Job_others+LengthOfEmployment+LoanAmount+MaritalStatus_divorced+MaritalStatus_married+MaritalStatus_single+Records_no+Sex_female
Optimization terminated successfully.
         Current function value: 0.453889
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                 Status   No. Observations:                 2979
Model:                          Logit   Df Residuals:                     2960
Method:                           MLE   Df Model:                           18
Date:                Tue, 05 Jan 2021   Pseudo R-squ.:                  0.2373
Time:                        15:01:14   Log-Likeliho

In [86]:
# remove variables with p-values greater than 0.01 and retrain the model
#Age, DurationOfLoan, Home_other, Job_others,MaritalStatus_married,MaritalStatus_single,Sex_female
## create dummy variables for Home and MaritalStatus
all_columns2 = "+".join(CreditDataTrain.columns.difference(['No','Status','Age', 'DurationOfLoan', \
'Home_other', 'Job_others','MaritalStatus_married','MaritalStatus_single','Sex_female']))
print(all_columns2)
formula2 = "Status~" + all_columns2 
print(formula2)
model2 = smf.logit(formula=formula2, data = CreditDataTrain).fit()

# results of model fit
print(model2.summary())
print(model2.params)
print(model2.pvalues)

Assets+Debt+Expenses+Home_owner+Income+Job_fixed+Job_freelance+LengthOfEmployment+LoanAmount+MaritalStatus_divorced+Records_no
Status~Assets+Debt+Expenses+Home_owner+Income+Job_fixed+Job_freelance+LengthOfEmployment+LoanAmount+MaritalStatus_divorced+Records_no
Optimization terminated successfully.
         Current function value: 0.456069
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                 Status   No. Observations:                 2979
Model:                          Logit   Df Residuals:                     2967
Method:                           MLE   Df Model:                           11
Date:                Tue, 05 Jan 2021   Pseudo R-squ.:                  0.2337
Time:                        15:03:30   Log-Likelihood:                -1358.6
converged:                       True   LL-Null:                       -1772.9
Covariance Type:            nonrobust   LLR p-value:                1.454e-170
     

In [103]:
#test the accuracy of the model with test dataset
CreditDataTest['Pred2'] = model2.predict(CreditDataTest)
CreditDataTest['PredStatus'] = np.where(CreditDataTest['Pred2']>=0.5, 1,0)
print(CreditDataTest.head(20))
Accuracy = (sum(CreditDataTest['Status']==CreditDataTest['PredStatus']))/len(CreditDataTest)
print(Accuracy)

        No  Status  LengthOfEmployment  Age  Expenses  Income  Assets  Debt  \
2979  2980       0                   5   34        45     137   20000     0   
2980  2981       0                  10   36        90     172    3000     0   
2981  2982       0                  12   28        45     148    3500   144   
2982  2983       0                  35   55        60      84   13000     0   
2983  2984       0                  10   40        90     301    2000     0   
2984  2985       0                  11   29        44      93       0     0   
2985  2986       1                   0   64        35     100    8000     0   
2986  2987       1                   3   26        35     169   60000  3000   
2987  2988       1                  20   43        45     120   10000     0   
2988  2989       0                  26   55        60     110   10000     0   
2989  2990       0                  13   31        45     180    7500  1400   
2990  2991       0                   1   64        7

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CreditDataTest['Pred2'] = model2.predict(CreditDataTest)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  CreditDataTest['PredStatus'] = np.where(CreditDataTest['Pred2']>=0.5, 1,0)
