In [1]:
#data prediction using random forest classifier

In [2]:
#importing all the libraries required

In [3]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score  
from sklearn.metrics import classification_report

In [4]:
#importing the train and test data

In [5]:
train=pd.read_csv("./train.csv")
test=pd.read_csv("./test.csv")

In [6]:
#dropping the dependents column from the datasets as it does not plays role in loan prediction

In [7]:
train=train.drop(columns=["Dependents"])
test=test.drop(columns=["Dependents"])

In [8]:
#number of null values in train and test dataset 

In [9]:
train.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [10]:
test.isnull().sum()

Loan_ID               0
Gender               11
Married               0
Education             0
Self_Employed        23
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount            5
Loan_Amount_Term      6
Credit_History       29
Property_Area         0
dtype: int64

In [11]:
#filling NaNs of columns loan amount and loan amount term with their respective mean of the column

In [12]:
train.LoanAmount=train.LoanAmount.fillna(train.LoanAmount.mean())
train.Loan_Amount_Term=train.Loan_Amount_Term.fillna(train.LoanAmount.mean())
test.LoanAmount=test.LoanAmount.fillna(test.LoanAmount.mean())
test.Loan_Amount_Term=test.Loan_Amount_Term.fillna(test.LoanAmount.mean())

In [13]:
#removing the remaining NaNs with frequently ocurring values

In [14]:
columns=["Gender","Self_Employed","Credit_History","Married"]
for col in columns:
    train[col]=train[col].fillna(train[col].value_counts().idxmax())
    test[col]=test[col].fillna(test[col].value_counts().idxmax())

In [15]:
#converting categorical data or text data into numbers 

In [16]:
le = preprocessing.LabelEncoder()

le.fit(train["Gender"])
train["Gender"]=le.transform(train["Gender"])
test["Gender"]=le.transform(test["Gender"])

le.fit(train["Married"])
train["Married"]=le.transform(train["Married"])
test["Married"]=le.transform(test["Married"])

le.fit(train["Education"])
train["Education"]=le.transform(train["Education"])
test["Education"]=le.transform(test["Education"])

le.fit(train["Property_Area"])
train["Property_Area"]=le.transform(train["Property_Area"])
test["Property_Area"]=le.transform(test["Property_Area"])

le.fit(train["Self_Employed"])
train["Self_Employed"]=le.transform(train["Self_Employed"])
test["Self_Employed"]=le.transform(test["Self_Employed"])


le.fit(train["Loan_Status"])
train["Loan_Status"]=le.transform(train["Loan_Status"])

In [17]:
#slicing to get labels and target

In [18]:
train_x=train.iloc[:,1:11]
train_y=train.iloc[:,-1]

In [19]:
#splitting the train dataset(training only on 80% of the train dataset)

In [20]:
train_x_train,train_x_test,train_y_train,train_y_test= train_test_split(train_x,train_y,test_size = 0.3,random_state = 42)

In [21]:
#fitting the splitted 80% in random forest classifier

In [22]:
model = RandomForestClassifier(n_estimators=100, random_state=42, max_features = 'auto',n_jobs=-1, verbose = 1)
model.fit(train_x_train,train_y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.3s finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=1,
                       warm_start=False)

In [23]:
#making the prediction on the remaining 20% and comparing it with the actual results

In [24]:
predictions = model.predict(train_x_test)
actual = train_y_test 
predicted = predictions
results = confusion_matrix(actual, predicted) 

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished


In [25]:
#printing the results using confusion matrix

In [26]:
print ('Confusion Matrix :')
print(results) 
print ('Accuracy Score :',accuracy_score(actual, predicted)) 
print ('Report : ')
print (classification_report(actual, predicted) )

Confusion Matrix :
[[ 29  36]
 [  7 113]]
Accuracy Score : 0.7675675675675676
Report : 
              precision    recall  f1-score   support

           0       0.81      0.45      0.57        65
           1       0.76      0.94      0.84       120

    accuracy                           0.77       185
   macro avg       0.78      0.69      0.71       185
weighted avg       0.77      0.77      0.75       185



In [27]:
#using the model to predict the test dataset

In [28]:
ans=model.predict(test.drop('Loan_ID', axis=1))

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished


In [29]:
#inverse transforming the answer we get on predicting

In [30]:
ans=le.inverse_transform(ans)

In [31]:
#writing the final output in csv file

In [32]:
pd.DataFrame(data = {"Loan_ID":test["Loan_ID"], "Loan_Status":ans})
result = pd.DataFrame(data = {"Loan_ID":test["Loan_ID"], "Loan_Status":ans})
result.to_csv("./Random_Forest.csv",index=False)