In [43]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


In [44]:
base_credit = pd.read_csv('/content/drive/MyDrive/Projetos/ML/Census and Base_credit/credit_data.csv')
base_credit

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.642260,0
4,5,66952.688845,18.584336,8770.099235,1
...,...,...,...,...,...
1995,1996,59221.044874,48.518179,1926.729397,0
1996,1997,69516.127573,23.162104,3503.176156,0
1997,1998,44311.449262,28.017167,5522.786693,1
1998,1999,43756.056605,63.971796,1622.722598,0


In [45]:
base_credit.describe()

Unnamed: 0,clientid,income,age,loan,default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.807559,4444.369695,0.1415
std,577.494589,14326.327119,13.624469,3045.410024,0.348624
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,500.75,32796.459717,28.990415,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


In [46]:
base_credit.loc[base_credit['age'] <= 0, 'age']

15   -28.218361
21   -52.423280
26   -36.496976
Name: age, dtype: float64

In [47]:
media = base_credit['age'].mean()

In [48]:
base_credit.loc[base_credit['age'] <= 0, 'age'] = media

In [49]:
base_credit.loc[[15,21,26], 'age']

15    40.807559
21    40.807559
26    40.807559
Name: age, dtype: float64

In [50]:
base_credit.isnull().sum()

clientid    0
income      0
age         3
loan        0
default     0
dtype: int64

In [51]:
base_credit.update(base_credit['age'].fillna(media))

In [52]:
base_credit.isnull().sum()

clientid    0
income      0
age         0
loan        0
default     0
dtype: int64

In [53]:
base_credit.head()

Unnamed: 0,clientid,income,age,loan,default
0,1,66155.925095,59.017015,8106.532131,0
1,2,34415.153966,48.117153,6564.745018,0
2,3,57317.170063,63.108049,8020.953296,0
3,4,42709.534201,45.751972,6103.64226,0
4,5,66952.688845,18.584336,8770.099235,1


In [54]:
X_credit = base_credit.iloc[:, 1:4]
X_credit

Unnamed: 0,income,age,loan
0,66155.925095,59.017015,8106.532131
1,34415.153966,48.117153,6564.745018
2,57317.170063,63.108049,8020.953296
3,42709.534201,45.751972,6103.642260
4,66952.688845,18.584336,8770.099235
...,...,...,...
1995,59221.044874,48.518179,1926.729397
1996,69516.127573,23.162104,3503.176156
1997,44311.449262,28.017167,5522.786693
1998,43756.056605,63.971796,1622.722598


In [55]:
y_credit = base_credit.iloc[:, 4]
y_credit

0       0
1       0
2       0
3       0
4       1
       ..
1995    0
1996    0
1997    1
1998    0
1999    0
Name: default, Length: 2000, dtype: int64

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X_credit,y_credit, test_size=0.3, random_state=0)
gnb = GaussianNB()
y_predict = gnb.fit(X_train, y_train).predict(X_test)


In [57]:
len(y_predict)

600

In [58]:
ac_naive = accuracy_score(y_test, y_predict)
ac_naive

0.935

In [59]:
confusion_matrix(y_test,y_predict)

array([[515,   9],
       [ 30,  46]])

Arvore de decisão

In [60]:
tree_decision = DecisionTreeClassifier()

In [61]:
prev = tree_decision.fit(X_train, y_train).predict(X_test)

In [62]:
ac_decision_tree = accuracy_score(y_test, prev)
ac_decision_tree

0.9833333333333333

In [63]:
confusion_matrix(y_test,prev)

array([[516,   8],
       [  2,  74]])

Random Forest

In [64]:
random_forest = RandomForestClassifier(n_estimators=115, criterion='entropy', random_state=0)

In [65]:
random_forest_prev = random_forest.fit(X_train, y_train).predict(X_test)

In [66]:
ac_random_forest= accuracy_score(y_test, random_forest_prev)
ac_random_forest

0.985

In [67]:
confusion_matrix(y_test,random_forest_prev)

array([[518,   6],
       [  3,  73]])

In [68]:
df_comparacao = pd.DataFrame(columns=['algoritmo', 'accuracy'], index=[0,1,2])

In [69]:
df_comparacao.loc[0]['algoritmo'] = 'Naive_Bayes'
df_comparacao.loc[1]['algoritmo'] = 'Decision_Tree'
df_comparacao.loc[2]['algoritmo'] = 'Random_Forest'
df_comparacao.loc[0]['accuracy'] = ac_naive
df_comparacao.loc[1]['accuracy'] = ac_decision_tree
df_comparacao.loc[2]['accuracy'] = ac_random_forest

In [70]:
df_comparacao

Unnamed: 0,algoritmo,accuracy
0,Naive_Bayes,0.935
1,Decision_Tree,0.983333
2,Random_Forest,0.985
