In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from xgboost import to_graphviz

%matplotlib inline

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, mean_absolute_error, mean_squared_error
from sklearn.metrics import plot_confusion_matrix, plot_precision_recall_curve, plot_roc_curve

# Show every column when a DataFrame is rendered (None removes the column cap).
pd.set_option("display.max_columns", None)

In [2]:
# Read the Galton families table from a CSV file in the working directory.
DATA_PATH = "GaltonFamilies.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the loaded frame; for a long frame pandas shows only the first and last rows.
df

Unnamed: 0,Case,family,father,mother,midparentHeight,children,childNum,gender,childHeight
0,1,1,78.5,67.0,75.43,4,1,male,73.2
1,2,1,78.5,67.0,75.43,4,2,female,69.2
2,3,1,78.5,67.0,75.43,4,3,female,69.0
3,4,1,78.5,67.0,75.43,4,4,female,69.0
4,5,2,75.5,66.5,73.66,4,1,male,73.5
...,...,...,...,...,...,...,...,...,...
929,930,203,62.0,66.0,66.64,3,1,male,64.0
930,931,203,62.0,66.0,66.64,3,2,female,62.0
931,932,203,62.0,66.0,66.64,3,3,female,61.0
932,933,204,62.5,63.0,65.27,2,1,male,66.5


In [4]:
# Column dtypes and non-null counts, to see which columns are non-numeric.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 934 entries, 0 to 933
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Case             934 non-null    int64  
 1   family           934 non-null    object 
 2   father           934 non-null    float64
 3   mother           934 non-null    float64
 4   midparentHeight  934 non-null    float64
 5   children         934 non-null    int64  
 6   childNum         934 non-null    int64  
 7   gender           934 non-null    object 
 8   childHeight      934 non-null    float64
dtypes: float64(4), int64(3), object(2)
memory usage: 65.8+ KB


In [5]:
# Summary statistics for the numeric columns (object columns are left out by default).
df.describe()

Unnamed: 0,Case,father,mother,midparentHeight,children,childNum,childHeight
count,934.0,934.0,934.0,934.0,934.0,934.0,934.0
mean,467.5,69.197109,64.089293,69.206773,6.171306,3.585653,66.745931
std,269.766875,2.476479,2.290886,1.80237,2.729025,2.36141,3.579251
min,1.0,62.0,58.0,64.4,1.0,1.0,56.0
25%,234.25,68.0,63.0,68.14,4.0,2.0,64.0
50%,467.5,69.0,64.0,69.248,6.0,3.0,66.5
75%,700.75,71.0,65.875,70.14,8.0,5.0,69.7
max,934.0,78.5,70.5,75.43,15.0,15.0,79.0


In [6]:
# Count missing values in each column (isna is the canonical name for isnull).
df.isna().sum()

Case               0
family             0
father             0
mother             0
midparentHeight    0
children           0
childNum           0
gender             0
childHeight        0
dtype: int64

In [None]:
df = df.select_dtypes(exclude=['object'])

In [None]:
df

In [None]:
sns.countplot(df['CreditStatus'])
plt.show()

In [None]:
df.describe()

In [None]:
X = df.iloc[:,0:7]
y = df.iloc[:,7]

In [None]:
X.values, y.values

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

In [None]:
X_train.shape, X_test.shape

In [None]:
xgb = XGBClassifier(random_state=0, n_estimators=200)

In [None]:
xgb.fit(X_train,y_train,eval_set=[(X_test,y_test)],eval_metric='error',early_stopping_rounds=20)

In [None]:
y_pred = xgb.predict(X_test)

In [None]:
y_pred

In [None]:
plot_confusion_matrix(xgb,X_test,y_test)
plt.show()

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
plot_roc_curve(xgb,X_test,y_test)
plt.show()