In [1]:
import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier as DTC, export_graphviz
import pydot
import pydotplus

import time

In [2]:
# Load the raw training table from disk and print its schema summary
# (column names, non-null counts, dtypes, memory footprint).
Train_data = pd.read_csv(filepath_or_buffer='./data/car_info_train.csv')
Train_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46444 entries, 0 to 46443
Data columns (total 17 columns):
CUST_ID                 46444 non-null object
CUST_SEX                46444 non-null int64
CUST_AGE                46444 non-null float64
CUST_MARRY              11018 non-null object
BUYERPART               46444 non-null int64
CAR_MODEL               46444 non-null object
CAR_AGE                 46444 non-null float64
CAR_PRICE               46444 non-null float64
IS_LOAN                 46444 non-null float64
LOAN_PERIED             46444 non-null int64
LOAN_AMOUNT             46444 non-null float64
F_INSORNOT              46444 non-null float64
ALL_BUYINS_N            46444 non-null float64
DLRSI_CNT               46444 non-null float64
GLASSBUYSEPARATE_CNT    46444 non-null float64
SII_CNT                 46444 non-null float64
IS_LOST                 46444 non-null int64
dtypes: float64(10), int64(4), object(3)
memory usage: 5.5+ MB


In [3]:
# Keep every column whose dtype is not `object`; the resulting column Index
# is what later cells use to pick model features.
non_object_frame = Train_data.select_dtypes(exclude=['object'])
numerical_cols = non_object_frame.columns
print(numerical_cols)

Index(['CUST_SEX', 'CUST_AGE', 'BUYERPART', 'CAR_AGE', 'CAR_PRICE', 'IS_LOAN',
       'LOAN_PERIED', 'LOAN_AMOUNT', 'F_INSORNOT', 'ALL_BUYINS_N', 'DLRSI_CNT',
       'GLASSBUYSEPARATE_CNT', 'SII_CNT', 'IS_LOST'],
      dtype='object')


In [4]:
# Discretise continuous columns into equal-frequency (quantile) bins.
# Maps column name -> number of bins; bins are labelled "1".."n" as strings,
# exactly as the original per-column calls did.
QUANTILE_BINS = {
    'CUST_AGE': 3,
    'CAR_AGE': 5,
    'CAR_PRICE': 4,
    'LOAN_AMOUNT': 4,
}

for col_name, n_bins in QUANTILE_BINS.items():
    # The binned result overwrites the source column, so only bin columns
    # that are still numeric. This keeps the cell idempotent: re-running it
    # no longer feeds already-binned categorical data back into pd.qcut.
    if pd.api.types.is_numeric_dtype(Train_data[col_name]):
        bin_labels = [str(i) for i in range(1, n_bins + 1)]
        Train_data[col_name] = pd.qcut(Train_data[col_name], n_bins, labels=bin_labels)

In [5]:
# Preview the first five rows of the training table.
Train_data.head(n=5)

Unnamed: 0,CUST_ID,CUST_SEX,CUST_AGE,CUST_MARRY,BUYERPART,CAR_MODEL,CAR_AGE,CAR_PRICE,IS_LOAN,LOAN_PERIED,LOAN_AMOUNT,F_INSORNOT,ALL_BUYINS_N,DLRSI_CNT,GLASSBUYSEPARATE_CNT,SII_CNT,IS_LOST
0,49BB37423D72EAABEF899EF02488F30C,2,1,,1,车型2,1,3,1.0,5,3,1.0,1.0,0.0,0.0,0.0,0
1,303CC4F388283ABF7BADB6EF81D3D812,1,1,,1,车型2,1,3,1.0,4,3,1.0,2.0,0.0,0.0,0.0,0
2,5DCFA06D1A001B1E2F3095B097B2A452,1,3,已婚,1,车型2,1,3,1.0,5,3,1.0,1.0,0.0,1.0,0.0,0
3,8B1284CF18DEA31A8C14A5A3554E50DC,1,3,,1,车型2,1,2,1.0,4,3,1.0,2.0,1.0,0.0,0.0,0
4,FCD67DC870371A98FE382852328086BD,2,1,已婚,1,车型2,1,3,1.0,4,2,1.0,2.0,0.0,0.0,0.0,0


In [6]:
# Per-column flag: True when the column contains at least one missing value.
Train_data.isna().any()

CUST_ID                 False
CUST_SEX                False
CUST_AGE                False
CUST_MARRY               True
BUYERPART               False
CAR_MODEL               False
CAR_AGE                 False
CAR_PRICE               False
IS_LOAN                 False
LOAN_PERIED             False
LOAN_AMOUNT             False
F_INSORNOT              False
ALL_BUYINS_N            False
DLRSI_CNT               False
GLASSBUYSEPARATE_CNT    False
SII_CNT                 False
IS_LOST                 False
dtype: bool

In [7]:
# Every non-object column except the target becomes a model feature.
# numerical_cols was computed before the quantile binning, so the binned
# columns (CUST_AGE, CAR_AGE, CAR_PRICE, LOAN_AMOUNT) are still included.
is_feature = numerical_cols != 'IS_LOST'
feature_cols = numerical_cols[is_feature].tolist()

# Split the table into the feature matrix and the target vector.
Y_data = Train_data.loc[:, 'IS_LOST']
X_data = Train_data.loc[:, feature_cols]
X_data


Unnamed: 0,CUST_SEX,CUST_AGE,BUYERPART,CAR_AGE,CAR_PRICE,IS_LOAN,LOAN_PERIED,LOAN_AMOUNT,F_INSORNOT,ALL_BUYINS_N,DLRSI_CNT,GLASSBUYSEPARATE_CNT,SII_CNT
0,2,1,1,1,3,1.0,5,3,1.0,1.0,0.0,0.0,0.0
1,1,1,1,1,3,1.0,4,3,1.0,2.0,0.0,0.0,0.0
2,1,3,1,1,3,1.0,5,3,1.0,1.0,0.0,1.0,0.0
3,1,3,1,1,2,1.0,4,3,1.0,2.0,1.0,0.0,0.0
4,2,1,1,1,3,1.0,4,2,1.0,2.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
46439,2,2,1,5,3,1.0,4,4,1.0,2.0,0.0,1.0,0.0
46440,1,2,1,5,3,1.0,5,4,0.0,1.0,0.0,1.0,0.0
46441,1,2,1,5,3,1.0,5,4,1.0,1.0,1.0,1.0,0.0
46442,1,3,2,5,3,1.0,5,4,1.0,4.0,1.0,1.0,0.0


In [8]:
# Fit an entropy-criterion decision tree on the full feature matrix.
# random_state is pinned so that tie-breaking between equally good splits
# is reproducible across kernel restarts.
dtc = DTC(criterion='entropy', random_state=0)
dtc.fit(X_data, Y_data)

# NOTE: the score below is computed on the same rows the tree was fitted on,
# so it is training accuracy, not an estimate of performance on unseen data.
print('准确率:', dtc.score(X_data, Y_data))

# Export the fitted tree in Graphviz DOT format.
# - The return value is not assigned: binding it to the handle's name would
#   overwrite the open file object inside the `with` block.
# - feature_names is passed as a plain list rather than a pandas Index.
# - The encoding is explicit so the file does not depend on the platform default.
with open('./tree.dot', 'w', encoding='utf-8') as dot_file:
    export_graphviz(dtc, feature_names=list(X_data.columns), out_file=dot_file)

准确率: 0.8581732839548704


In [9]:
# pydot and pydotplus are already imported in the notebook's first cell, so
# they are not re-imported here.
# Parse the DOT file written by the training cell into a pydotplus graph.
# NOTE(review): the recorded output for this cell is an InvocationException
# about missing GraphViz executables; rendering presumably needs the Graphviz
# binaries installed and on PATH — confirm in the target environment.
graph = pydotplus.graph_from_dot_file('./tree.dot')

InvocationException: GraphViz's executables not found