In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# load the data
file_url = 'https://archive.ics.uci.edu/static/public/222/data.csv'
df = pd.read_csv(file_url)

# convert target var to numerical
df.y = df.y.map({'yes':1,'no':0})

# fill na
df.fillna('unknown', inplace=True)

# drop duration
df.drop('duration', axis=1, inplace=True)

from sklearn.model_selection import train_test_split

# split the data into train/val/test with 60%/20%/20%
df_full, df_test = train_test_split(df, test_size=np.round(len(df)*.2).astype(int), random_state=42)
df_train, df_val = train_test_split(df_full, test_size=np.round(len(df)*.2).astype(int), random_state=42)

In [11]:
cat = df_train.columns[df_train.dtypes=='object'].tolist()
num = df_train.columns[df_train.dtypes!='object'].tolist()
features = cat+num

y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

df_train=df_train[features].to_dict(orient='records')
df_val=df_val[features].to_dict(orient='records')
df_test=df_test[features].to_dict(orient='records')

from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
dv.fit(df_train)

X_train = dv.transform(df_train)
X_val = dv.transform(df_val)
X_test = dv.transform(df_test)

In [12]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import roc_curve, roc_auc_score

scores = []

# TRAINING THE REGRESSION MODEL:
for depth in range(1,11,1):
    dt = DecisionTreeRegressor(max_depth=depth)
    dt.fit(X_train, y_train)

    y_pred = dt.predict(X_val)
    scores.append((
        depth, 
        roc_auc_score(y_train, dt.predict(X_train)),
        roc_auc_score(y_val, y_pred)
        ))

In [21]:
pd.Series(y_train).value_counts()

0    23979
1     3148
Name: count, dtype: int64

In [23]:
pd.Series(y_val).value_counts()

0    7992
1    1050
Name: count, dtype: int64

In [24]:
pd.Series(y_pred).value_counts()

0.0    7992
1.0    1050
Name: count, dtype: int64