In [1]:
import pandas as pd
import numpy as np

In [2]:
# Labels for the nine fields of each record: eight predictors, then the label.
columns = [
    "parents",
    "has_nurs",
    "form",
    "children",
    "housing",
    "finance",
    "social",
    "health",
    "target",
]

In [3]:
# Import Dataset
# Column labels come from `columns`; because names are supplied explicitly,
# pandas reads every line of the file as data (header=None spells that out).
df = pd.read_csv("data/nursery.data", header=None, names=columns)
df.head()

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health,target
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended,recommend
1,usual,proper,complete,1,convenient,convenient,nonprob,priority,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended,recommend
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority,priority


In [4]:
# Describe Shape: (number of rows, number of columns) of the loaded frame
df.shape

(12960, 9)

In [5]:
# Identify Target: the 'target' column holds the class label used as y below
df['target']

0         recommend
1          priority
2         not_recom
3         recommend
4          priority
            ...    
12955    spec_prior
12956     not_recom
12957    spec_prior
12958    spec_prior
12959     not_recom
Name: target, Length: 12960, dtype: object

In [6]:
# Inspect class balance: number of rows per target class
df['target'].value_counts()

# Note: the classes could be binned into a binary "recommend" vs. "do not
# recommend" target, but all original classes are kept for this exercise.

not_recom     4320
priority      4266
spec_prior    4044
very_recom     328
recommend        2
Name: target, dtype: int64

In [7]:
# Separate the label from the predictors.
# Single brackets give y as a Series; double brackets would produce a
# one-column DataFrame, which is not needed here.
y = df["target"]
X = df.drop(columns=["target"])

In [8]:
# Preview the predictors; 'target' should no longer be among the columns
X.head()

Unnamed: 0,parents,has_nurs,form,children,housing,finance,social,health
0,usual,proper,complete,1,convenient,convenient,nonprob,recommended
1,usual,proper,complete,1,convenient,convenient,nonprob,priority
2,usual,proper,complete,1,convenient,convenient,nonprob,not_recom
3,usual,proper,complete,1,convenient,convenient,slightly_prob,recommended
4,usual,proper,complete,1,convenient,convenient,slightly_prob,priority


In [9]:
# Preview the label Series that was split off from df
y

0         recommend
1          priority
2         not_recom
3         recommend
4          priority
            ...    
12955    spec_prior
12956     not_recom
12957    spec_prior
12958    spec_prior
12959     not_recom
Name: target, Length: 12960, dtype: object

In [10]:
# Train-Test Split with 20% test set, random state 42

from sklearn.model_selection import train_test_split

In [11]:
# Hold out 20% of the rows as the final test set; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

Let's set X_test, y_test aside and not touch it until the end.

We will build our model on X_train, y_train exclusively.

Perform a secondary train-test split with the same proportion of test size on the train set (.20) and random state 42.

starter code:  
`X_t, X_val, y_t, y_val = `

In [12]:
# Your code here
from sklearn.model_selection import train_test_split

In [13]:
# Second split, applied to the training portion only: 20% of X_train becomes
# the validation set, using the same seed as the first split.
X_t, X_val, y_t, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.20,
    random_state=42,
)

Fit a one hot encoder on the X_t set.

In [14]:
from sklearn.preprocessing import OneHotEncoder

In [15]:
# Instantiate the encoder. Do not drop the first category here.
# Ask for dense output: a sparse matrix stores only the non-zero entries and
# saves memory, while a dense array is easier to inspect and to wrap in a
# DataFrame.
try:
    # scikit-learn >= 1.2 names this option `sparse_output`
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    # older releases only accept the original `sparse` keyword
    ohe = OneHotEncoder(sparse=False)

In [16]:
# Inspect the encoder's attributes before fitting: only constructor settings so far
ohe.__dict__

{'categories': 'auto',
 'sparse': False,
 'dtype': numpy.float64,
 'handle_unknown': 'error',
 'drop': None}

In [17]:
# Fit ohe to X_t only, so the encoder's categories are learned without
# looking at the validation or test rows
ohe.fit(X_t)

OneHotEncoder(sparse=False)

In [18]:
# Inspect again after fitting: learned attributes such as `categories_`
# now appear next to the constructor settings
ohe.__dict__

{'categories': 'auto',
 'sparse': False,
 'dtype': numpy.float64,
 'handle_unknown': 'error',
 'drop': None,
 'categories_': [array(['great_pret', 'pretentious', 'usual'], dtype=object),
  array(['critical', 'improper', 'less_proper', 'proper', 'very_crit'],
        dtype=object),
  array(['complete', 'completed', 'foster', 'incomplete'], dtype=object),
  array(['1', '2', '3', 'more'], dtype=object),
  array(['convenient', 'critical', 'less_conv'], dtype=object),
  array(['convenient', 'inconv'], dtype=object),
  array(['nonprob', 'problematic', 'slightly_prob'], dtype=object),
  array(['not_recom', 'priority', 'recommended'], dtype=object)],
 'drop_idx_': None}

In [19]:
# Transform X_t with the fitted encoder (the result is a plain array)
X_t_encoded = ohe.transform(X_t)

# Newer scikit-learn releases replaced `get_feature_names` with
# `get_feature_names_out`; use whichever this installation provides.
if hasattr(ohe, "get_feature_names_out"):
    ohe_columns = ohe.get_feature_names_out(X_t.columns)
else:
    ohe_columns = ohe.get_feature_names(X_t.columns)

# Convert to DataFrame with columns and indices reinstated
X_t_ohe = pd.DataFrame(X_t_encoded, index=X_t.index, columns=ohe_columns)

# hint: use ohe.get_feature_names_out() (ohe.get_feature_names() on older
# scikit-learn) as well as X_t.index

In [20]:
# Preview the encoded training rows: one 0/1 column per (feature, category) pair
X_t_ohe.head()

Unnamed: 0,parents_great_pret,parents_pretentious,parents_usual,has_nurs_critical,has_nurs_improper,has_nurs_less_proper,has_nurs_proper,has_nurs_very_crit,form_complete,form_completed,...,housing_critical,housing_less_conv,finance_convenient,finance_inconv,social_nonprob,social_problematic,social_slightly_prob,health_not_recom,health_priority,health_recommended
2056,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
4853,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
10536,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5689,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1092,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


In [21]:
# Transform X_val with the encoder already fitted on X_t (transform only, no refit)
X_val_ohe = ohe.transform(X_val)

In [22]:
# Convert to DataFrame and reattach column names and indices.
# Newer scikit-learn releases replaced `get_feature_names` with
# `get_feature_names_out`; use whichever this installation provides.
if hasattr(ohe, "get_feature_names_out"):
    val_columns = ohe.get_feature_names_out(X_val.columns)
else:
    val_columns = ohe.get_feature_names(X_val.columns)
X_val_ohe = pd.DataFrame(X_val_ohe, index=X_val.index, columns=val_columns)

In [23]:
# Preview the encoded validation rows; the columns should match X_t_ohe
X_val_ohe.head()

Unnamed: 0,parents_great_pret,parents_pretentious,parents_usual,has_nurs_critical,has_nurs_improper,has_nurs_less_proper,has_nurs_proper,has_nurs_very_crit,form_complete,form_completed,...,housing_critical,housing_less_conv,finance_convenient,finance_inconv,social_nonprob,social_problematic,social_slightly_prob,health_not_recom,health_priority,health_recommended
7334,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3678,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
6362,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
9738,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
9874,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0


In [24]:
# Fit a decision tree and score it on both X_t, y_t and X_val, y_val

In [25]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree so repeated runs build the same model, consistent with the
# fixed random_state used for both train/test splits.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_t_ohe, y_t)
# For a classifier, .score() returns mean accuracy; this is the training-set figure.
dt.score(X_t_ohe, y_t)

1.0

In [26]:
# Score on validation set: accuracy on rows the tree did not see during fitting
dt.score(X_val_ohe, y_val)

0.9942140790742526

In [27]:
# Preprocess the entire training set as one unit: build a fresh encoder and
# fit it on all of X_train. The encoded result is kept as a plain array
# (no DataFrame wrapper), the same form in which X_test is encoded later.
try:
    # scikit-learn >= 1.2 names this option `sparse_output`
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    # older releases only accept the original `sparse` keyword
    ohe = OneHotEncoder(sparse=False)
X_train_ohe = ohe.fit_transform(X_train)
                   


In [28]:
# Fit to entire training set
from sklearn.tree import DecisionTreeClassifier

# Refit a new tree on the full training set. The seed makes the fitted model
# repeatable, consistent with the random_state used for the data splits.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_ohe, y_train)
# Training-set accuracy of the refit model
dt.score(X_train_ohe, y_train)

1.0

In [29]:
# Preprocess the entire test set: reuse the encoder fitted on X_train
# (transform only, never refit on test data)

X_test_ohe = ohe.transform(X_test)

In [37]:
# Score test set
# If this differs from the training accuracy, look at where the errors fall:
# the 'recommend' class has only 2 records in the whole dataset, so it is a
# likely source of mistakes.
dt.score(X_test_ohe, y_test)

0.9984567901234568

In [31]:
# Predicted class label for every test row
dt.predict(X_test_ohe)

array(['not_recom', 'spec_prior', 'priority', ..., 'not_recom',
       'spec_prior', 'not_recom'], dtype=object)

In [36]:
# Count how many test rows were assigned to each class; a class that is never
# predicted does not appear in the result
np.unique(dt.predict(X_test_ohe), return_counts=True)

(array(['not_recom', 'priority', 'spec_prior', 'very_recom'], dtype=object),
 array([870, 871, 785,  66], dtype=int64))