## Encoding a Feature Vector

In [1]:
import pandas as pd

# Location of the CSV; 'NA' and '?' entries are parsed as missing values.
DATA_URL = "https://data.heatonresearch.com/data/t81-558/jh-simple-dataset.csv"
MISSING_MARKERS = ['NA', '?']

df = pd.read_csv(DATA_URL, na_values=MISSING_MARKERS)

# Keep the rendered preview compact: at most 9 columns and 5 rows.
for option, limit in (('display.max_columns', 9), ('display.max_rows', 5)):
    pd.set_option(option, limit)

display(df)

Unnamed: 0,id,job,area,income,...,pop_dense,retail_dense,crime,product
0,1,vv,c,50876.0,...,0.885827,0.492126,0.071100,b
1,2,kd,c,60369.0,...,0.874016,0.342520,0.400809,c
...,...,...,...,...,...,...,...,...,...
1998,1999,qp,c,67949.0,...,0.909449,0.598425,0.117803,c
1999,2000,pe,c,61467.0,...,0.925197,0.539370,0.451973,c


## To begin with, we will convert the job code into dummy variables.

In [2]:
# One-hot encode the categorical job code: one "job_<code>" column per distinct value.
# The dtype is pinned to uint8 so the indicators are numeric 0/1 on every pandas
# version; pandas >= 2.0 would otherwise produce bool columns, which turn a mixed
# float/bool feature matrix into an object array when `.values` is taken.
dummies = pd.get_dummies(df['job'], prefix="job", dtype='uint8')
print(dummies.shape)

# Limit the preview to 9 columns and 10 rows.
pd.set_option('display.max_columns', 9)
pd.set_option('display.max_rows', 10)

display(dummies)

(2000, 33)


Unnamed: 0,job_11,job_al,job_am,job_ax,...,job_rn,job_sa,job_vv,job_zz
0,0,0,0,0,...,0,0,1,0
1,0,0,0,0,...,0,0,0,0
2,0,0,0,0,...,0,0,0,0
3,1,0,0,0,...,0,0,0,0
4,0,0,0,0,...,0,0,0,0
...,...,...,...,...,...,...,...,...,...
1995,0,0,0,0,...,0,0,1,0
1996,0,0,0,0,...,0,0,0,0
1997,0,0,0,0,...,0,0,0,0
1998,0,0,0,0,...,0,0,0,0


In [3]:
# Swap the raw 'job' column for its dummy columns, which are appended on the
# right-hand side of the frame.
df = pd.concat([df.drop(columns='job'), dummies], axis=1)

# Limit the preview to 9 columns and 10 rows.
for option, limit in (('display.max_columns', 9), ('display.max_rows', 10)):
    pd.set_option(option, limit)

display(df)

Unnamed: 0,id,area,income,aspect,...,job_rn,job_sa,job_vv,job_zz
0,1,c,50876.0,13.100000,...,0,0,1,0
1,2,c,60369.0,18.625000,...,0,0,0,0
2,3,c,55126.0,34.766667,...,0,0,0,0
3,4,c,51690.0,15.808333,...,0,0,0,0
4,5,d,28347.0,40.941667,...,0,0,0,0
...,...,...,...,...,...,...,...,...,...
1995,1996,c,51017.0,38.233333,...,0,0,1,0
1996,1997,d,26576.0,33.358333,...,0,0,0,0
1997,1998,d,28595.0,39.425000,...,0,0,0,0
1998,1999,c,67949.0,5.733333,...,0,0,0,0


## We also introduce dummy variables for the area column.

In [4]:
# One-hot encode the categorical 'area' column and replace it with the resulting
# "area_<value>" indicator columns, appended on the right-hand side of the frame.
# The dtype is pinned to uint8 so the indicators are numeric 0/1 on every pandas
# version (pandas >= 2.0 would otherwise produce bool columns).
area_dummies = pd.get_dummies(df['area'], prefix="area", dtype='uint8')
df = pd.concat([df.drop(columns='area'), area_dummies], axis=1)

# Limit the preview to 9 columns and 10 rows.
pd.set_option('display.max_columns', 9)
pd.set_option('display.max_rows', 10)
display(df)

Unnamed: 0,id,income,aspect,subscriptions,...,area_a,area_b,area_c,area_d
0,1,50876.0,13.100000,1,...,0,0,1,0
1,2,60369.0,18.625000,2,...,0,0,1,0
2,3,55126.0,34.766667,1,...,0,0,1,0
3,4,51690.0,15.808333,1,...,0,0,1,0
4,5,28347.0,40.941667,3,...,0,0,0,1
...,...,...,...,...,...,...,...,...,...
1995,1996,51017.0,38.233333,1,...,0,0,1,0
1996,1997,26576.0,33.358333,2,...,0,0,0,1
1997,1998,28595.0,39.425000,3,...,0,0,0,1
1998,1999,67949.0,5.733333,0,...,0,0,1,0



## The last remaining transformation is to fill in missing income values with the median income.

In [5]:
# Impute missing incomes with the median of the observed income values.
income = df['income']
med = income.median()
df['income'] = income.fillna(med)

In [6]:
# List every column name currently in the frame.
print(df.columns.tolist())


['id', 'income', 'aspect', 'subscriptions', 'dist_healthy', 'save_rate', 'dist_unhealthy', 'age', 'pop_dense', 'retail_dense', 'crime', 'product', 'job_11', 'job_al', 'job_am', 'job_ax', 'job_bf', 'job_by', 'job_cv', 'job_de', 'job_dz', 'job_e2', 'job_f8', 'job_gj', 'job_gv', 'job_kd', 'job_ke', 'job_kl', 'job_kp', 'job_ks', 'job_kw', 'job_mm', 'job_nb', 'job_nn', 'job_ob', 'job_pe', 'job_po', 'job_pq', 'job_pz', 'job_qp', 'job_qw', 'job_rn', 'job_sa', 'job_vv', 'job_zz', 'area_a', 'area_b', 'area_c', 'area_d']


In [7]:
# Input columns: everything except the 'product' and 'id' columns.
x_columns = df.columns.drop(['product', 'id'])
print(x_columns.tolist())

['income', 'aspect', 'subscriptions', 'dist_healthy', 'save_rate', 'dist_unhealthy', 'age', 'pop_dense', 'retail_dense', 'crime', 'job_11', 'job_al', 'job_am', 'job_ax', 'job_bf', 'job_by', 'job_cv', 'job_de', 'job_dz', 'job_e2', 'job_f8', 'job_gj', 'job_gv', 'job_kd', 'job_ke', 'job_kl', 'job_kp', 'job_ks', 'job_kw', 'job_mm', 'job_nb', 'job_nn', 'job_ob', 'job_pe', 'job_po', 'job_pq', 'job_pz', 'job_qp', 'job_qw', 'job_rn', 'job_sa', 'job_vv', 'job_zz', 'area_a', 'area_b', 'area_c', 'area_d']


## Generate X and Y for a Classification Neural Network

In [8]:
# Convert to numpy - Classification
# x: values of every column except 'product' and 'id'.
x_columns = df.columns.drop('product').drop('id')
x = df[x_columns].values

# y: one indicator column per distinct 'product' value. The dtype is pinned to
# uint8 so the target is numeric 0/1 on every pandas version (pandas >= 2.0
# would otherwise return bool columns).
dummies = pd.get_dummies(df['product'], dtype='uint8') # Classification
products = dummies.columns  # product labels, in the same order as the columns of y
y = dummies.values

In [9]:
# Display the dummy-encoded 'product' target array.
y

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0]], dtype=uint8)