### Example 16: One-Hot-Encoding

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# load the starting-salary dataset and preview its first rows
DATA_FILE = '../Data/starting_salaries.csv'
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0,GPA,unemployment,major,salary
0,2.6,6.1,Eng,53100.0
1,2.6,5.6,CS,68700.0
2,2.8,6.5,CS,72800.0
3,2.8,5.0,CS,72100.0
4,3.1,6.1,MA,74800.0


3 features for predicting salary:
1. GPA
2. Unemployment
3. Major

Major is a categorical (text) feature, so we need to use one-hot encoding to convert it into numeric columns

In [10]:
# z-score every numeric column: subtract its mean, then divide by its
# standard deviation. Work on a copy so df keeps the raw values, which are
# needed later to undo the scaling of the predictions.
df_s = df.copy()
for col in ('GPA', 'unemployment', 'salary'):
    df_s[col] = (df_s[col] - df_s[col].mean()) / df_s[col].std()

In [11]:
# one-hot-encode major
# Name the column explicitly so that only `major` is expanded (into
# major_CS, major_Eng and major_MA), and request float indicators so the
# 0.0/1.0 values shown below do not depend on the pandas version's default
# dummy dtype.
df_h = pd.get_dummies(df_s, columns=['major'], dtype=float)
df_h.head()

Unnamed: 0,GPA,unemployment,salary,major_CS,major_Eng,major_MA
0,-0.587574,0.049555,-1.188463,0.0,1.0,0.0
1,-0.587574,-0.517435,0.182222,1.0,0.0,0.0
2,-0.048515,0.503147,0.542466,1.0,0.0,0.0
3,-0.048515,-1.197824,0.480961,1.0,0.0,0.0
4,0.760073,0.049555,0.718195,0.0,0.0,1.0


Now the model looks like
    
    y =   w_1 * GPA
        + w_2 * unemployment
        + w_3 * major_CS
        + w_4 * major_Eng
        + w_5 * major_MA
        + b
        
Once the model is trained, the coefficients tell us how strongly each feature (including each major category) influences the predicted salary

In [13]:
# pull each model column out of df_h as a single-precision (float32)
# NumPy array, one array per column, in the order listed
GPA, unemployment, salary, major_CS, major_Eng, major_MA = (
    df_h[name].values.astype(np.float32)
    for name in ('GPA', 'unemployment', 'salary',
                 'major_CS', 'major_Eng', 'major_MA')
)

In [14]:
# define TensorFlow computation graph

# input features and target as graph constants
x1 = tf.constant(GPA)
x2 = tf.constant(unemployment)
x3 = tf.constant(major_CS)
x4 = tf.constant(major_Eng)
x5 = tf.constant(major_MA)
y = tf.constant(salary)

# trainable feature weights and bias, all starting at 1.0
w1 = tf.Variable(1.0)
w2 = tf.Variable(1.0)
w3 = tf.Variable(1.0)
w4 = tf.Variable(1.0)
w5 = tf.Variable(1.0)
b  = tf.Variable(1.0)

# linear model prediction; built once and reused by the loss below so the
# trained model and the reported predictions cannot drift apart
# NOTE: exactly one of x3/x4/x5 is 1 for each row, so together with b the
# model is over-parameterized and w3, w4, w5, b are not uniquely determined.
y_pred = w1*x1 + w2*x2 + w3*x3 + w4*x4 + w5*x5 + b

# define error (loss) function as MSE
MSE = tf.reduce_mean(tf.square(y_pred - y))

# define optimizer: plain gradient descent with a fixed step size
STEPSIZE = 0.1
optimizer = tf.train.GradientDescentOptimizer(STEPSIZE).minimize(MSE)

# initialization routine
# (tf.global_variables_initializer replaces the deprecated
# tf.initialize_all_variables)
init = tf.global_variables_initializer()

NameError: name 'tf' is not defined

In [None]:
# create a graph session and initialize it
sess = tf.Session()
# running `init` gives every tf.Variable (w1..w5, b) its initial value
sess.run(init)

In [None]:
# run gradient-descent updates, reporting the loss at a fixed interval
MAXSTEPS = 100
REPORT_EVERY = 10
for step in range(MAXSTEPS + 1):
    # one optimizer update; the MSE value is fetched in the same run call
    _, MSEo = sess.run([optimizer, MSE])
    if step % REPORT_EVERY != 0:
        continue
    print('step = %-5d MSE = %-10f' %  (step,MSEo))
print('done !')

# evaluate the model output with the trained weights (standardized units)
salary_s_pred = sess.run(y_pred)

In [None]:
# undo the z-scoring of the salary predictions: scale by the raw salary
# standard deviation and shift by the raw salary mean
salary_mean = df['salary'].mean()
salary_std = df['salary'].std()
salary_pred = salary_s_pred * salary_std + salary_mean

In [None]:
# store the predictions, rounded to whole numbers, as a new column of df
df['salary_pred'] = np.round(salary_pred)
df.head(5)

In [None]:
# compute mean square error (and its square root) in original salary units
residuals = df['salary'].values - salary_pred
mse = np.mean(residuals ** 2)
print('MSE = ',mse)
print('RMSE = ',np.sqrt(mse))

### Example 19 (Numpy Array Indexing)

In [15]:
import numpy as np

In [16]:
# 1-D array of the integers 1 through 12 (the stop value 13 is excluded)
a = np.arange(1,13)
a

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

In [17]:
# a step of -1 walks the array backwards, giving its elements in reverse order
a[::-1]

array([12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1])

In [18]:
# arrange the 12 elements as 3 rows by 4 columns, filled row by row
A = a.reshape((3,4))
A

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12]])

In [19]:
# single element at row index 2, column index 1 (indices start at 0)
print(A[2,1])

10


In [20]:
# sub-block: rows 0-1 and columns 1-2 (the end index of each slice is excluded)
print(A[0:2,1:3])

[[2 3]
 [6 7]]


In [21]:
# all rows, every second column starting from column 0
print(A[:,::2])

[[ 1  3]
 [ 5  7]
 [ 9 11]]


In [22]:
# all rows, with the column order reversed
print(A[:,::-1])

[[ 4  3  2  1]
 [ 8  7  6  5]
 [12 11 10  9]]
