In [1]:
import pandas as pd
import tensorflow as tf

In [2]:
# Show only errors in the logs: set TensorFlow's logging verbosity to ERROR.
tf.logging.set_verbosity(tf.logging.ERROR)

## Reading data

In [3]:
# Load the clustered profiles from CSV and preview the first rows.
profiles = pd.read_csv('profiles_clustered.csv')
profiles.head()

Unnamed: 0,gender,goal,level,cluster
0,m,fit,middle,2
1,f,lose,begin,4
2,f,fit,middle,6
3,m,muscle,begin,1
4,m,muscle,advance,3


Now we need to save the real labels (the `cluster` column) separately and remove that column from the data used to train the model

In [4]:
# Labels: the cluster assigned to each profile.
y_real = profiles['cluster']

In [5]:
# Features only: every column except the `cluster` label.
x_data = profiles.drop(labels=['cluster'], axis='columns')

Train/test data split

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_real,
    test_size=0.3,
    random_state=101,
)

The next step is to define the feature columns

In [7]:
# (feature key, vocabulary) pairs for the three categorical string inputs.
vocabularies = [
    ("gender", ["f", "m"]),
    ("goal", ["fit", "lose", "muscle"]),
    ("level", ["begin", "middle", "advance"]),
]

# One vocabulary-list categorical column per feature, in the order listed above.
gender, goal, level = [
    tf.feature_column.categorical_column_with_vocabulary_list(key, vocabulary)
    for key, vocabulary in vocabularies
]

feature_columns = [gender, goal, level]

## Train Model

In [8]:
# Training input: shuffled batches of 100 rows. num_epochs=None puts no epoch
# limit on the input, so the amount of training is set by `steps` in model.train.
input_fn = tf.estimator.inputs.pandas_input_fn(
    x=x_train,
    y=y_train,
    batch_size=100,
    num_epochs=None,
    shuffle=True,
)

The dataset is pretty simple, so we use a Linear Classifier to train the model

In [9]:
# Derive the number of classes from the labels instead of hard-coding it, so
# the notebook keeps working if the clustering is redone with a different
# number of clusters. The cluster ids are passed to the classifier as labels,
# so every id must be smaller than n_classes; "largest id + 1" guarantees that
# even if some intermediate id does not occur in the data.
n_classes = int(y_real.max()) + 1
model = tf.estimator.LinearClassifier(feature_columns=feature_columns, n_classes=n_classes)

In [10]:
# Train for 100 steps on the batches produced by input_fn.
model.train(input_fn=input_fn, steps=100)

<tensorflow.python.estimator.canned.linear.LinearClassifier at 0x29d685c14e0>

## Evaluate Model
Now it's time to evaluate the model with test data

In [11]:
# Evaluation input: features only (no labels), the whole test set as a single
# batch, and no shuffling so that predictions line up row by row with y_test.
predict_func = tf.estimator.inputs.pandas_input_fn(
    x=x_test,
    batch_size=len(x_test),
    shuffle=False,
)

In [12]:
# Collect every prediction for the test set into a list.
predictions = list(model.predict(input_fn=predict_func))

In [13]:
# Reduce each prediction to its predicted class id (first entry of 'class_ids').
final_predictions = [pred['class_ids'][0] for pred in predictions]

In [14]:
from sklearn.metrics import classification_report

# Compare the true test labels with the predicted class ids and print the report.
report = classification_report(y_test, final_predictions)
print(report)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00        54
          1       1.00      1.00      1.00        63
          2       1.00      1.00      1.00        68
          3       1.00      1.00      1.00        42
          4       1.00      1.00      1.00        68
          5       1.00      1.00      1.00        36
          6       1.00      1.00      1.00        54
          7       1.00      1.00      1.00        24
          8       1.00      1.00      1.00        35
          9       1.00      1.00      1.00        36
         10       1.00      1.00      1.00        14
         11       1.00      1.00      1.00        20
         12       1.00      1.00      1.00        35
         13       1.00      1.00      1.00        19
         14       1.00      1.00      1.00        20
         15       1.00      1.00      1.00        17
         16       1.00      1.00      1.00        16
         17       1.00      1.00      1.00   

## Test Model
To check that the model works the way it is supposed to, let's create a single entry and predict its cluster

In [15]:
# Build the single-row frame in one constructor call instead of creating an
# empty frame and enlarging it through .loc (and without the needless
# round-trip of the values through a pd.Series). The column names are the
# keys of the feature columns defined for the model.
test_entry = pd.DataFrame(
    [['f', 'fit', 'middle']],
    columns=['gender', 'goal', 'level'],
)
test_entry.head()

Unnamed: 0,gender,goal,level
0,f,fit,middle


In [16]:
# Feed the hand-made entry to the model as one unshuffled batch and collect
# the resulting predictions into a list.
test_func = tf.estimator.inputs.pandas_input_fn(
    x=test_entry,
    batch_size=len(test_entry),
    shuffle=False,
)
test_result = list(model.predict(input_fn=test_func))

In [17]:
# test_entry holds a single row, so the first prediction is the only one;
# print its predicted class id.
print("Predicted cluster: ", test_result[0]['class_ids'][0])

Predicted cluster:  6


## Export model

We are using the same feature columns for the model export

In [18]:
# Build the example-parsing spec from the feature columns used for training,
# then build the serving input receiver function from that spec for the export.
feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)
serving_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)

In [19]:
# Export the trained model with "model" as the export base directory; the
# returned path is what the predictor is loaded from in the next step.
model_path = model.export_savedmodel(export_dir_base="model", serving_input_receiver_fn=serving_input_fn)

Now let's load the exported model

In [20]:
# Create a predictor from the SavedModel that was just exported to model_path.
saved_model_predictor = tf.contrib.predictor.from_saved_model(export_dir=model_path)

And try to predict the cluster. The cluster with the highest probability (score) is the predicted cluster.

In [21]:
# Raw string values of the profile to classify, keyed by feature name.
raw_features = [('gender', 'm'), ('goal', 'muscle'), ('level', 'middle')]

# Wrap every value as a UTF-8 encoded, single-element bytes feature.
example = tf.train.Example(features=tf.train.Features(feature={
    key: tf.train.Feature(bytes_list=tf.train.BytesList(value=[text.encode('utf-8')]))
    for key, text in raw_features
}))

# The predictor is fed a list of serialized examples under the "inputs" key.
model_input = example.SerializeToString()
output_dict = saved_model_predictor({"inputs": [model_input]})

print(output_dict)

{'classes': array([[b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7', b'8', b'9',
        b'10', b'11', b'12', b'13', b'14', b'15', b'16', b'17']],
      dtype=object), 'scores': array([[1.9202249e-03, 5.2149370e-02, 5.7709269e-02, 4.4203456e-02,
        4.3400018e-05, 9.2795852e-04, 1.6863841e-03, 3.5711149e-05,
        8.4892521e-04, 5.5755168e-05, 7.6825923e-01, 9.6829806e-04,
        3.5396297e-05, 2.2049170e-02, 4.4690583e-02, 1.6535844e-03,
        1.5526811e-03, 1.2106375e-03]], dtype=float32)}
