In [1]:
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib

In [10]:
import tensorflow.compat.v2.feature_column as fc

import tensorflow as tf
df = pd.read_csv('transformed_data.csv')
df = df[['Sales Year', 'Make', 'Category', 'IsDevelopment',
       'Total VIO', 'PartsAuthority', 'SSF',
       'Worldpac', 'PartsAuthority Sales count', 'SSF Sales count',
       'WorldPac Sales count','Total Units Sold Sales count', 'SalesYearRank']]

In [11]:
# Load dataset.

dftrain = df[df['Sales Year'] < 2020]
dfeval = df[df['Sales Year'] == 2020]
y_train = dftrain.pop('Total Units Sold Sales count')
y_eval = dfeval.pop('Total Units Sold Sales count')

In [12]:
dftrain.head()

Unnamed: 0,Sales Year,Make,Category,IsDevelopment,Total VIO,PartsAuthority,SSF,Worldpac,PartsAuthority Sales count,SSF Sales count,WorldPac Sales count,SalesYearRank
0,2016,Mercedes-Benz,Cooling,0,1798416,1,0,1,79.0,0.0,0.0,1
1,2017,Mercedes-Benz,Cooling,0,1798416,1,0,1,33.0,0.0,0.0,2
2,2018,Mercedes-Benz,Cooling,0,1798416,1,0,1,49.0,0.0,0.0,3
3,2019,Mercedes-Benz,Cooling,0,1798416,1,0,1,197.0,0.0,0.0,4
5,2016,Mercedes-Benz,Cooling,0,73430,1,0,1,0.0,0.0,0.0,5


In [13]:
dftrain.describe()

Unnamed: 0,Sales Year,IsDevelopment,Total VIO,PartsAuthority,SSF,Worldpac,PartsAuthority Sales count,SSF Sales count,WorldPac Sales count,SalesYearRank
count,17816.0,17816.0,17816.0,17816.0,17816.0,17816.0,17816.0,17816.0,17816.0,17816.0
mean,2017.679277,0.107712,214063.8,0.627245,0.35137,0.583184,38.400202,23.109508,84.801583,8.058543
std,1.118903,0.310025,651260.4,0.483551,0.477412,0.493046,118.908775,88.950405,489.008691,5.830335
min,2016.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-2.0
25%,2017.0,0.0,17834.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
50%,2018.0,0.0,53476.0,1.0,0.0,1.0,3.5,0.0,0.0,7.0
75%,2019.0,0.0,162216.0,1.0,1.0,1.0,35.0,17.0,31.0,12.0
max,2019.0,1.0,22034270.0,1.0,1.0,1.0,3594.0,5140.0,20668.0,21.0


In [14]:
dftrain.shape[0], dfeval.shape[0]

(17816, 5543)

In [23]:
CATEGORICAL_COLUMNS = ['Make', 'Category']
NUMERIC_COLUMNS = ['Sales Year', 'IsDevelopment', 'Total VIO', 'PartsAuthority', 'SSF',
       'Worldpac', 'PartsAuthority Sales count', 'SSF Sales count',
       'WorldPac Sales count','SalesYearRank']

feature_columns = []
for feature_name in CATEGORICAL_COLUMNS:
  vocabulary = dftrain[feature_name].unique()
  feature_columns.append(tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocabulary))

for feature_name in NUMERIC_COLUMNS:
  feature_columns.append(tf.feature_column.numeric_column(feature_name, dtype=tf.float32))

In [24]:
def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32):
  def input_function():
    ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
    if shuffle:
      ds = ds.shuffle(1000)
    ds = ds.batch(batch_size).repeat(num_epochs)
    return ds
  return input_function

train_input_fn = make_input_fn(dftrain, y_train)
eval_input_fn = make_input_fn(dfeval, y_eval, num_epochs=1, shuffle=False)

In [25]:
ds = make_input_fn(dftrain, y_train, batch_size=10)()
for feature_batch, label_batch in ds.take(1):
  print('Some feature keys:', list(feature_batch.keys()))
  print()
  print('A batch of class:', feature_batch['Category'].numpy())
  print()
  print('A batch of Labels:', label_batch.numpy())

Some feature keys: ['Sales Year', 'Make', 'Category', 'IsDevelopment', 'Total VIO', 'PartsAuthority', 'SSF', 'Worldpac', 'PartsAuthority Sales count', 'SSF Sales count', 'WorldPac Sales count', 'SalesYearRank']

A batch of class: [b'Electrical' b'Exhaust' b'Suspension' b'Electrical' b'Electrical'
 b'Electrical' b'Electrical' b'Electrical' b'Suspension' b'Electrical']

A batch of Labels: [ 41. 363. 113. 315.  21.  92.  91.  23. 205. 797.]


In [26]:
make_column = feature_columns[3]
tf.keras.layers.DenseFeatures([make_column])(feature_batch).numpy()



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]], dtype=float32)

In [27]:
category_column = feature_columns[1]
tf.keras.layers.DenseFeatures([tf.feature_column.indicator_column(category_column)])(feature_batch).numpy()



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]],
      dtype=float32)

In [28]:
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns)
linear_est.train(train_input_fn)
result = linear_est.evaluate(eval_input_fn)

clear_output()
print(result)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\Elly-371\\AppData\\Local\\Temp\\tmp3jexr4mb', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.


To change all layers to have dtype float64 

InvalidArgumentError: assertion failed: [Labels must be <= n_classes - 1] [Condition x <= y did not hold element-wise:] [x (head/losses/Cast:0) = ] [[83][985][131]...] [y (head/losses/check_label_range/Const:0) = ] [1]
	 [[{{node Assert}}]]

In [None]:
sales_year_x_category = tf.feature_column.crossed_column(['Sales Year', 'Category'], hash_bucket_size=100)

In [None]:
derived_feature_columns = [age_x_gender]
linear_est = tf.estimator.LinearClassifier(feature_columns=feature_columns+derived_feature_columns)
linear_est.train(train_input_fn)
result = linear_est.evaluate(eval_input_fn)

clear_output()
print(result)

In [None]:
pred_dicts = list(linear_est.predict(eval_input_fn))
probs = pd.Series([pred['probabilities'][1] for pred in pred_dicts])

probs.plot(kind='hist', bins=20, title='predicted probabilities')

In [None]:
from sklearn.metrics import roc_curve
from matplotlib import pyplot as plt

fpr, tpr, _ = roc_curve(y_eval, probs)
plt.plot(fpr, tpr)
plt.title('ROC curve')
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.xlim(0,)
plt.ylim(0,)