In [21]:
import tensorflow as tf
print("Tensorflow: v" + tf.__version__)

Tensorflow: v2.4.1


## Linear Regression
- Used to predict numeric values 
- eg. predict y values from the x values
- Use line of best fit to predict future values
- However, this works in more than 2 dimension. If you have 3 dimension graph, you can use 2 dimension to get the last one

In [23]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import tensorflow.compat.v2.feature_column as fc

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib

- using titanic dataset from tensorflow
- load into a panda dataframe to view the data

In [24]:
dftrain = pd.read_csv("https://storage.googleapis.com/tf-datasets/titanic/train.csv")
dfeval = pd.read_csv("https://storage.googleapis.com/tf-datasets/titanic/eval.csv")

y_train = dftrain.pop("survived")
y_eval = dfeval.pop("survived")

# print at specific index
# print(dftrain.loc[0], y_train.loc[0])

`.describe()` to describe the dataset and provide some statistical analysis

In [None]:
dftrain.describe()

In [None]:
dftrain.age.hist(bins=20)

In [None]:
dftrain.sex.value_counts().plot(kind="barh")

In [None]:
dftrain["class"].value_counts().plot(kind="barh")

In [None]:
pd.concat([dftrain, y_train], axis=1).groupby("sex").survived.mean().plot(kind="barh").set_xlabel("% survive")

### Categoric Data vs Numerical Data
- Categoric data should have a certain category that they fall under 
- eg. gender = male or female or deck = first, second or third
- For categoric data, we need to map it to different numeric values eg. male = 0 and female = 1


- Numerical data eg. age or fare

In [25]:
C_COL = ["sex", "n_siblings_spouses", "parch","class", "deck", "embark_town", "alone"]
N_COL = ["age", "fare"]

In [30]:
fc = []
for fn in C_COL:
    vocab = dftrain[fn].unique()
    fc.append(tf.feature_column.categorical_column_with_vocabulary_list(fn, vocab))

for fn in N_COL:
    fc.append(tf.feature_column.numeric_column(fn, dtype=tf.float32))

### Training the Model
- Feed the model information from the dataset
- With large models that have big dataset, need to load in batches
- **epoch** = how many times the model see the same data
- which feed the data again but in a different order. but this may lead to **overfitting**
- to prevent overfitting, start with a low epoch and increament it


- **input function** - define how the data is broken into epoch to feed into the data
- will encode into `tf.data.Dataset` object

In [31]:
def make_input_fn(data_df, label_df, num_epochs=10, shuffle=True, batch_size=32):
    def input_function():
        ds = tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
        if shuffle:
            ds = ds.shuffle(1000)
        ds = ds.batch(batch_size).repeat(num_epochs)
        return ds
    return input_function

train_input_fn = make_input_fn(dftrain, y_train)
eval_input_fn = make_input_fn(dfeval, y_eval, num_epochs=1, shuffle=False)

### Creating the Model

In [32]:
linear_est = tf.estimator.LinearClassifier(feature_columns=fc)
linear_est.train(train_input_fn)
result = linear_est.evaluate(eval_input_fn)
clear_output()
print(result)  # evaluation results

{'accuracy': 0.7462121, 'accuracy_baseline': 0.625, 'auc': 0.8340985, 'auc_precision_recall': 0.77782816, 'average_loss': 0.47410694, 'label/mean': 0.375, 'loss': 0.46557003, 'precision': 0.65686274, 'prediction/mean': 0.37962875, 'recall': 0.67676765, 'global_step': 200}


### Using the Model to make Predictions

In [33]:
results = list(linear_est.predict(eval_input_fn))
clear_output()
print(result)  # actual prediction results

{'accuracy': 0.7462121, 'accuracy_baseline': 0.625, 'auc': 0.8340985, 'auc_precision_recall': 0.77782816, 'average_loss': 0.47410694, 'label/mean': 0.375, 'loss': 0.46557003, 'precision': 0.65686274, 'prediction/mean': 0.37962875, 'recall': 0.67676765, 'global_step': 200}
