In [0]:
#https://www.tensorflow.org/tutorials/load_data/csv
#This tutorial provides an example of how to load CSV data from a file into a tf.data.Dataset.

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

# TensorFlow and tf.keras
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf
from tensorflow import keras
import numpy as np

In [0]:
import functools

In [0]:
#we have not download any data, its just getting the links and paths to data.
#but it seems we have, the folder is just hidden?
#Note: the csv is as any csv, columns and rows with data 
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file("eval.csv", TEST_DATA_URL)


In [0]:
# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True) #Just setting some options


In [42]:
!head {train_file_path} #checking the head of the data
#My theory is that this commands take the data from the url directly



survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
1,female,35.0,1,0,53.1,First,C,Southampton,n
0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
0,male,2.0,3,1,21.075,Third,unknown,Southampton,n
1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n
1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n
1,female,4.0,1,1,16.7,Third,G,Southampton,n


You can load this using pandas, and pass the NumPy arrays to TensorFlow. If you need to scale up to a large set of files, or need a loader that integrates with TensorFlow and tf.data then use the tf.data.experimental.make_csv_dataset function:![alt text](https://)

In [0]:
#The only column you need to identify explicitly is the one with the value 
#that the model is intended to predict. 
LABEL_COLUMN = 'survived'
LABELS = [0, 1]


In [0]:
#Now read the CSV data from the file and create a dataset. 
def get_dataset(file_path, **kwargs):
  dataset = tf.data.experimental.make_csv_dataset(
      file_path,
      batch_size=5, # Artificially small to make examples easier to show.
      label_name=LABEL_COLUMN,
      na_value="?",
      num_epochs=1,
      ignore_errors=True, 
      **kwargs)
  return dataset

raw_train_data = get_dataset(train_file_path)
raw_test_data = get_dataset(test_file_path)


In [45]:
#interesting, this creates a PrefetchDataset from the csv, 
raw_train_data

<PrefetchDataset shapes: (OrderedDict([(sex, (None,)), (age, (None,)), (n_siblings_spouses, (None,)), (parch, (None,)), (fare, (None,)), (class, (None,)), (deck, (None,)), (embark_town, (None,)), (alone, (None,))]), (None,)), types: (OrderedDict([(sex, tf.string), (age, tf.float32), (n_siblings_spouses, tf.int32), (parch, tf.int32), (fare, tf.float32), (class, tf.string), (deck, tf.string), (embark_town, tf.string), (alone, tf.string)]), tf.int32)>

In [46]:
"""
Each item in the dataset is a batch, represented as a tuple of (many examples, many labels). 
The data from the examples is organized in column-based tensors (rather than row-based tensors), 
each with as many elements as the batch size (5 in this case).
"""
def show_batch(dataset):
  for batch, label in dataset.take(1):
    for key, value in batch.items():
      print("{:20s}: {}".format(key,value.numpy()))

show_batch(raw_train_data) #Basically every 'column' has 5 rows of data.


sex                 : [b'female' b'female' b'male' b'male' b'female']
age                 : [28.  42.  22.  19.  14.5]
n_siblings_spouses  : [0 1 0 0 1]
parch               : [2 0 0 0 0]
fare                : [22.358 26.     8.05   7.65  14.454]
class               : [b'Third' b'Second' b'Third' b'Third' b'Third']
deck                : [b'unknown' b'unknown' b'unknown' b'F' b'unknown']
embark_town         : [b'Cherbourg' b'Southampton' b'Southampton' b'Southampton' b'Cherbourg']
alone               : [b'n' b'n' b'y' b'y' b'n']


NOTE:
the columns in the CSV are named. The dataset constructor will pick these names up automatically. If the file you are working with does not contain the column names in the first line, pass them in a list of strings to the column_names argument in the make_csv_dataset function.

In [47]:
CSV_COLUMNS = ['survived', 'sex', 'age', 'n_siblings_spouses', 'parch', 'fare', 'class', 'deck', 'embark_town', 'alone']
temp_dataset = get_dataset(train_file_path, column_names=CSV_COLUMNS)
show_batch(temp_dataset)


sex                 : [b'female' b'female' b'male' b'male' b'female']
age                 : [19.  58.  70.5  4.  15. ]
n_siblings_spouses  : [0 0 0 3 0]
parch               : [0 1 0 2 0]
fare                : [ 30.    153.462   7.75   27.9     7.225]
class               : [b'First' b'First' b'Third' b'Third' b'Third']
deck                : [b'B' b'C' b'unknown' b'unknown' b'unknown']
embark_town         : [b'Southampton' b'Southampton' b'Queenstown' b'Southampton' b'Cherbourg']
alone               : [b'y' b'n' b'y' b'n' b'y']


#Data preprocessing
A CSV file can contain a variety of data types. Typically you want to convert from those mixed types to a fixed length vector before feeding the data into your model.

TensorFlow has a built-in system for describing common input conversions: tf.feature_column (see https://www.tensorflow.org/tutorials/structured_data/feature_columns for more info [another tut, we will get there] )

You can preprocess your data using any tool you like (like nltk or sklearn), and just pass the processed output to TensorFlow.

The primary advantage of doing the preprocessing inside your model is that when you export the model it includes the preprocessing. This way you can pass the raw data directly to your model.

**Continuous data**

(If you have mixed datatypes you may want to separate out these simple-numeric fields. The tf.feature_column api can handle them, but this incurs some overhead and should be avoided unless really necessary)

If your data is already in an appropriate numeric format, you can pack the data into a vector before passing it off to the model.

(Basically everything that is not a string in this case)

In [48]:
SELECT_COLUMNS = ['survived', 'age', 'n_siblings_spouses', 'parch', 'fare']
DEFAULTS = [0, 0.0, 0.0, 0.0, 0.0]
temp_dataset = get_dataset(train_file_path, 
                           select_columns=SELECT_COLUMNS,
                           column_defaults = DEFAULTS)

show_batch(temp_dataset)


age                 : [31. 16. 28. 31. 28.]
n_siblings_spouses  : [1. 0. 0. 1. 0.]
parch               : [1. 0. 0. 1. 0.]
fare                : [37.004  7.733  0.    20.525  0.   ]


In [49]:
example_batch, labels_batch = next(iter(temp_dataset)) #take the next 5 values (the next batch)
example_batch #Note, show batch does not work with this one


OrderedDict([('age',
              <tf.Tensor: shape=(5,), dtype=float32, numpy=array([43., 34., 27., 21.,  7.], dtype=float32)>),
             ('n_siblings_spouses',
              <tf.Tensor: shape=(5,), dtype=float32, numpy=array([1., 0., 0., 0., 4.], dtype=float32)>),
             ('parch',
              <tf.Tensor: shape=(5,), dtype=float32, numpy=array([1., 1., 0., 0., 1.], dtype=float32)>),
             ('fare',
              <tf.Tensor: shape=(5,), dtype=float32, numpy=array([26.25 , 23.   ,  7.896, 77.958, 29.125], dtype=float32)>)])

In [50]:
#Here's a simple function that will pack together all the columns:
def pack(features, label):
  return tf.stack(list(features.values()), axis=-1), label

#Apply this to each element of the dataset:
packed_dataset = temp_dataset.map(pack)

for features, labels in packed_dataset.take(1):
  print(features.numpy())
  print()
  print(labels.numpy())

#returns rows with the values (age, n_siblings...)
#And its label

[[25.     0.     0.     7.65 ]
 [34.     1.     1.    32.5  ]
 [26.     2.     0.     8.663]
 [32.     0.     0.    30.5  ]
 [38.     0.     0.    80.   ]]

[0 1 0 1 1]


In [0]:
#Since our data is mixed (numerical and non-numerical)
# we define a preprocessor that selects a list of numeric features and packs them into a single column:
class PackNumericFeatures(object):
  def __init__(self, names):
    self.names = names

  def __call__(self, features, labels):
    numeric_features = [features.pop(name) for name in self.names] #get each numerical feature and remove from list
    numeric_features = [tf.cast(feat, tf.float32) for feat in numeric_features] #make them float32?
    numeric_features = tf.stack(numeric_features, axis=-1) #https://www.tensorflow.org/api_docs/python/tf/stack Basically merges them into one list
    features['numeric'] = numeric_features

    return features, labels

NUMERIC_FEATURES = ['age','n_siblings_spouses','parch', 'fare']

packed_train_data = raw_train_data.map( #Remember .map passes the function to each element of the dataset
    PackNumericFeatures(NUMERIC_FEATURES))

packed_test_data = raw_test_data.map(
    PackNumericFeatures(NUMERIC_FEATURES))


In [52]:
packed_train_data

<MapDataset shapes: (OrderedDict([(sex, (None,)), (class, (None,)), (deck, (None,)), (embark_town, (None,)), (alone, (None,)), (numeric, (None, 4))]), (None,)), types: (OrderedDict([(sex, tf.string), (class, tf.string), (deck, tf.string), (embark_town, tf.string), (alone, tf.string), (numeric, tf.float32)]), tf.int32)>

In [53]:
show_batch(packed_train_data) #show batch works becaused packed_train_data is a collection of batches, it shows one of them
#It basically made all the numeric data a single 'category' with its 'columns' inside its dimentionality.


sex                 : [b'female' b'male' b'female' b'male' b'male']
class               : [b'Third' b'Third' b'Second' b'Second' b'Third']
deck                : [b'unknown' b'unknown' b'E' b'unknown' b'unknown']
embark_town         : [b'Southampton' b'Cherbourg' b'Southampton' b'Southampton' b'Southampton']
alone               : [b'n' b'y' b'y' b'y' b'y']
numeric             : [[25.     1.     0.     7.925]
 [34.5    0.     0.     6.438]
 [27.     0.     0.    10.5  ]
 [28.     0.     0.    10.5  ]
 [55.5    0.     0.     8.05 ]]


In [0]:
example_batch, labels_batch = next(iter(packed_train_data)) 

In [55]:
example_batch

OrderedDict([('sex',
              <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'male', b'female', b'female', b'female', b'female'], dtype=object)>),
             ('class',
              <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'Second', b'Third', b'Third', b'Second', b'First'], dtype=object)>),
             ('deck',
              <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'unknown', b'unknown', b'unknown', b'unknown', b'C'], dtype=object)>),
             ('embark_town', <tf.Tensor: shape=(5,), dtype=string, numpy=
              array([b'Southampton', b'Queenstown', b'Southampton', b'Southampton',
                     b'Southampton'], dtype=object)>),
             ('alone',
              <tf.Tensor: shape=(5,), dtype=string, numpy=array([b'n', b'y', b'n', b'y', b'n'], dtype=object)>),
             ('numeric', <tf.Tensor: shape=(5, 4), dtype=float32, numpy=
              array([[ 44.   ,   1.   ,   0.   ,  26.   ],
                     [ 21.   ,   0.   ,   0. 

**Data Normalization**

Continuous data should always be normalized.
The continuous data are the numbers...

In [56]:
import pandas as pd
desc = pd.read_csv(train_file_path)[NUMERIC_FEATURES].describe()
desc

Unnamed: 0,age,n_siblings_spouses,parch,fare
count,627.0,627.0,627.0,627.0
mean,29.631308,0.545455,0.379585,34.385399
std,12.511818,1.15109,0.792999,54.59773
min,0.75,0.0,0.0,0.0
25%,23.0,0.0,0.0,7.8958
50%,28.0,0.0,0.0,15.0458
75%,35.0,1.0,0.0,31.3875
max,80.0,8.0,5.0,512.3292


In [0]:
#Simple mean and std normalization
MEAN = np.array(desc.T['mean']) #desc.T[mean] is the format of numpy to make the mean of each column
STD = np.array(desc.T['std'])
def normalize_numeric_data(data, mean, std):
  # Center the data
  return (data-mean)/std


In [58]:
#Now create a numeric column. 
#The tf.feature_columns.numeric_column API accepts a normalizer_fn argument, which will be run on each batch.
#This is kinof like a .map function but in numpy and it passes the arguments to every batch and to all data??

# See what you just created.
normalizer = functools.partial(normalize_numeric_data, mean=MEAN, std=STD)

numeric_column = tf.feature_column.numeric_column('numeric', normalizer_fn=normalizer, shape=[len(NUMERIC_FEATURES)])
numeric_columns = [numeric_column]
numeric_column
#I think numeric column gets the mean and std data and uses the normalizer function on every column.
#It returns the normalized data as np arrays.

NumericColumn(key='numeric', shape=(4,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function normalize_numeric_data at 0x7ff7ed729620>, mean=array([29.631,  0.545,  0.38 , 34.385]), std=array([12.512,  1.151,  0.793, 54.598])))

In [59]:
#When you train the model, include this feature column to select and center this block of numeric data:
example_batch['numeric']


<tf.Tensor: shape=(5, 4), dtype=float32, numpy=
array([[ 44.   ,   1.   ,   0.   ,  26.   ],
       [ 21.   ,   0.   ,   0.   ,   7.75 ],
       [  9.   ,   4.   ,   2.   ,  31.275],
       [ 35.   ,   0.   ,   0.   ,  21.   ],
       [ 58.   ,   0.   ,   1.   , 153.462]], dtype=float32)>

In [60]:
#Now we get the data normalized
numeric_layer = tf.keras.layers.DenseFeatures(numeric_columns) #Make a layer that normalizes
#We are passing the unprocessed data to the model and adding a layer to it that does the processing inside.
numeric_layer(example_batch).numpy()

#Note: The mean based normalization used here requires knowing the means of each column ahead of time.


array([[ 1.148,  0.395, -0.479, -0.154],
       [-0.69 , -0.474, -0.479, -0.488],
       [-1.649,  3.001,  2.043, -0.057],
       [ 0.429, -0.474, -0.479, -0.245],
       [ 2.267, -0.474,  0.782,  2.181]], dtype=float32)

#Categorical data

Some of the columns in the CSV data are categorical columns. That is, the content should be one of a limited set of options.

Use the tf.feature_column API to create a collection with a tf.feature_column.indicator_column for each categorical column.

In [0]:
#Dictionary? of possible categories
CATEGORIES = {
    'sex': ['male', 'female'],
    'class' : ['First', 'Second', 'Third'],
    'deck' : ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'],
    'embark_town' : ['Cherbourg', 'Southhampton', 'Queenstown'],
    'alone' : ['y', 'n']
}


In [0]:
#This is to pass the values on the dataset to the ones in the dictionary
#Does this count as cleaning?
categorical_columns = []
for feature, vocab in CATEGORIES.items():
  cat_col = tf.feature_column.categorical_column_with_vocabulary_list( #Note all this are legit keras layers https://www.tensorflow.org/api_docs/python/tf/feature_column/categorical_column_with_vocabulary_list
        key=feature, vocabulary_list=vocab) #Note: if you hover over the function google colab shows you info of it
  categorical_columns.append(tf.feature_column.indicator_column(cat_col))


In [63]:
categorical_columns

[IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='class', vocabulary_list=('First', 'Second', 'Third'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='deck', vocabulary_list=('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='embark_town', vocabulary_list=('Cherbourg', 'Southhampton', 'Queenstown'), dtype=tf.string, default_value=-1, num_oov_buckets=0)),
 IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key='alone', vocabulary_list=('y', 'n'), dtype=tf.string, default_value=-1, num_oov_buckets=0))]

In [64]:
categorical_layer = tf.keras.layers.DenseFeatures(categorical_columns)
print(categorical_layer(example_batch).numpy()[0])
#The 1 and zeroes are the categories converted to a single array.


[0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]


#Combined preprocessing layer

Add the two feature column collections and pass them to a tf.keras.layers.DenseFeatures to create an input layer that will extract and preprocess both input types:

In [68]:
preprocessing_layer = tf.keras.layers.DenseFeatures(categorical_columns+numeric_columns)
print(preprocessing_layer(example_batch).numpy()[0])
#Basically our input is now this, the normalized continous data and the categories

[ 0.     1.     0.     1.     0.     0.     0.     0.     0.     0.
  0.     0.     0.     0.     0.     0.     0.     0.     1.148  0.395
 -0.479 -0.154  1.     0.   ]


In [70]:
#the 5 means there are 5 batches.
preprocessing_layer(example_batch)

<tf.Tensor: shape=(5, 24), dtype=float32, numpy=
array([[ 0.   ,  1.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,  0.   ,
         0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
         0.   ,  0.   ,  1.148,  0.395, -0.479, -0.154,  1.   ,  0.   ],
       [ 1.   ,  0.   ,  0.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,
         0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
         0.   ,  1.   , -0.69 , -0.474, -0.479, -0.488,  0.   ,  1.   ],
       [ 0.   ,  1.   ,  0.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,
         0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
         0.   ,  0.   , -1.649,  3.001,  2.043, -0.057,  0.   ,  1.   ],
       [ 1.   ,  0.   ,  0.   ,  1.   ,  0.   ,  0.   ,  0.   ,  0.   ,
         0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,  0.   ,
         0.   ,  0.   ,  0.429, -0.474, -0.479, -0.245,  0.   ,  1.   ],
       [ 0.   ,  1.   ,  1.   ,  0.   ,  0.   ,  0.   ,  0.   ,  1.   ,
         0.

#Build the model

Build a tf.keras.Sequential, starting with the preprocessing_layer.

In [0]:
model = tf.keras.Sequential([
  preprocessing_layer,
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dense(1, activation='sigmoid'), #Activation sigmoid: if it survived or not?
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'])


#Train, evaluate, and predict

Now the model can be instantiated and trained.

In [77]:
train_data = packed_train_data.shuffle(500)
test_data = packed_test_data
model.fit(train_data, epochs=20)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7ff7ed4f0b70>

In [78]:
#Output shape = multiple? what is that
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_features_10 (DenseFeat multiple                  0         
_________________________________________________________________
dense_9 (Dense)              multiple                  3200      
_________________________________________________________________
dense_10 (Dense)             multiple                  16512     
_________________________________________________________________
dense_11 (Dense)             multiple                  129       
Total params: 19,841
Trainable params: 19,841
Non-trainable params: 0
_________________________________________________________________


In [79]:
test_loss, test_accuracy = model.evaluate(test_data)

print('\n\nTest Loss {}, Test Accuracy {}'.format(test_loss, test_accuracy))


     53/Unknown - 0s 8ms/step - loss: 0.4333 - accuracy: 0.8523

Test Loss 0.43330214608390377, Test Accuracy 0.8522727489471436


In [0]:
#Use tf.keras.Model.predict to infer labels on a batch or a dataset of batches.
predictions = model.predict(test_data)




In [111]:
predictions
#Literally an array with the value of the sigmoid, it was not a 1 or 0, probably its a probability.

array([[0.936],
       [0.031],
       [0.989],
       [0.443],
       [0.871],
       [0.472],
       [0.196],
       [0.104],
       [0.254],
       [0.057],
       [0.072],
       [0.974],
       [0.085],
       [0.078],
       [0.072],
       [0.998],
       [0.077],
       [0.967],
       [0.986],
       [0.59 ],
       [0.077],
       [0.211],
       [0.408],
       [0.085],
       [0.385],
       [0.826],
       [1.   ],
       [0.1  ],
       [0.106],
       [0.031],
       [0.013],
       [0.616],
       [0.999],
       [0.938],
       [0.073],
       [0.504],
       [0.114],
       [0.583],
       [0.088],
       [0.997],
       [0.82 ],
       [0.115],
       [0.083],
       [0.042],
       [0.088],
       [0.128],
       [0.734],
       [0.073],
       [0.867],
       [0.999],
       [0.078],
       [0.411],
       [0.066],
       [0.394],
       [0.45 ],
       [0.838],
       [0.974],
       [0.039],
       [0.232],
       [0.997],
       [0.285],
       [0.426],
       [

In [112]:
#list(test_data) te da una lista con todos los batches.
list(test_data)[0][1][:10] #Toma batch 0, array 1 (las labels), #0:10 cambia cada vez que lo corres. creo que estas agarrando para iterar los primeros 10 (como en predictions)


<tf.Tensor: shape=(5,), dtype=int32, numpy=array([0, 0, 0, 0, 0], dtype=int32)>

In [117]:
# Show some results
for prediction, survived in zip(predictions[:10], list(test_data)[0][1][:10]):
  print("Predicted survival: {:.2%}".format(prediction[0]),
        " | Actual outcome: ",
        ("SURVIVED" if bool(survived) else "DIED"))
#Note it takes different data points every time you run it.
#Probably because it takes random batches of data from the possibility of :10

Predicted survival: 93.64%  | Actual outcome:  SURVIVED
Predicted survival: 3.11%  | Actual outcome:  DIED
Predicted survival: 98.88%  | Actual outcome:  DIED
Predicted survival: 44.26%  | Actual outcome:  DIED
Predicted survival: 87.09%  | Actual outcome:  DIED


<tf.Tensor: shape=(5,), dtype=int32, numpy=array([0, 0, 1, 0, 0], dtype=int32)>