# Proyecto Final 

In [120]:
import pandas as pd
import numpy as np
import tensorflow as tf
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import graphviz
from sklearn.model_selection import train_test_split
from sklearn.tree import export_graphviz, DecisionTreeClassifier
import pandas as pd
from sklearn import svm
from math import sqrt, pi, exp 

# Apply seaborn's "darkgrid" theme to the figures drawn in this notebook.
sns.set(style="darkgrid")
# The TensorFlow code in this notebook uses the 1.x graph/session API
# (tf.placeholder, tf.Session). When running on TensorFlow 2.x, rebind `tf`
# to the v1 compatibility module and switch off v2 behavior / eager execution.
if tf.__version__.startswith("2."):
  import tensorflow.compat.v1 as tf
  tf.compat.v1.disable_v2_behavior()
  tf.compat.v1.disable_eager_execution()



In [146]:
# Load the raw passenger data from the CSV file into a DataFrame.
all_data = pd.read_csv('data_titanic_proyecto.csv')
# train_set, test_set = train_test_split(all_data, test_size=0.2)
# train_set, validation_set = train_test_split(train_set, test_size=0.2)
# 
# print("all_data shape", all_data.shape)
# print("train_set shape", train_set.shape)
# print("validation_set shape", validation_set.shape)



## Análisis exploratorio y limpieza de datos

Buscaremos valores faltantes en todo el dataset para saber si se espera tener que realizar imputación. 

In [147]:
# Number of missing entries per column (isnull is an alias of isna).
all_data.isnull().sum()

PassengerId             0
Name                    0
Age                   177
SibSp                   0
Parch                   0
Ticket                  0
Fare                    0
Cabin                 687
Embarked                2
passenger_class         0
passenger_sex           0
passenger_survived      0
dtype: int64

Vemos que Cabin tiene muchos valores faltantes, por lo que sería difícil realizar imputación, pero podemos utilizar estadística descriptiva para rellenar los valores faltantes en edad.

Veamos una muestra del data set para familiarizarnos con las variables

In [158]:
# Show four randomly chosen rows to get a feel for the columns.
all_data.sample(n=4)

Unnamed: 0,PassengerId,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,passenger_class,passenger_sex,passenger_survived
659,660,"Newell, Mr. Arthur Webster",58.0,0,2,35273,113.275,D48,C,1,M,0
664,665,"Lindqvist, Mr. Eino William",20.0,1,0,STON/O 2. 3101285,7.925,,S,3,M,1
872,873,"Carlsson, Mr. Frans Olof",33.0,0,0,695,5.0,B51 B53 B55,S,1,M,0
533,534,"Peter, Mrs. Catherine (Catherine Rizk)",,0,2,2668,22.3583,,C,3,F,1


SibSp es la suma del número de hermanos y cónyuges a bordo para el pasajero. 

Parch es la cantidad de padres e hijos a bordo. 

Embarked es la inicial del puerto donde embarcó el pasajero. 

Vemos que la clase del pasajero es categórica (Upper, Lower, Middle), pero nos interesa su correlación con otras variables, por lo que la cambiaremos a una categórica numérica. Haremos lo mismo con la variable objetivo (passenger_survived) y con passenger_sex.

In [161]:
# Encode the categorical columns as integers.
# Each mapping is scoped to its own column: a frame-wide replace would also
# rewrite any cell in Name/Ticket/Cabin/Embarked that happens to equal
# "M", "F", "Y" or "N". replace() leaves already-encoded values untouched,
# so re-running this cell is harmless.
CATEGORY_CODES = {
    "passenger_class": {"Upper": 1, "Middle": 2, "Lower": 3},
    "passenger_survived": {"Y": 1, "N": 0},
    "passenger_sex": {"M": 1, "F": 0},
}
all_data = all_data.replace(CATEGORY_CODES)
# Make the numeric dtype explicit instead of relying on replace() to infer it;
# this also fails loudly if a column still contains an unmapped category.
all_data = all_data.astype({column: "int64" for column in CATEGORY_CODES})

Ahora que hemos transformado esas variables, podemos analizar la correlación.

In [162]:
# Absolute pairwise correlation between the numeric columns.
# The numeric columns are selected explicitly because DataFrame.corr() no
# longer drops text columns (Name, Ticket, ...) silently on pandas >= 2.0
# and raises instead; select_dtypes works on old and new pandas alike.
all_data.select_dtypes(include="number").corr().abs()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,passenger_class,passenger_sex,passenger_survived
PassengerId,1.0,0.036847,0.057527,0.001652,0.012658,0.035144,0.042939,0.005007
Age,0.036847,1.0,0.308247,0.189119,0.096067,0.369226,0.093254,0.077221
SibSp,0.057527,0.308247,1.0,0.414838,0.159651,0.083081,0.114631,0.035322
Parch,0.001652,0.189119,0.414838,1.0,0.216225,0.018443,0.245489,0.081629
Fare,0.012658,0.096067,0.159651,0.216225,1.0,0.5495,0.182333,0.257307
passenger_class,0.035144,0.369226,0.083081,0.018443,0.5495,1.0,0.1319,0.338481
passenger_sex,0.042939,0.093254,0.114631,0.245489,0.182333,0.1319,1.0,0.543351
passenger_survived,0.005007,0.077221,0.035322,0.081629,0.257307,0.338481,0.543351,1.0


### Clasificador Naive Bayes

In [25]:
class NaiveBayes:
    """Gaussian Naive Bayes classifier.

    For every class the per-feature mean and standard deviation are estimated
    from the training data; prediction picks the class that maximises
    prior * product of per-feature normal densities.

    Parameters
    ----------
    train_x : iterable of array-like
        One entry per feature, each holding that feature's value for every
        training sample (i.e. a sequence of columns).
    train_y : array-like
        Class label of every training sample. Labels may be any hashable
        values (ints, strings, ...).
    x_labels : sequence of str, optional
        Names for the features, in the same order as ``train_x``. When
        omitted (or empty) the features are named ``x_0``, ``x_1``, ...

    Attributes
    ----------
    means, stdvs : pandas.DataFrame
        Per-class feature means / standard deviations (rows: classes,
        columns: features).
    cdist : pandas.Series
        Prior probability of each class (relative frequency in train_y).
    """

    def __init__(self, train_x, train_y, x_labels=None):
        self.x_labels = x_labels
        df = self.__init_dataframe(train_x, train_y)
        by_class = df.groupby('y')
        self.means = by_class.mean()
        self.stdvs = by_class.std()
        self.cdist = by_class.size() / df.shape[0]

    def __init_dataframe(self, train_x, train_y):
        """Assemble a frame with the class column 'y' plus one column per feature."""
        # Explicit None/length check: plain truthiness is ambiguous for
        # array-like label containers (e.g. a NumPy array or pandas Index).
        has_labels = self.x_labels is not None and len(self.x_labels) > 0
        df = pd.DataFrame()
        df['y'] = train_y
        for i, x in enumerate(train_x):
            label = self.x_labels[i] if has_labels else 'x_' + str(i)
            if label == 'y':
                # 'y' is reserved for the class column; reusing it would
                # silently overwrite the labels.
                raise ValueError("feature label 'y' is reserved for the class column")
            df[label] = x
        return df

    def predict(self, x_samples):
        """Return the most likely class label for every row of ``x_samples``.

        ``x_samples`` has one row per sample and one column per feature, in
        the same feature order used for training. A single 1-D sample is
        accepted as well. The result is a 1-D NumPy array of class labels.
        """
        x_samples = np.atleast_2d(np.asarray(x_samples, dtype=float))
        classes = self.cdist.index
        probs = []
        for y in classes:
            # Look the statistics up by class *label* (.loc), not by position:
            # positional lookup only works when the labels are exactly 0..k-1.
            mean = self.means.loc[y].to_numpy()
            std = self.stdvs.loc[y].to_numpy()
            p_x = self.norm_pdf(x_samples, mean, std)
            probs.append(self.cdist.loc[y] * np.prod(p_x, axis=1))
        # probs has shape (n_classes, n_samples); pick the best class per sample
        # and translate the position back into the actual class label.
        best = np.argmax(np.asarray(probs), axis=0)
        return classes.to_numpy()[best]

    def norm_pdf(self, x, mean, std):
        """Density of the normal distribution N(mean, std**2) evaluated at x."""
        e = np.exp(-((x - mean)**2 / (2 * std**2)))
        return (1 / (np.sqrt(2 * np.pi) * std)) * e


### Clasificador binario basado en regresión logística

In [93]:
  class BinaryLogisticRegression:
    """Binary logistic-regression classifier trained with mini-batch gradient descent.

    Training builds a TensorFlow 1.x-style graph (placeholders + Session) and
    stores the learned weights as a NumPy array in ``self.weights``; the last
    row of the weights is the bias term. Prediction only needs NumPy.

    Subclasses may override ``before_training`` and ``after_epoch`` to hook
    in logging (e.g. TensorBoard).
    """

    def __init__(self, x, y):
        """Store the training data.

        x: feature data. A 1-D array is treated as a single feature and
           converted to a column; a matrix is assumed to hold one variable
           per column.
        y: 0/1 labels, one per sample; stored as a column vector.
        """
        x = np.asarray(x)
        self.x = x.reshape(-1, 1) if x.ndim == 1 else x
        self.y = np.asarray(y).reshape(-1, 1)
        self.weights = None
        self.error = None
        self.epochs = None
        self.lr = None
        self.print_rate = None
        self.feed = None
        self.batch_size = None
        self.batch_iters = None

    def train(self, epochs, lr, batch_size, print_rate=10):
        """Fit the weights.

        epochs:     number of passes over the training data.
        lr:         learning rate of the gradient-descent step.
        batch_size: samples per mini-batch (capped at the dataset size).
                    Samples that do not fill a complete batch are not used.
        print_rate: report the error every ``print_rate`` epochs.

        Raises ValueError for a non-positive batch size or an empty dataset.
        """
        if batch_size < 1 or self.x.shape[0] == 0:
            raise ValueError("batch_size must be >= 1 and the training set must not be empty")
        self.print_rate = print_rate
        self.epochs = epochs
        self.lr = lr
        self.batch_size = min(batch_size, self.x.shape[0])
        self.batch_iters = int(self.x.shape[0]/self.batch_size)

        # Build the training ops in a private graph so repeated calls do not
        # pile up nodes in the default graph.
        g = tf.Graph()
        with g.as_default():
            return self.__do_train()

    def __do_train(self):
        """Run the epoch/batch loop inside a fresh session."""
        placeholder_x = tf.placeholder(tf.float64, [self.batch_size, self.x.shape[1]], "x")
        placeholder_y = tf.placeholder(tf.float64, [self.batch_size, self.y.shape[1]], "y")

        with tf.Session() as session:
            (weights_op, error_op) = self.__gradient_descent(placeholder_x, placeholder_y)
            self.before_training(session.graph, weights_op, error_op)
            session.run(tf.global_variables_initializer())
            for epoch in range(1, self.epochs + 1):
                for i in range(self.batch_iters):
                    start_index = i*self.batch_size
                    end_index = start_index + self.batch_size

                    x_batch = np.array(self.x[start_index:end_index])
                    y_batch = np.array(self.y[start_index:end_index])
                    self.feed = { placeholder_x: x_batch, placeholder_y: y_batch }

                    # One run performs the weight update and reports the
                    # batch error.
                    out = session.run([weights_op, error_op], self.feed)
                    self.error = out[1]
                    self.weights = out[0]
                # Report once per epoch (after its last batch), not once per batch.
                self.after_epoch(epoch, session)

    def __gradient_descent(self, x, y):
        """Build the ops for one gradient-descent step; returns (weights_op, error_op)."""
        n_samples = x.shape[0]
        n_independent_vars = x.shape[1] + 1

        # Append a constant-1 column so the last weight acts as the bias.
        bias_feature = tf.ones([n_samples, 1], tf.float64)
        x = tf.concat([x, bias_feature], axis=1)

        initial_weights = tf.zeros([n_independent_vars, 1], tf.float64)
        weights = tf.Variable(name="Weights", initial_value=initial_weights)
        logits = tf.matmul(x, weights)
        error = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=logits,labels=y))
        gradients = tf.gradients(error, weights)
        adjustment = tf.scalar_mul(-self.lr, gradients[0])
        weights = tf.assign(weights, (tf.add(weights, adjustment)))
        return (weights, error)

    def after_epoch(self, epoch, session):
        """Hook called after every epoch; prints the error every print_rate epochs and on the last one."""
        if epoch % self.print_rate == 0 or epoch == self.epochs:
            print("epoch:" + str(epoch) + " error: " + str(self.error))

    def before_training(self, graph, weights_op, error_op):
        """Hook called once before training starts; override to set up TensorBoard, etc."""
        pass

    def predict(self, x):
        """Return a 1-D array of 0/1 predictions, one per row of ``x``.

        A 1-D ``x`` is treated as a single feature column, as in ``__init__``.
        Raises RuntimeError if the model has not been trained yet.
        """
        if self.weights is None:
            raise RuntimeError("train() must be called before predict()")
        x = np.asarray(x)
        if x.ndim == 1:
            x = x.reshape(-1, 1)
        # Same bias column that was appended during training.
        x = np.hstack((x, np.ones((x.shape[0], 1))))
        logits = np.matmul(x, self.weights)
        # sigmoid(z) > 0.5 exactly when z > 0, so the logits can be thresholded
        # directly; no TensorFlow graph or session is needed for inference.
        return np.where(logits.ravel() > 0, 1, 0)

In [165]:

# logreg = BinaryLogisticRegression(df[["weather_encoded","temp_encoded"]].to_numpy(), df["Y"].to_numpy())
# logreg.train(2000, 0.6, 8, print_rate=500)

In [166]:
# logreg.predict(x_test)

### Scikit Learn DecisionTreeClassifier 

In [164]:
def train_decision_tree(x, y):
    """Fit a DecisionTreeClassifier with default settings on (x, y) and return it."""
    # fit() returns the fitted estimator itself.
    return DecisionTreeClassifier().fit(x, y)

# gv_data = export_graphviz(decision_tree, out_file=None,
#                                         feature_names=["weather","temp"],
#                                         class_names=["not_play","play"],
#                                         filled=True, rounded=True,
#                                         special_characters=True)
# graph = graphviz.Source(gv_data)



### Support Vector Machine

In [163]:
def train_svm(x,y):
    """Fit a linear-kernel support vector classifier on (x, y) and return it."""
    # fit() returns the fitted estimator itself.
    return svm.SVC(kernel='linear').fit(x, y)