In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
import seaborn as sns
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

import warnings
warnings.filterwarnings('ignore')
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import tensorflow as tf
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score,mean_absolute_error
from sklearn.externals import joblib

## Proyecto Final Statistical Learning

### Parte 1: Entrenamiento, Selección y Validación

#### En esta parte se carga el dataset y se hace split en tres partes las cuales son entrenamiento, validacion y test. Solo se usara por el momento entrenamiento y validacion. Luego se hace una seleccion de variables mediante feature eng. y  se crean los 4 modelos con sus respectivos hiperparametros. Por ultimo se hace una validacion con las metricas de evaluacion de cada modelo

<img src="img\titanic.png">

## Data Loading and Splitting

In [3]:
#carga y split de datos
delimitador = ','
df = pd.read_csv('data_titanic_proyecto.csv',delimitador)

X = df[['PassengerId','Name','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked','passenger_class','passenger_sex']]
y = df['passenger_survived']

#divido datos para train y test
x_trainTemp, x_test, y_trainTemp, y_test = train_test_split(X, y, test_size=0.20, random_state=15)

#vuelvo a dividir datos de trainTemp para train y validacion
x_train, x_val, y_train, y_val = train_test_split(x_trainTemp, y_trainTemp, test_size=0.20, random_state=15)


## Data Exploration and Sizing

In [4]:
x_train.head()

Unnamed: 0,PassengerId,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,passenger_class,passenger_sex
444,445,"Johannesen-Bratthammer, Mr. Bernt",,0,0,65306,8.1125,,S,Lower,M
763,764,"Carter, Mrs. William Ernest (Lucile Polk)",36.0,1,2,113760,120.0,B96 B98,S,Upper,F
465,466,"Goncalves, Mr. Manuel Estanslas",38.0,0,0,SOTON/O.Q. 3101306,7.05,,S,Lower,M
388,389,"Sadlier, Mr. Matthew",,0,0,367655,7.7292,,Q,Lower,M
213,214,"Givard, Mr. Hans Kristensen",30.0,0,0,250646,13.0,,S,Middle,M


In [5]:
x_test.head()

Unnamed: 0,PassengerId,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,passenger_class,passenger_sex
310,311,"Hays, Miss. Margaret Bechstein",24.0,0,0,11767,83.1583,C54,C,Upper,F
635,636,"Davis, Miss. Mary",28.0,0,0,237668,13.0,,S,Middle,F
44,45,"Devaney, Miss. Margaret Delia",19.0,0,0,330958,7.8792,,Q,Lower,F
698,699,"Thayer, Mr. John Borland",49.0,1,1,17421,110.8833,C68,C,Upper,M
750,751,"Wells, Miss. Joan",4.0,1,1,29103,23.0,,S,Middle,F


In [6]:
x_val.head()

Unnamed: 0,PassengerId,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,passenger_class,passenger_sex
215,216,"Newell, Miss. Madeleine",31.0,1,0,35273,113.275,D36,C,Upper,F
463,464,"Milling, Mr. Jacob Christian",48.0,0,0,234360,13.0,,S,Middle,M
27,28,"Fortune, Mr. Charles Alexander",19.0,3,2,19950,263.0,C23 C25 C27,S,Upper,M
421,422,"Charters, Mr. David",21.0,0,0,A/5. 13032,7.7333,,Q,Lower,M
274,275,"Healy, Miss. Hanora ""Nora""",,0,0,370375,7.75,,Q,Lower,F


In [7]:
#sizing datasets
print(np.shape(x_train))
print(np.shape(x_val))
print(np.shape(x_test))

print(np.shape(y_train))
print(np.shape(y_val))
print(np.shape(y_test))

(569, 11)
(143, 11)
(179, 11)
(569,)
(143,)
(179,)


## Label Encoding y Feature Engineering

In [8]:
#haciendo label enconding de la variable Y  survived=1  died=2
y_train=y_train.astype('category')
y_train= y_train.cat.codes
y_train.head()

y_test=y_test.astype('category')
y_test= y_test.cat.codes
y_test.head()

y_val=y_val.astype('category')
y_val= y_val.cat.codes
y_val.head()


215    1
463    0
27     0
421    0
274    1
dtype: int8

In [9]:
##borramos las columnas que consideramos no serviran porque son categoricas o tienen muchos valores null
x_train.drop('Name', axis=1, inplace=True)
x_test.drop('Name', axis=1, inplace=True)
x_val.drop('Name', axis=1, inplace=True)

x_train.drop('Cabin', axis=1, inplace=True)
x_test.drop('Cabin', axis=1, inplace=True)
x_val.drop('Cabin', axis=1, inplace=True)

x_train.drop('Ticket', axis=1, inplace=True)
x_test.drop('Ticket', axis=1, inplace=True)
x_val.drop('Ticket', axis=1, inplace=True)

In [10]:
#x_train.drop('PassengerId', axis=1, inplace=True)
#x_test.drop('PassengerId', axis=1, inplace=True)
#x_val.drop('PassengerId', axis=1, inplace=True)

In [11]:
#llenamos con 0 la variable Age cuando no esta especificada
x_train.isnull().sum()
x_train["Age"]= x_train["Age"].fillna('0')

x_test.isnull().sum()
x_test["Age"]= x_test["Age"].fillna('0')

x_val.isnull().sum()
x_val["Age"]= x_val["Age"].fillna('0')

In [12]:
#hacemos label encoding para las variables Passenger_sex, passenger_class y Embarked
le = preprocessing.LabelEncoder()
x_train['passenger_sex'] = le.fit_transform(x_train['passenger_sex'])
x_train['passenger_class'] = le.fit_transform(x_train['passenger_class'])
x_train["Embarked"] = le.fit_transform(x_train["Embarked"].fillna('0'))
x_train['Embarked'] = le.fit_transform(x_train['Embarked'])


x_test['passenger_sex'] = le.fit_transform(x_test['passenger_sex'])
x_test['passenger_class'] = le.fit_transform(x_test['passenger_class'])
x_test["Embarked"] = le.fit_transform(x_test["Embarked"].fillna('0'))
x_test['Embarked'] = le.fit_transform(x_test['Embarked'])


x_val['passenger_sex'] = le.fit_transform(x_val['passenger_sex'])
x_val['passenger_class'] = le.fit_transform(x_val['passenger_class'])
x_val["Embarked"] = le.fit_transform(x_val["Embarked"].fillna('0'))
x_val['Embarked'] = le.fit_transform(x_val['Embarked'])

In [13]:
x_train.shape, y_train.shape
x_train.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Embarked,passenger_class,passenger_sex
444,445,0,0,0,8.1125,3,0,1
763,764,36,1,2,120.0,3,2,0
465,466,38,0,0,7.05,3,0,1
388,389,0,0,0,7.7292,2,0,1
213,214,30,0,0,13.0,3,1,1


## Bootstrapping

#### Si se hubiera requerido hacer bootstraping se logra generando muchos set de datos a partir del proporcionado, esto mediante un metodo llamado sampling, el dataset se replica con o sin reemplazo en los valores. Sobre estos dataset de muestra se calcula el estadistico que necesitamos para obtener un aproximado del estadistico poblacional, un pseudocodigo de bootstrap se encuentra a continuacion

for i in bootstraps:

	sample = select_sample_with_replacement(data)
    
	stat = calculate_statistic(sample)
    
	statistics.append(stat)

#### El proceso bootstrap se puede usar para evaluar el rendimiento de un algoritmo de machine learning, como cada muestra se hace con el 60% u 80% de los datos, hay datos que nunca pueden ser obtenidos en la muestra, estos se denominan OOB del ingles Out of Bag



<img src="img\ensemble_learning.png">

## MODELOS

## Arbol de Decision

In [14]:

#scoring = 'accuracy'
#score = cross_val_score(mod, x_train, y_train, cv=10, n_jobs=1, scoring=scoring)
#print(score)
#print('Media Decision Tree:', round(np.mean(score)*100, 2))

def train_DecisionTree(X,Y,presrt,split,file):
    decision_tree = DecisionTreeClassifier(presort=presrt,splitter=split) 
    decision_tree.fit(x_train, y_train)  
    y_pred = decision_tree.predict(x_test)      

#model.fit(X_train, Y_train)
# save the model to disk
    filename = file
    joblib.dump(decision_tree, filename)
    
    accuracy = round(decision_tree.score(x_train, y_train) * 100, 2)
    f_1_score = f1_score(y_test, y_pred, average="macro")
    prec_score = precision_score(y_test, y_pred, average="macro")
    rec_score = recall_score(y_test, y_pred, average="macro")
    Mean_ab_error=mean_absolute_error(y_test, y_pred)
    
    return accuracy,f_1_score,prec_score,rec_score,Mean_ab_error


## SVM

In [22]:
#mod = SVC()
#scoring = 'accuracy'
#score = cross_val_score(mod, x_train, y_train, cv=10, n_jobs=1, scoring=scoring)
#print(score)
#print('Media SVM:', round(np.mean(score)*100, 2))

def train_SVM(X,Y,penalty,kern,deg,file):
    linear_svc = SVC(C=penalty,kernel=kern)
    linear_svc.fit(x_train, y_train)
    y_pred = linear_svc.predict(x_test)    

    filename = file
    joblib.dump(linear_svc, filename)

    acc_svc = round(linear_svc.score(x_train, y_train) * 100, 2)
    f_1_score = f1_score(y_test, y_pred, average="macro")
    prec_score = precision_score(y_test, y_pred, average="macro")
    rec_score = recall_score(y_test, y_pred, average="macro")
    Mean_ab_error=mean_absolute_error(y_test, y_pred)
    
    return acc_svc,f_1_score,prec_score,rec_score,Mean_ab_error


## Naive Bayes

In [27]:
#mod = GaussianNB()
#scoring = 'accuracy'
#score = cross_val_score(mod, x_train, y_train, cv=10, n_jobs=1, scoring=scoring)
#print(score)
#print('Media SVM:', round(np.mean(score)*100, 2))

def train_NB(X,Y,file):
    gaussian = GaussianNB() 
    gaussian.fit(x_train, y_train)  
    y_pred = gaussian.predict(x_test)  

    filename = file
    joblib.dump(gaussian, filename)
    acc_NB = round(gaussian.score(x_train, y_train) * 100, 2)
    f_1_score = f1_score(y_test, y_pred, average="macro")
    prec_score = precision_score(y_test, y_pred, average="macro")
    rec_score = recall_score(y_test, y_pred, average="macro")
    Mean_ab_error=mean_absolute_error(y_test, y_pred)

    return acc_NB,f_1_score,prec_score,rec_score,Mean_ab_error


## Regresion Logistica con Regularizacion

In [17]:
import math

def train_test_Reg_Log(session,predict, loss_val, Xd, yd,
              epochs=1, batch_size=64, print_every=100,
              training=None, plot_losses=False,file=""):

    
    
    #--------------------------------------------------------------------------------------------

    #---------------------------------------------------------------------------------------------
    
    
        # have tensorflow compute accuracy
    correct_prediction = tf.equal(tf.argmax(predict,1), y)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    # shuffle indicies
    train_indicies = np.arange(Xd.shape[0])

    training_now = training is not None
    
    # setting up variables we want to compute (and optimizing)
    # if we have a training function, add that to things we compute
    variables = [cost_op,correct_prediction,accuracy]
    if training_now:
        variables[-1] = training

    # counter 
    iter_cnt = 0
    for e in np.arange(epochs):
        # keep track of losses and accuracy
        correct = 0
        losses = []
        # make sure we iterate over the dataset once
        for i in np.arange(int(math.ceil(Xd.shape[0]/batch_size))):
            # generate indicies for the batch
            start_idx = (i*batch_size)%Xd.shape[0]
            idx = train_indicies[start_idx:start_idx+batch_size]
            # create a feed dictionary for this batch
            feed_dict = {x: Xd.iloc[idx,:],
                         y: yd[idx],
                         is_training: training_now }
            # get batch size
            actual_batch_size = yd[idx].shape[0]
            # have tensorflow compute loss and correct predictions
            # and (if given) perform a training step
            loss, corr, _ = session.run(variables,feed_dict=feed_dict)
            
            if training is not None:
                W = np.asarray(session.run(variables,feed_dict))
                W.dump(file)
            
            # aggregate performance stats
            losses.append(loss*actual_batch_size)
            correct += np.sum(corr)
            
            # print every now and then
            if training_now and (iter_cnt % print_every) == 0:
                print("Iteration {0}: with minibatch training loss = {1:.3g} and accuracy of {2:.2g}"\
                      .format(iter_cnt,loss,np.sum(corr)/actual_batch_size))
            iter_cnt += 1
        total_correct = correct/Xd.shape[0]
        total_loss = np.sum(losses)/Xd.shape[0]
        print("Epoch {2}, Overall loss = {0:.3g} and accuracy of {1:.3g}"
              .format(total_loss,total_correct,e+1))
        
    return total_loss,total_correct

In [19]:

numFeatures = x_train.shape[1]
numLabels = 2

    # clear old variables
tf.reset_default_graph()

x = tf.placeholder(tf.float32, [None, numFeatures])
y = tf.placeholder(tf.int64, [None])
is_training = tf.placeholder(tf.bool)
Lambda = 0.01 #Regularization Parameter
learningRate = 1
    

# Logistic Regression
def Titanicmodel(x,y,is_training):   
    weights=tf.get_variable("weights",shape=[numFeatures,numLabels])
    bias=tf.get_variable("bias",shape=[numLabels])
    y_out = tf.matmul(x,weights)+bias
    return(y_out,weights)


y_out,weights = Titanicmodel(x,y,is_training)
    
loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.one_hot(y,2),logits=y_out))
regularizer = tf.nn.l2_loss(weights)
cost_op = tf.reduce_mean(loss + Lambda * regularizer)
optimizer = tf.train.GradientDescentOptimizer(learningRate)
train_step = optimizer.minimize(cost_op)

    #Prediction
prediction = tf.argmax(y_out,1)
    #Lets strat a session
sess = tf.Session()
sess.run(tf.global_variables_initializer())



#saver.save(sess, 'regLog_lr=0.01_reg=0.1_var1_var2_var3')


Instructions for updating:
Colocations handled automatically by placer.


In [20]:
print('Training')
train_test_Reg_Log(sess,y_out,cost_op,x_train,y_train,10,100,100,train_step,False,file="test.npy")


Training
Iteration 0: with minibatch training loss = 59.1 and accuracy of 0.29
Epoch 1, Overall loss = 106 and accuracy of 0.351
Epoch 2, Overall loss = 94.6 and accuracy of 0.355
Epoch 3, Overall loss = 89.9 and accuracy of 0.374
Epoch 4, Overall loss = 100 and accuracy of 0.306
Epoch 5, Overall loss = 101 and accuracy of 0.33
Epoch 6, Overall loss = 98.1 and accuracy of 0.327
Epoch 7, Overall loss = 110 and accuracy of 0.304
Epoch 8, Overall loss = 122 and accuracy of 0.336
Epoch 9, Overall loss = 104 and accuracy of 0.36
Epoch 10, Overall loss = 101 and accuracy of 0.362


(100.8911056116302, 0.36203866432337434)

In [21]:

print('Validation')
train_test_Reg_Log(sess,y_out,cost_op,x_val,y_val,5,25,25, file="test.npy")

Validation
Epoch 1, Overall loss = 36.6 and accuracy of 0.0699
Epoch 2, Overall loss = 36.6 and accuracy of 0.0699
Epoch 3, Overall loss = 36.6 and accuracy of 0.0699
Epoch 4, Overall loss = 36.6 and accuracy of 0.0699
Epoch 5, Overall loss = 36.6 and accuracy of 0.0699


(36.593211274047, 0.06993006993006994)

## EXPERIMENTACION
### En esta seccion se corren todos los procesos y se anota en un excel que luego se cargara en este notebook manualmente  para cada corrida de cada modelo se guardara el job correspondiente y se anotaran las metricas para tabularlas y mostrarlas en este notebook en las siguiente secciones


In [206]:
#Distintos parametros Decision Trees
print('Accuracy','f1_score','precision_score','recall_score','Mean_ab_error')
print('Decision Trees' , train_DecisionTree(x_train, y_train,False,'best',file="DecTree_sort_false_splitter_best_train"))
print('Decision Trees' , train_DecisionTree(x_train, y_train,True,'best',file="DecTree_sort_true_splitter_best_train"))
print('Decision Trees' , train_DecisionTree(x_train, y_train,False,'random',file="DecTree_sort_false_splitter_random_train"))
print('Decision Trees' , train_DecisionTree(x_train, y_train,True,'random',file="DecTree_sort_true_splitter_random_train"))

Accuracy f1_score precision_score recall_score Mean_ab_error
Decision Trees (100.0, 0.7193908564481556, 0.7198076923076924, 0.7190214614205417, 0.2737430167597765)
Decision Trees (100.0, 0.7413377861982594, 0.7429568363918325, 0.7401635155850792, 0.25139664804469275)
Decision Trees (100.0, 0.7001675041876047, 0.6994854417670682, 0.7030531425651507, 0.29608938547486036)
Decision Trees (100.0, 0.7620886075949367, 0.7608750314307267, 0.765074092999489, 0.2346368715083799)


In [23]:
print('Accuracy','f1_score','precision_score','recall_score','Mean_ab_error')
print('SVM',train_SVM(x_train, y_train,1,'rbf',1,file="SVM_Penalty_1_kernel_rbf_degree_1_train"))
print('SVM',train_SVM(x_train, y_train,2,'rbf',1,file="SVM_Penalty_2_kernel_rbf_degree_1_train"))
print('SVM',train_SVM(x_train, y_train,1,'sigmoid',1,file="SVM_Penalty_1_kernel_sigmoid_degree_1_train"))


Accuracy f1_score precision_score recall_score Mean_ab_error
SVM (99.82, 0.3795350556916393, 0.7893258426966292, 0.506578947368421, 0.41899441340782123)
SVM (100.0, 0.3795350556916393, 0.7893258426966292, 0.506578947368421, 0.41899441340782123)
SVM (62.92, 0.36524822695035464, 0.2877094972067039, 0.5, 0.4245810055865922)


In [28]:
print('Accuracy','f1_score','precision_score','recall_score','Mean_ab_error')
print('Naive-Bayes',train_NB(x_train, y_train,file="NBayes_lr=0.01_reg=0.1_train"))


Accuracy f1_score precision_score recall_score Mean_ab_error
Naive-Bayes (78.38, 0.8224717663243434, 0.823076923076923, 0.8219213081246806, 0.17318435754189945)


In [25]:
print('Accuracy','f1_score','precision_score','recall_score','Mean_ab_error')
train_test_Reg_Log(sess,y_out,cost_op,x_train,y_train,5,100,100,train_step,False,file="regLog_lr=0.01_reg=0.001_train")



Accuracy f1_score precision_score recall_score Mean_ab_error
Iteration 0: with minibatch training loss = 205 and accuracy of 0.36
Epoch 1, Overall loss = 115 and accuracy of 0.38
Epoch 2, Overall loss = 87.7 and accuracy of 0.332
Epoch 3, Overall loss = 118 and accuracy of 0.341
Epoch 4, Overall loss = 90.7 and accuracy of 0.334
Epoch 5, Overall loss = 115 and accuracy of 0.385


(115.28579260050098, 0.38488576449912126)

In [29]:
print('Accuracy','f1_score','precision_score','recall_score','Mean_ab_error')
train_test_Reg_Log(sess,y_out,cost_op,x_train,y_train,5,100,100,train_step,False,file="regLog_lr=0.02_reg=0.1_train")


Accuracy f1_score precision_score recall_score Mean_ab_error
Iteration 0: with minibatch training loss = 113 and accuracy of 0.28
Epoch 1, Overall loss = 98.4 and accuracy of 0.359
Epoch 2, Overall loss = 105 and accuracy of 0.35
Epoch 3, Overall loss = 107 and accuracy of 0.343
Epoch 4, Overall loss = 95.2 and accuracy of 0.334
Epoch 5, Overall loss = 114 and accuracy of 0.385


(113.83808224481942, 0.38488576449912126)

In [30]:
print('Accuracy','f1_score','precision_score','recall_score','Mean_ab_error')
train_test_Reg_Log(sess,y_out,cost_op,x_train,y_train,5,100,100,train_step,False,file="regLog_lr=0.5_reg=0.01_train")


Accuracy f1_score precision_score recall_score Mean_ab_error
Iteration 0: with minibatch training loss = 121 and accuracy of 0.28
Epoch 1, Overall loss = 89.7 and accuracy of 0.362
Epoch 2, Overall loss = 101 and accuracy of 0.33
Epoch 3, Overall loss = 96.5 and accuracy of 0.364
Epoch 4, Overall loss = 113 and accuracy of 0.383
Epoch 5, Overall loss = 100 and accuracy of 0.357


(100.150464612696, 0.35676625659050965)

In [31]:
print('Accuracy','f1_score','precision_score','recall_score','Mean_ab_error')
train_test_Reg_Log(sess,y_out,cost_op,x_train,y_train,5,100,100,train_step,False,file="regLog_lr=1.0_reg=0.01_train")


Accuracy f1_score precision_score recall_score Mean_ab_error
Iteration 0: with minibatch training loss = 153 and accuracy of 0.28
Epoch 1, Overall loss = 104 and accuracy of 0.351
Epoch 2, Overall loss = 109 and accuracy of 0.339
Epoch 3, Overall loss = 92.1 and accuracy of 0.334
Epoch 4, Overall loss = 111 and accuracy of 0.381
Epoch 5, Overall loss = 100 and accuracy of 0.359


(100.421998250254, 0.3585237258347979)

## Tecnica KFold
#### En la validación cruzada de K iteraciones o K-fold cross-validation los datos se dividen en K subconjuntos (folds). Uno de los subconjuntos se utiliza como datos de prueba y el resto (K-1) como datos de entrenamiento.  El proceso de validación cruzada es repetido durante K iteraciones, con cada uno de los posibles subconjuntos de datos de prueba.

#### Como se hubiera usado K-fold en el proyecto? primero se define un numero K para hacer los folds, en este caso puede ser 5, luego se toma el data set y se entrena con los K-1 folds, repetir hasta agotar el k-esimo elemento y promediar los resultados para obtener un solo valor de performance que servira de metrica para nuestro modelo

<img src="img\KFold-cross_validation.png">

In [39]:
#se cargan los datos de experimentos capturados en cada prueba anterior
data = pd.read_csv("experimentos.csv") 

data.head(50)

Unnamed: 0,Experimento,Accuracy,f1_score,precision_score,recall_score,Mean_ab_error
0,DecTree_sort_false_splitter_best_train,100.0,0.719391,0.719808,0.719021,0.273743
1,DecTree_sort_true_splitter_best_train,100.0,0.741338,0.742957,0.740164,0.251397
2,DecTree_sort_false_splitter_random_train,100.0,0.700168,0.699485,0.703053,0.296089
3,DecTree_sort_true_splitter_random_train,100.0,0.762089,0.760875,0.765074,0.234637
4,SVM_Penalty_1_kernel_rbf_degree_1_train,99.82,0.379535,0.789326,0.506579,0.418994
5,SVM_Penalty_1_kernel_rbf_degree_1_train,100.0,0.379535,0.789326,0.506579,0.418994
6,SVM_Penalty_1_kernel_sigmoid_degree_1_train,62.92,0.365248,0.287709,0.5,0.424581
7,NBayes_lr=0.01_reg=0.1_train,78.38,0.822472,0.823077,0.821921,0.173184
8,regLog_lr=0.01_reg=0.001_train,38.488576,0.0,0.0,0.0,0.115286
9,regLog_lr=0.02_reg=0.1_train,38.488576,0.0,0.0,0.0,0.113838


## CONCLUSIONES DE EXPERIMENTACION

### En esta etapa se hicieron varios modelos y se ejecutaron en los datasets de entrenamiento, posteriormente se aplicaron en datos de validacion y se obtuvieron las metricas siguientes Accuracy, F1, Precision, Recall y Mean_Errors.  Durante todo este proceso me doy cuenta que hay modelos mas rapidos que otros y modelos que tienen mejores metricas que otros, sin embargo, no hay ninguno que sea mejor en todas las metricas. En mi caso el modelo que mejor se ha desempeñado es el de Naive-Bayes en el cual se logro un nivel de exactitud mayor al 80%. La dificultad mayor enfrentada en este proyecto fue hacer funcionar el modelo de regresion lineal en tensorflow con regularizacion.  Otra dificultad fue probar distintos tipos de kernel para el modelo de SVM ya que al probar polinomial (poly) y lineal (linear) se quedaba ejecutando sin responder durante muchos minutos, por lo cual solamente se probo con el valor default  rbf y sigmoid, siendo el mejor el kernel default (rbf)