# Titanic data science project
- Proyecto end-to-end de data science (clasificación)
- Fuente: https://www.kaggle.com/jeffd23/scikit-learn-ml-from-start-to-finish#
- Solución de la competición Titanic Survival en Kaggle: https://www.kaggle.com/c/titanic

In [None]:
# importar paquetes necesarios
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
%matplotlib inline

In [None]:
# Load the dataset (https://www.kaggle.com/c/titanic/data) from res/titanic
# and preview three random rows of each file.

ruta_train, ruta_test = (
    os.path.join("res", "titanic", nombre) for nombre in ("train.csv", "test.csv")
)

train_data, test_data = pd.read_csv(ruta_train), pd.read_csv(ruta_test)
display(train_data.sample(n=3))
display(test_data.sample(n=3))

In [None]:
# Dimensions of both datasets (one per line), then summary statistics
# covering every column of each.
print(train_data.shape, test_data.shape, sep='\n')
train_summary = train_data.describe(include='all')
test_summary = test_data.describe(include='all')
display(train_summary)
display(test_summary)

In [None]:
# Visualization: Survived per port of embarkation, one bar per sex.
sns.barplot(data=train_data, x='Embarked', y='Survived', hue='Sex')
plt.show()

In [None]:
# Survived per passenger class, one line per sex with a fixed color for each.
colores_sexo = {'male': 'red', 'female': 'blue'}
sns.pointplot(data=train_data, x='Pclass', y='Survived', hue='Sex', palette=colores_sexo)
plt.show()

In [None]:
# Data cleaning
# Inspect Age before turning it into categories: list the distinct known ages.
# Missing ages (NaN) are dropped before sorting because NaN compares False
# against every number, which leaves a plain sort with an unreliable order.
so = sorted(train_data['Age'].dropna().unique())
print(so)
print('Missing Age values:', train_data['Age'].isna().sum())
# And the test set? (only the training ages are inspected here)
train_data.hist(column='Age')
plt.show()

In [None]:
# Discretize Age into labelled ranges
def cat_age(data):
    """Replace the numeric 'Age' column with an age-group category.

    Missing ages are filled with -0.5 so that they fall in the (-1, 0] bin,
    labelled 'Unknown'. The other bins are (0, 12] Child, (12, 18] Teenager,
    (18, 30] Young adult, (30, 60] Adult and (60, 120] Elderly.

    The frame is modified in place and also returned.
    """
    edges = (-1, 0, 12, 18, 30, 60, 120)
    labels = ['Unknown', 'Child', 'Teenager', 'Young adult', 'Adult', 'Elderly']
    data['Age'] = pd.cut(data['Age'].fillna(-0.5), edges, labels=labels)
    return data

# Bucket Age in both datasets, then spot-check ten random training rows.
train_data, test_data = cat_age(train_data), cat_age(test_data)

train_data.Age.sample(n=10)

In [None]:
# Count plot of the raw Cabin values to see how the cabins are distributed.
sns.countplot(data=train_data, x='Cabin')
plt.show()

In [None]:
# Keep only the cabin letter (not the number)
def extract_cabin(data):
    """Reduce 'Cabin' to its first character, using 'N' for missing cabins.

    The frame is modified in place and also returned.
    """
    cabins = data['Cabin'].fillna('N')
    data['Cabin'] = cabins.map(lambda cabin: cabin[0])
    return data

# Reduce Cabin to its letter in both datasets, then spot-check ten training rows.
train_data, test_data = extract_cabin(train_data), extract_cabin(test_data)

train_data['Cabin'].sample(n=10)

In [None]:
# Bucket Fare (ticket price) into quartiles
def cat_fare(data):
    """Replace the numeric 'Fare' column with an ordered quartile category.

    Known fares are split into four equal-frequency bins labelled '1st' to
    '4rd' (cheapest to most expensive); missing fares get their own 'Unknown'
    category. Previously missing fares were filled with -0.5 and the column
    was cut into five quantiles, so 'Unknown' ended up labelling the cheapest
    fifth of the fares instead of marking missing data.

    The quartile edges are computed from the frame passed in. The frame is
    modified in place and also returned.
    """
    quartile_names = ['1st', '2nd', '3rd', '4rd']
    # qcut leaves NaN fares as NaN, so they do not distort the bin edges.
    quartiles = pd.qcut(data['Fare'], 4, labels=quartile_names)
    # Same category order as before, with 'Unknown' first.
    quartiles = quartiles.cat.set_categories(['Unknown'] + quartile_names, ordered=True)
    data['Fare'] = quartiles.fillna('Unknown')
    return data

# Bucket Fare in both datasets, then spot-check ten random training rows.
train_data, test_data = cat_fare(train_data), cat_fare(test_data)

train_data['Fare'].sample(n=10)

In [None]:
# Extract the honorific from the name (Mr., Mrs., ...)
def extract_title(data):
    """Add a 'Title' column holding the honorific found in 'Name'.

    Names are expected to look like 'Surname, Title. Given names'. The title
    is the text between the first comma and the following period (period
    kept), so a multi-word surname such as 'Vander Planke, Mrs. Julius' yields
    'Mrs.' rather than a piece of the surname. Names without that shape fall
    back to the second space-separated word.

    The frame is modified in place and also returned.
    """
    def _title(name):
        _, _, after_surname = name.partition(',')
        title, period, _ = after_surname.partition('.')
        if period:
            return title.strip() + '.'
        # No "comma ... period" structure: fall back to the second word.
        return name.split(' ')[1]

    data['Title'] = data['Name'].apply(_title)
    return data

# Add the Title column to both datasets, then spot-check ten training rows.
train_data, test_data = extract_title(train_data), extract_title(test_data)

train_data['Title'].sample(n=10)

In [None]:
# Remove columns that carry little information
def drop_columns(data, columns):
    """Return a new frame without the given columns; `data` is not modified."""
    return data.drop(columns=columns)

# Name, Ticket and Embarked are removed from both datasets.
columns_to_drop = ['Name', 'Ticket', 'Embarked']
train_data, test_data = (
    drop_columns(train_data, columns_to_drop),
    drop_columns(test_data, columns_to_drop),
)

train_data.head(n=10)

In [None]:
# Plot the categorical data
# Age group vs. survival, split by sex
sns.barplot(data=train_data, x='Age', y='Survived', hue='Sex')

In [None]:
# Ticket price bucket vs. survival, split by sex.
# The y axis is 'Survived' so the plot matches its stated purpose (it used
# 'Pclass' before, which showed passenger class instead of survival).
sns.barplot(x='Fare', y='Survived', hue='Sex', data=train_data)

In [None]:
# Preparar para machine learning
# transformar datos en etiquetas numericas
from sklearn import preprocessing
def encode_features(df_train, df_test):
    """Label-encode the categorical columns of both frames with shared codes.

    One LabelEncoder per column is fitted on the train and test values taken
    together, so a value maps to the same integer in both frames and the
    encoder has seen every value it is later asked to transform.

    Both frames are modified in place and returned as (df_train, df_test).
    """
    features = ['Fare', 'Cabin', 'Age', 'Sex', 'Title']
    df_combined = pd.concat([df_train[features], df_test[features]])

    for feature in features:
        encoder = preprocessing.LabelEncoder().fit(df_combined[feature])
        for frame in (df_train, df_test):
            frame[feature] = encoder.transform(frame[feature])

    return df_train, df_test


# Encode both datasets and preview the first training rows.
train_data, test_data = encode_features(train_data, test_data)
train_data.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the features, then hold out a validation set.
Y = train_data['Survived']
X = train_data.drop(columns=['Survived'])

validation_size = 0.15  # fraction of the dataset kept aside for validation
# The seed is drawn at random on each run; later cells reuse this same value.
seed = np.random.randint(1000)
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=seed
)

In [None]:
# Decision tree ML
from sklearn.tree import DecisionTreeClassifier
# Train a decision tree with default settings. fit() returns the estimator
# itself, so ending the cell on `model` shows the same fitted object.
model = DecisionTreeClassifier().fit(X_train, Y_train)
model

In [None]:
# Evaluacion
# test accuracy on validation set
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Predict on the validation set and report accuracy, confusion matrix and
# per-class metrics (arguments given as true labels, then predictions).
prediction = model.predict(X_validation)
print(f'Accuracy: {accuracy_score(Y_validation, prediction)}')
print(confusion_matrix(Y_validation, prediction))
print(classification_report(Y_validation, prediction))

In [None]:
# Columns that may have little or no relevance for training are removed
# (PassengerId in addition to the target), then the tree is retrained and
# evaluated again with the same split seed.
Y = train_data['Survived']
X = train_data.drop(columns=['Survived', 'PassengerId'])

validation_size = 0.15
X_train, X_validation, Y_train, Y_validation = train_test_split(
    X, Y, test_size=validation_size, random_state=seed
)

model = DecisionTreeClassifier().fit(X_train, Y_train)

prediction = model.predict(X_validation)
print(f'Accuracy: {accuracy_score(Y_validation, prediction)}')
print(confusion_matrix(Y_validation, prediction))
print(classification_report(Y_validation, prediction))

In [None]:
# Repeat the hold-out evaluation over several different splits and average
# the validation accuracy.
X = train_data.drop(['Survived', 'PassengerId'], axis=1)
Y = train_data['Survived']

validation_size = 0.15
ls = []
for i in range(5):
    # Offset the seed on every round: passing the same random_state each time
    # reproduces the exact same split, so the five scores would all come from
    # a single train/validation partition.
    X_train, X_validation, Y_train, Y_validation = train_test_split(
        X, Y, test_size=validation_size, random_state=seed + i
    )

    model = DecisionTreeClassifier()
    model.fit(X_train, Y_train)

    prediction = model.predict(X_validation)
    ls.append(accuracy_score(Y_validation, prediction))

print(ls)
sum(ls)/len(ls)

# Iris PCA

In [None]:
# Load the iris dataset through seaborn and preview five random rows.
df_iris = sns.load_dataset(name='iris')
df_iris.sample(n=5)

In [None]:
# Pairwise plots of the iris columns, colored by species.
sns.pairplot(df_iris, hue='species')

In [None]:
# Correlation matrix of the numeric measurement columns. 'species' holds text
# and recent pandas versions raise when corr() is asked to include it, so it
# is dropped first. The matrix is computed once and reused for the heatmap.
iris_corr = df_iris.drop(columns='species').corr()
display(iris_corr)
sns.heatmap(iris_corr, square=True, annot=True)

In [None]:
# calc autovectores/autovalores
from sklearn.decomposition import PCA  #Analisis de componentes principales
from sklearn.preprocessing import StandardScaler

# Standardize the data after removing the non-predictor column
df_pca = df_iris.drop(columns='species')
display(df_pca.head())

# fit() returns the scaler itself, so x_scaled is still the fitted scaler.
x_scaled = StandardScaler().fit(df_pca)
scaled = x_scaled.transform(df_pca)
print(scaled)

In [None]:
# Principal component analysis (PCA)
# Reduce the dimensions from 4 to 3, producing components that gather
# (accumulate the variance of) the original features.
n_components = 3
# fit() returns the PCA object itself, so the cell still displays the fitted model.
pca = PCA(n_components=n_components).fit(scaled)
pca

In [None]:
# Project the standardized samples onto the fitted components and check the shape.
x_pca = pca.transform(scaled)
np.shape(x_pca)

In [None]:
# Cumulative share of variance explained as components are added.
varianza_acumulada = np.cumsum(pca.explained_variance_ratio_)
plt.plot(varianza_acumulada)
plt.show()
# Author's notes on the cumulative variance (values not re-checked here):
# comp1 0.72, comp2 0.94, comp3 0.98
columns = list(pca.explained_variance_ratio_)
columns

In [None]:
# Column names PC1, PC2, ... with one name per fitted component.
columns = [f'PC{numero}' for numero in range(1, len(pca.explained_variance_ratio_) + 1)]
columns

In [None]:
# Table of the samples projected onto the components, one column per component.
df = pd.DataFrame(data=x_pca, columns=columns)
display(df)
df.shape

In [None]:
# Scatter plot of the first two components
plt.scatter(df['PC1'], df['PC2'])
plt.show()

In [None]:
# Attempt to color the points by species. df was built from the PCA output
# with only the PC columns, so it has no 'species' column for hue to use;
# this call is kept as-is to show that the species must be joined back first.
sns.scatterplot(data=df, x='PC1', y='PC2', hue='species')

In [None]:
# There is no species column in the dataframe
df.head(n=5)

In [None]:
# A join/merge is needed to bring the species back; rows are matched on the index.
especies = df_iris['species']
df_joined = df.join(especies, how='inner')
display(df_joined.sample(n=5))

In [None]:
# Author's observation: the three species separate well (PC1 and PC2 absorb
# most of the variance).
sns.scatterplot(x='PC1', y='PC2', hue='species', data=df_joined)

In [None]:
# Share of the total variance explained by each fitted component.
pca.explained_variance_ratio_