# Leccion 5 Fundamentos del BigData

### Feature Engineering

In [1]:
# Principales dependencias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Importamos algunos algoritmos de clasificación
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

In [3]:
# Una posible forma para evaluar nuestro modelo
from sklearn.metrics import accuracy_score

In [4]:
df = pd.read_csv("train.csv")
df = df.drop('PassengerId', axis=1)
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df.isnull().sum()

Survived      0
Pclass        0
Name          0
Sex           0
Age         177
SibSp         0
Parch         0
Ticket        0
Fare          0
Cabin       687
Embarked      2
dtype: int64

### -1- Name - no la tendremos en cuenta por simplificar

#### df["Name"]=.str.extract('([A-Za-z]+).',expand=False)
####  sería uan posible forma de analizar la columna Name, pero no lo haremos

### -2- Age - Usamos el valor promedio de la columna para rellenar los valores faltantes

In [6]:
df.Age.isnull().sum()

177

In [7]:
df.Age = df.Age.fillna(df.Age.mean())

In [8]:
df.Age.isnull().sum()

0

### -3- Ticket - No la tendremos en cuenta por simplificar

In [9]:
df.Ticket.value_counts()

347082      7
CA. 2343    7
1601        7
3101295     6
CA 2144     6
           ..
9234        1
19988       1
2693        1
PC 17612    1
370376      1
Name: Ticket, Length: 681, dtype: int64

### -4- Cabin - No la tendremos en cuenta por falta de información

In [10]:
df.Cabin.isnull().sum(), len(df)

(687, 891)

### -5- Embarked

In [11]:
df.Embarked.isnull().sum()

2

In [12]:
df.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [13]:
df["Embarked"] = df["Embarked"].fillna("S")
df.Embarked.value_counts()

S    646
C    168
Q     77
Name: Embarked, dtype: int64

#### Borramos del DataFrame las columnas mencionadas

In [14]:
df.head(2)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [15]:
df = df.drop(["Name", "Ticket", "Cabin"], axis=1)
df.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C


# Datos Categóricos

In [16]:
# Una vez tenemos nuestro DataFrame(df)
# Lo siguiente será trabajar con las columnas que tienen texto.

In [17]:
# pd.get_dummies?

In [18]:
# drop_first=True porque queremos evitar multicolinearidad
# de hecho en Sex si decimos que es hombre, ya no es por ende, mujer. 
# entonces siendo 1 Lógico Hombre, por ejemplo, sería redundante.
df = pd.get_dummies(df, columns=['Sex', 'Pclass', 'Embarked'], drop_first=True) 
df.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
0,0,22.0,1,0,7.25,1,0,1,0,1
1,1,38.0,1,0,71.2833,0,0,0,0,0
2,1,26.0,0,0,7.925,0,0,1,0,1
3,1,35.0,1,0,53.1,0,0,0,0,1
4,0,35.0,0,0,8.05,1,0,1,0,1


## Escalado de los datos

In [19]:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

In [20]:
# Standsrd Scale 

# x-mean(x)/std(x)

df.Age = (df.Age - np.mean(df.Age, axis=0)) / (np.std(df.Age, axis=0))
df.Fare = (df.Fare - np.mean(df.Fare, axis=0)) / (np.std(df.Fare, axis=0))
df.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
0,0,-0.592481,1,0,-0.502445,1,0,1,0,1
1,1,0.638789,1,0,0.786845,0,0,0,0,0
2,1,-0.284663,0,0,-0.488854,0,0,1,0,1
3,1,0.407926,1,0,0.42073,0,0,0,0,1
4,0,0.407926,0,0,-0.486337,1,0,1,0,1


# Obtención x,y

In [21]:
x = df.drop("Survived", axis=1)
x.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
0,-0.592481,1,0,-0.502445,1,0,1,0,1
1,0.638789,1,0,0.786845,0,0,0,0,0
2,-0.284663,0,0,-0.488854,0,0,1,0,1
3,0.407926,1,0,0.42073,0,0,0,0,1
4,0.407926,0,0,-0.486337,1,0,1,0,1


In [22]:
y = df["Survived"]
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [23]:
"""x= x.values
y = y.values """

'x= x.values\ny = y.values '

# Entrenamiento y prueba

In [24]:
from sklearn.model_selection import train_test_split 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [49]:
x_train.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
331,1.215947,0,0,-0.074583,1,0,0,0,1
733,-0.515526,0,0,-0.386671,1,1,0,0,1
382,0.177063,0,0,-0.488854,1,0,1,0,1
704,-0.284663,1,0,-0.49028,1,0,1,0,1
813,-1.82375,4,2,-0.018709,0,0,1,0,1


In [50]:
x_test.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
709,0.0,1,1,-0.341452,1,0,1,0,0
439,0.100109,0,0,-0.437007,1,1,0,0,1
840,-0.746389,0,0,-0.488854,1,0,1,0,1
720,-1.82375,0,1,0.016023,0,1,0,0,1
39,-1.208115,1,0,-0.422074,0,0,1,0,0


In [51]:
y_train.head()

331    0
733    0
382    0
704    0
813    0
Name: Survived, dtype: int64

In [52]:
y_test.head()

709    1
439    0
840    0
720    1
39     1
Name: Survived, dtype: int64

# Pruebo posibles algoritmos

In [53]:
# KNeighborsClassifier
clf = KNeighborsClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc_KN = accuracy_score(y_test, y_pred)
acc_KN

0.8212290502793296

In [54]:
# DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc_DT = accuracy_score(y_test, y_pred)
acc_DT

0.776536312849162

In [55]:
# RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc_RF = accuracy_score(y_test, y_pred)
acc_RF

0.8100558659217877

In [56]:
 # GaussianNB
clf = GaussianNB()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc_NB = accuracy_score(y_test, y_pred)
acc_NB

0.7653631284916201

In [57]:
# SVC 
clf = SVC()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc_SVC = accuracy_score(y_test, y_pred)
acc_SVC

0.8156424581005587

# Busco el que a priori mejor predice

In [34]:
# A priori, y sin ver más parámetros..

# RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
acc_RF = accuracy_score(y_test, y_pred)
acc_RF

0.8100558659217877

# Utilizo ese entrenamiento para test.csv

In [35]:
test = pd.read_csv("test.csv")
test.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [36]:
df_original = pd.read_csv("train.csv")
df_original.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


# Cambios en "test"

In [37]:
test.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,3.0,39.0,1.0,0.0,31.5
max,1309.0,3.0,76.0,8.0,9.0,512.3292


In [38]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [39]:
# df.Age = df.Age.fillna(df.Age.mean())
test.Age = test.Age.fillna(test.Age.mean())
test.Fare = test.Fare.fillna(test.Fare.mean())

In [40]:
test.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

In [41]:
# df.drop(["Name", "Ticket", "Cabin"], axis=1)
test = test.drop(["Name", "Ticket", "Cabin"], axis=1)
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [42]:
# df.Age = (df.Age - np.mean(df.Age, axis=0)) / (np.std(df.Age, axis=0))
# df.Fare = (df.Fare - np.mean(df.Fare, axis=0)) / (np.std(df.Fare, axis=0))
test.Age = (test.Age - np.mean(test.Age, axis=0)) / (np.std(test.Age, axis=0))
test.Fare = (test.Fare - np.mean(test.Fare, axis=0)) / (np.std(test.Fare, axis=0))
test.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,0.334993,0,0,-0.498407,Q
1,893,3,female,1.32553,1,0,-0.513274,S
2,894,2,male,2.514175,0,0,-0.465088,Q
3,895,3,male,-0.25933,0,0,-0.483466,S
4,896,3,female,-0.655545,1,1,-0.418471,S


In [43]:
# df = pd.get_dummies(df, columns=['Sex', 'Pclass', 'Embarked'], drop_first=True) 
test = pd.get_dummies(test, columns=['Sex', 'Pclass', 'Embarked'], drop_first=True) 
test.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
0,892,0.334993,0,0,-0.498407,1,0,1,1,0
1,893,1.32553,1,0,-0.513274,0,0,1,0,1
2,894,2.514175,0,0,-0.465088,1,1,0,1,0
3,895,-0.25933,0,0,-0.483466,1,0,1,0,1
4,896,-0.655545,1,1,-0.418471,0,0,1,0,1


In [44]:
test = test.drop("PassengerId", axis=1)
test.head()

Unnamed: 0,Age,SibSp,Parch,Fare,Sex_male,Pclass_2,Pclass_3,Embarked_Q,Embarked_S
0,0.334993,0,0,-0.498407,1,0,1,1,0
1,1.32553,1,0,-0.513274,0,0,1,0,1
2,2.514175,0,0,-0.465088,1,1,0,1,0
3,-0.25933,0,0,-0.483466,1,0,1,0,1
4,-0.655545,1,1,-0.418471,0,0,1,0,1


In [45]:
# RandomForestClassifier - el "mejor" bajo estas premisas
# y en este caso para "test"
clf = RandomForestClassifier()
# entreno con los datos que tenía del primer dataset
clf.fit(x_train, y_train)
# ahora hago la predicción sobre "test"
y_predecida = clf.predict(test)
y_predecida

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

# Me creo un dataframe con esta información

In [46]:
df_submission = pd.read_csv("gender_submission.csv")
df_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


### Ahora le pego la información DE MI PREDICCIÓN

In [47]:
df_submission["Survived"] = y_predecida
df_submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0


In [48]:
df_submission = pd.DataFrame({'PassengerId': df_submission.PassengerId, 'Survived': df_submission.Survived})
df_submission.to_csv('titanic.csv', index=False)

#### Según los resultados obtenidos el algoritmo "RandomForestClassifier" supera a los demás, subí a Kaggle con "KNeighborsClassifier" y fue peor que con "RandomForestClassifier".

### Ya hablamos que depende del dispositivo que use cada persona y lo mejor es probar todos los que se conozcamos.