In [1]:
import pandas as pd

# Load the passenger records from the CSV file into a DataFrame.
csv_path = 'tested.csv'
data = pd.read_csv(csv_path)

# Preview the first rows.
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [2]:
# Remove the columns that will not be used as model features.
unwanted_columns = ["PassengerId", "Name", "Cabin", "Ticket"]
data.drop(columns=unwanted_columns, inplace=True)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,34.5,0,0,7.8292,Q
1,1,3,female,47.0,1,0,7.0,S
2,0,2,male,62.0,0,0,9.6875,Q
3,0,3,male,27.0,0,0,8.6625,S
4,1,3,female,22.0,1,1,12.2875,S


In [3]:
# Count the missing values in each column.
missing_per_column = data.isnull().sum()
print(missing_per_column)

Survived     0
Pclass       0
Sex          0
Age         86
SibSp        0
Parch        0
Fare         1
Embarked     0
dtype: int64


In [4]:
# Dimensions (rows, columns) of the frame before rows with missing values are removed.
data.shape

(418, 8)

In [5]:
# Remove the rows that lack a value in Fare or Age, then re-count missing values.
required_columns = ['Fare', 'Age']
data.dropna(subset=required_columns, inplace=True)
remaining_missing = data.isnull().sum()
print(remaining_missing)

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


In [6]:
# Summary of column dtypes and non-null counts after dropping the incomplete rows.
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 331 entries, 0 to 415
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  331 non-null    int64  
 1   Pclass    331 non-null    int64  
 2   Sex       331 non-null    object 
 3   Age       331 non-null    float64
 4   SibSp     331 non-null    int64  
 5   Parch     331 non-null    int64  
 6   Fare      331 non-null    float64
 7   Embarked  331 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 23.3+ KB


In [7]:
# Dimensions (rows, columns) after removing rows with a missing Fare or Age.
data.shape

(331, 8)

In [8]:
# One-hot encode the categorical variables.
# Build the dummy (indicator) columns, one per category value:
dummies = pd.get_dummies(data['Sex'])
dummies2 = pd.get_dummies(data['Embarked'])

In [9]:
# Append both sets of dummy columns to the original dataset, column-wise:
frames_to_join = [data, dummies, dummies2]
new_df = pd.concat(frames_to_join, axis=1)
print(type(new_df))

<class 'pandas.core.frame.DataFrame'>


In [10]:
# Drop the original categorical columns now that their dummy columns exist:
encoded_sources = ['Sex', 'Embarked']
new_df.drop(columns=encoded_sources, inplace=True)
print(new_df.head(10))

new_df.info()

   Survived  Pclass   Age  SibSp  Parch     Fare  female   male      C      Q  \
0         0       3  34.5      0      0   7.8292   False   True  False   True   
1         1       3  47.0      1      0   7.0000    True  False  False  False   
2         0       2  62.0      0      0   9.6875   False   True  False   True   
3         0       3  27.0      0      0   8.6625   False   True  False  False   
4         1       3  22.0      1      1  12.2875    True  False  False  False   
5         0       3  14.0      0      0   9.2250   False   True  False  False   
6         1       3  30.0      0      0   7.6292    True  False  False   True   
7         0       2  26.0      1      1  29.0000   False   True  False  False   
8         1       3  18.0      0      0   7.2292    True  False   True  False   
9         0       3  21.0      2      0  24.1500   False   True  False  False   

       S  
0  False  
1   True  
2  False  
3   True  
4   True  
5   True  
6  False  
7   True  
8  False 

In [11]:
# The original frame still holds the raw Sex and Embarked columns; only new_df had them dropped.
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,34.5,0,0,7.8292,Q
1,1,3,female,47.0,1,0,7.0,S
2,0,2,male,62.0,0,0,9.6875,Q
3,0,3,male,27.0,0,0,8.6625,S
4,1,3,female,22.0,1,1,12.2875,S


In [12]:
from sklearn.ensemble import RandomForestClassifier


# Split the dataset into the feature matrix and the target class.
# Index.difference returns the remaining labels sorted, so the feature
# columns end up in alphabetical order rather than in new_df's order.
x = new_df[new_df.columns.difference(['Survived'])]
y = new_df['Survived']

# A random forest is stochastic (bootstrap samples and feature subsets);
# fixing the seed makes the fitted model identical on every run.
classifier = RandomForestClassifier(random_state=42)
classifier.fit(x, y)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:
# Preview the feature matrix that is passed to the classifier.
x.head()

Unnamed: 0,Age,C,Fare,Parch,Pclass,Q,S,SibSp,female,male
0,34.5,False,7.8292,0,3,True,False,0,False,True
1,47.0,False,7.0,0,3,False,True,1,True,False
2,62.0,False,9.6875,0,2,True,False,0,False,True
3,27.0,False,8.6625,0,3,False,True,0,False,True
4,22.0,False,12.2875,1,3,False,True,1,True,False


In [14]:
import joblib

# Persist the classifier that was already fitted on (x, y) earlier in the
# notebook. Training a second forest here would only repeat the same fit on
# the same data, so the already-fitted model is the one written to disk.
joblib.dump(classifier, 'Titanic.pkl')

['Titanic.pkl']

In [15]:
# List the feature column names in the order used for training.
feature_names = x.columns.tolist()
print(feature_names)


['Age', 'C', 'Fare', 'Parch', 'Pclass', 'Q', 'S', 'SibSp', 'female', 'male']


In [16]:
import joblib

# Reload the persisted model from disk.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
classifier = joblib.load("Titanic.pkl")

# One hand-written passenger record, keyed by the same feature columns
# that were printed for the training matrix.
sample_passenger = {
    'Age': 34,
    'C': False,
    'Fare': 7.8292,
    'Parch': 0,
    'Pclass': 3,
    'Q': True,
    'S': False,
    'SibSp': 0,
    'female': False,
    'male': True,
}
df_prueba = pd.DataFrame([sample_passenger])

# Predict the Survived class for the single-row frame.
prediccion = classifier.predict(df_prueba)

print("Predicción:", prediccion)

Predicción: [0]
