In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [101]:
titanic = sns.load_dataset('titanic')

duplicated_features = ['class', 'who', 'embark_town', 'alive', 'adult_male', 'pclass']
titanic = titanic.drop(columns=duplicated_features)

titanic

Unnamed: 0,survived,sex,age,sibsp,parch,fare,embarked,deck,alone
0,0,male,22.0,1,0,7.2500,S,,False
1,1,female,38.0,1,0,71.2833,C,C,False
2,1,female,26.0,0,0,7.9250,S,,True
3,1,female,35.0,1,0,53.1000,S,C,False
4,0,male,35.0,0,0,8.0500,S,,True
...,...,...,...,...,...,...,...,...,...
886,0,male,27.0,0,0,13.0000,S,,True
887,1,female,19.0,0,0,30.0000,S,B,True
888,0,female,,1,2,23.4500,S,,False
889,1,male,26.0,0,0,30.0000,C,C,True


In [102]:
titanic.isna().sum(axis=0)

survived      0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
deck        688
alone         0
dtype: int64

In [103]:
# Обработка пропусков

## Embarked - заменяем пропуски модой
titanic['embarked'] = titanic['embarked'].fillna(titanic.embarked.mode().iloc[0])

## Age - замена средним
# titanic['age'] = titanic['age'].fillna(titanic.age.mean())
titanic = titanic.dropna(subset=['age'])

## Deck - замена специальным значением
titanic.deck = titanic.deck.astype('str')
titanic.loc[titanic.deck.eq('nan'), 'deck'] = 'U'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [104]:
# Кодирование категориальных признаков

## sex - у нас бинарный, там всё просто
sex_map = {sex: i for i, sex in enumerate(titanic.sex.unique())}
titanic.sex = titanic.sex.map(sex_map)

## embarked/deck - похитрее
features = ['embarked', 'deck']

for feature in features:
    for cat in titanic[feature].unique():
        col_name = f'{feature}_{cat}'
        values = titanic[feature].eq(cat)
        
        titanic[col_name] = values.astype(int)
    
    titanic = titanic.drop(columns=[feature])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic[col_name] = values.astype(int)


In [105]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [106]:
target_feature = 'survived'
X = titanic.drop(columns=target_feature)
y = titanic[target_feature]

val_percentage = 0.2
# np.random.seed(1)

rf_acc = []
tree_acc = []

for i in range(100):
    train_rows = np.random.rand(y.size) > val_percentage

    X_train = X[train_rows]
    X_val = X[~train_rows]

    y_train = y[train_rows]
    y_val = y[~train_rows]

    rmodel = RandomForestClassifier()
    rmodel.fit(X_train, y_train)

    rf_predict = rmodel.predict(X_val)
    rf_accuracy = (rf_predict == y_val).mean() *  100 

    model = DecisionTreeClassifier(max_depth=3, )
    model.fit(X_train, y_train)

    tree_predict = model.predict(X_val)
    tree_accuracy = (tree_predict == y_val).mean() *  100

    rf_acc.append(rf_accuracy)
    tree_acc.append(tree_accuracy)
    
tree_accuracy = np.mean(tree_acc)
rf_accuracy = np.mean(rf_acc)

print(f'Tree accuracy: {tree_accuracy:.2f}%')
print(f'Random Forest accuracy: {rf_accuracy:.2f}%')

Tree accuracy: 77.66%
Random Forest accuracy: 76.92%


In [100]:
imp = rmodel.feature_importances_
imp_dict = {name: importance for name, importance in zip(X.columns, imp)}

top5 = sorted(imp_dict, key=imp_dict.get, reverse=True)[:5]

for feat in top5:
    print(feat, imp_dict[feat])

fare 0.2218766575273096
age 0.21886514376191363
adult_male 0.16007494504088307
sex 0.12887437616944358
pclass 0.08189086603988517
