#1. Импортируем нужные библиотеки

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

%matplotlib inline

#2. Загрузка датасета

In [96]:
titanic_df = pd.read_csv('titanic.csv')
titanic_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


#3. Oценка данных

In [97]:
# выведем размерность нашего датасета:
titanic_df.shape

(891, 12)

In [98]:
# выведем типы данных:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [99]:
# выведем основные статистические показатели:
titanic_df.describe().round(3)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.384,2.309,29.699,0.523,0.382,32.204
std,257.354,0.487,0.836,14.526,1.103,0.806,49.693
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.91
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.454
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.329


#4. Feature Engineering:

##4.1. Обработка категориальных признаков и создание новых признаков из уже существующих

In [100]:
#Создаем столбец с фамилией:
titanic_df["Family"] = titanic_df["Name"].apply(lambda s: s.split(",")[0])
titanic_df.reset_index(drop=True,inplace=True)
titanic_df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Family
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Braund
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Cumings
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Heikkinen
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Futrelle
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Allen


In [101]:
print('Family: ', titanic_df.Family.unique(),'\n',
    'Embarked: ', titanic_df.Embarked.unique(),'\n',
      'Sex: ', titanic_df.Sex.unique(),'\n',
      'Age: ', titanic_df.Age.unique())


Family:  ['Braund' 'Cumings' 'Heikkinen' 'Futrelle' 'Allen' 'Moran' 'McCarthy'
 'Palsson' 'Johnson' 'Nasser' 'Sandstrom' 'Bonnell' 'Saundercock'
 'Andersson' 'Vestrom' 'Hewlett' 'Rice' 'Williams' 'Vander Planke'
 'Masselmani' 'Fynney' 'Beesley' 'McGowan' 'Sloper' 'Asplund' 'Emir'
 'Fortune' "O'Dwyer" 'Todoroff' 'Uruchurtu' 'Spencer' 'Glynn' 'Wheadon'
 'Meyer' 'Holverson' 'Mamee' 'Cann' 'Nicola-Yarred' 'Ahlin' 'Turpin'
 'Kraeff' 'Laroche' 'Devaney' 'Rogers' 'Lennon' "O'Driscoll" 'Samaan'
 'Arnold-Franchi' 'Panula' 'Nosworthy' 'Harper' 'Faunthorpe' 'Ostby'
 'Woolner' 'Rugg' 'Novel' 'West' 'Goodwin' 'Sirayanian' 'Icard' 'Harris'
 'Skoog' 'Stewart' 'Moubarek' 'Nye' 'Crease' 'Kink' 'Jenkin' 'Hood'
 'Chronopoulos' 'Bing' 'Moen' 'Staneff' 'Moutal' 'Caldwell' 'Dowdell'
 'Waelens' 'Sheerlinck' 'McDermott' 'Carrau' 'Ilett' 'Backstrom' 'Ford'
 'Slocovski' 'Celotti' 'Christmann' 'Andreasson' 'Chaffee' 'Dean' 'Coxon'
 'Shorney' 'Goldschmidt' 'Greenfield' 'Doling' 'Kantor' 'Petranec'
 'Petroff' 'Whi

In [102]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()

def LabelEncoder (data, feature):
  data[feature] = label_encoder.fit_transform(data[feature])

  return(data)


In [103]:
from sklearn.preprocessing import OneHotEncoder

def One_Hot_Encoder (data, feature):
  encoded_columns = pd.get_dummies(data[feature])
  return(encoded_columns)

In [104]:

# Поработаем с признаками 'SibSp' (Кол-во братьев/сестер или супругов, путешествующих с каждым пассажиром) и 'Parch' (Number of parents of children travelling with each passenger): создадим общий признак, размерности семьи:
titanic_df['FamilySize'] = titanic_df['SibSp'] + titanic_df['Parch']
titanic_df[['FamilySize', 'Survived']].groupby(['FamilySize'],).mean().sort_values(by='FamilySize', ascending=False)

titanic_df["Alone"] = titanic_df['FamilySize'] == 0 # Создадим признак, показывающий, кто был в поездке один, без семьи
titanic_df[["Alone", "Survived"]].groupby(['Alone'],).mean().sort_values(by='Alone', ascending=False)
LabelEncoder(titanic_df, 'Alone')# применим кодирование


# Поработаем с признаком 'Fare': создадим признак, показывающий кто к какому классу относится по билетам (билет business-class/ билет middle-class/ билет low-class):
titanic_df['CategoricalFare'] = pd.qcut(titanic_df['Fare'], 4, precision=0)
titanic_df[["CategoricalFare", "Survived"]].groupby(['CategoricalFare'],).mean().sort_values(by='CategoricalFare', ascending=False)

# titanic_df['CategoricalFare'] = titanic_df['CategoricalFare'].apply(rightValue) # установим интервалы

LabelEncoder(titanic_df, 'CategoricalFare')# применим кодирование


# для Ticket, 'Cabin' воспользуемся кодированием через Label Encoder:
LabelEncoder(titanic_df, 'Ticket')
LabelEncoder(titanic_df, 'Cabin')


# Поработаем с признаком 'Embarked' (порт отправления данного пассажира):
titanic_df['Embarked'] = titanic_df['Embarked'].fillna('S') # заполним пустые значения 'S'
titanic_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()
LabelEncoder(titanic_df, 'Embarked') # применим кодирование


drop_elements = ['Fare', 'Ticket', 'Cabin', 'SibSp', 'Parch', 'FamilySize']

titanic_df = titanic_df.drop(drop_elements, axis = 1)

titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Embarked,Family,Alone,CategoricalFare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,2,Braund,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,0,Cumings,0,3
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,2,Heikkinen,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,2,Futrelle,0,3
4,5,0,3,"Allen, Mr. William Henry",male,35.0,2,Allen,1,1
...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,2,Montvila,1,1
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,2,Graham,1,2
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,2,Johnston,0,2
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,Behr,1,2


#5. Задания:

##A. Сколько пассажиров выжило, а сколько - нет?

In [None]:
# Survived: Признак, показывающий был ли спасен данный пассажир или нет. 1 означает, что удалось выжить, и 0 - не удалось спастись.
print("Выжило:", titanic_df['Survived'].eq(1).sum(),
      "Умерло:", titanic_df['Survived'].eq(0).sum())

Выжило: 342 Умерло: 549


##B. Создайте столбец "IsChild", который равен 1, если возраст меньше 20, и 0 иначе. Для пропущенных значений поведение функции может быть произвольным.

In [None]:
# Fill missing values in age with random data based on mean and standard variation:
age_avg = titanic_df['Age'].mean()
age_std = titanic_df['Age'].std()
age_null_count = titanic_df['Age'].isnull().sum()
age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size = age_null_count)
titanic_df['Age'][np.isnan(titanic_df['Age'])] = age_null_random_list

titanic_df['IsChild'] = np.where(titanic_df['Age'] <= 20, 1, 0)


titanic_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_df['Age'][np.isnan(titanic_df['Age'])] = age_null_random_list


Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,Embarked,Family,Alone,CategoricalFare,IsChild
0,1,0,3,"Braund, Mr. Owen Harris",22.0,2,Braund,0,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,0,Cumings,0,3,0
2,3,1,3,"Heikkinen, Miss. Laina",26.0,2,Heikkinen,1,1,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,2,Futrelle,0,3,0
4,5,0,3,"Allen, Mr. William Henry",35.0,2,Allen,1,1,0
...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",27.0,2,Montvila,1,1,0
887,888,1,1,"Graham, Miss. Margaret Edith",19.0,2,Graham,1,2,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",32.0,2,Johnston,0,2,0
889,890,1,1,"Behr, Mr. Karl Howell",26.0,0,Behr,1,2,0


##C. Какова доля семей, в которых минимальный возраст меньше 20 (семьи с детьми)?

In [None]:
# сортируем по одинаковой фамилии (одна семья)
# сортируем по возрасту (диапазон от 0 до 20)
# считаем какую долю составляют эти семьи по отношению ко всем семьям

print(titanic_df[titanic_df['Age'] <= 20]['Family'].count())

217


##D. Какова доля выживших пассажиров из класса 3? А пассажиров из класса 1?

In [73]:
survived_df = titanic_df[['Pclass', 'Survived']][titanic_df['Survived'] == 1].groupby('Pclass', as_index=False).count()
survived_df['count'] = titanic_df.groupby('Pclass', as_index=False).agg({'Pclass': 'count'})
survived_df['survived_in_class'] = survived_df['Survived'] / survived_df['count']
survived_df

Unnamed: 0,Pclass,Survived,count,survived_in_class
0,1,136,216,0.62963
1,2,87,184,0.472826
2,3,119,491,0.242363


##E. Какова доля выживших женщин из первого класса? А доля выживших мужчин из 3 класса?

In [112]:
# Поработаем с признаком 'Sex' (пол):
LabelEncoder(titanic_df, 'Sex')
titanic_df["Sex"] = titanic_df['Sex'].astype('int')

titanic_df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Embarked,Family,Alone,CategoricalFare
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,2,Braund,0,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,0,Cumings,0,3
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,2,Heikkinen,1,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,2,Futrelle,0,3
4,5,0,3,"Allen, Mr. William Henry",1,35.0,2,Allen,1,1
...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,27.0,2,Montvila,1,1
887,888,1,1,"Graham, Miss. Margaret Edith",0,19.0,2,Graham,1,2
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,,2,Johnston,0,2
889,890,1,1,"Behr, Mr. Karl Howell",1,26.0,0,Behr,1,2


In [113]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PassengerId      891 non-null    int64  
 1   Survived         891 non-null    int64  
 2   Pclass           891 non-null    int64  
 3   Name             891 non-null    object 
 4   Sex              891 non-null    int64  
 5   Age              714 non-null    float64
 6   Embarked         891 non-null    int64  
 7   Family           891 non-null    object 
 8   Alone            891 non-null    int64  
 9   CategoricalFare  891 non-null    int64  
dtypes: float64(1), int64(7), object(2)
memory usage: 69.7+ KB


In [118]:
# 0 - женщина
# 1 - мужчина

survived_df = titanic_df[['Pclass', 'Survived']][(titanic_df['Survived'] == 1)& (titanic_df['Sex'] == 0) & (titanic_df["Pclass"] == 1)].groupby('Pclass', as_index=False).count()
survived_df['count'] = titanic_df[(titanic_df['Sex'] == 0) & (titanic_df["Pclass"] == 1)].groupby('Pclass', as_index=False).agg({'Pclass': 'count'})
survived_df['survived_in_class'] = survived_df['Survived'] / survived_df['count']

survived_df

Unnamed: 0,Pclass,Survived,count,survived_in_class
0,1,91,94,0.968085


In [119]:
survived_df = titanic_df[['Pclass', 'Survived']][(titanic_df['Survived'] == 1)& (titanic_df['Sex'] == 1) & (titanic_df["Pclass"] == 3)].groupby('Pclass', as_index=False).count()
survived_df['count'] = titanic_df[(titanic_df['Sex'] == 1) & (titanic_df["Pclass"] == 3)].groupby('Pclass', as_index=False).agg({'Pclass': 'count'})
survived_df['survived_in_class'] = survived_df['Survived'] / survived_df['count']

survived_df

Unnamed: 0,Pclass,Survived,count,survived_in_class
0,3,47,347,0.135447
