In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('titanic-data.csv')

In [5]:
 data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [7]:
data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [9]:
data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

## Explore the Columns

In [12]:
#Passengers who survived vs not survived

In [14]:
data['Survived'].value_counts()

Survived
0    549
1    342
Name: count, dtype: int64

In [16]:
print('Total number of passengers in the training data...',len(data))
print('Number of passengers who survived...',len(data[data['Survived']==1]))
print("Number of passengers who didn't survived...", len(data[data['Survived']==0]))

Total number of passengers in the training data... 891
Number of passengers who survived... 342
Number of passengers who didn't survived... 549


In [18]:
data['Sex'].value_counts()

Sex
male      577
female    314
Name: count, dtype: int64

In [24]:
#What is the % of men and women who survived?
print("% of male who survived",100*np.mean(data['Survived'][data['Sex'] == 'male']))
print("% of female who survived",100*np.mean(data['Survived'][data['Sex'] == 'female']))

% of male who survived 18.890814558058924
% of female who survived 74.20382165605095


In [4]:
data['Pclass'].value_counts()

Pclass
3    491
1    216
2    184
Name: count, dtype: int64

In [5]:
print("% of passengers who survived in first class",100*np.mean(data['Survived'][data['Pclass']==1]))
print("% of passengers who survived in first class",100*np.mean(data['Survived'][data['Pclass']==2]))
print("% of passengers who survived in first class",100*np.mean(data['Survived'][data['Pclass']==3]))

% of passengers who survived in first class 62.96296296296296
% of passengers who survived in first class 47.28260869565217
% of passengers who survived in first class 24.236252545824847


In [7]:
#data[["Pclass","Survived"]].groupby(["Pclass"], as_index = False).mean()
data[["Pclass","Survived"]].groupby(["Pclass"], as_index = False).mean()

Unnamed: 0,Pclass,Survived
0,1,0.62963
1,2,0.472826
2,3,0.242363


## Summary

In [8]:
data.shape

(891, 12)

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [10]:
data['Age'].value_counts()

Age
24.00    30
22.00    27
18.00    26
19.00    25
28.00    25
         ..
36.50     1
55.50     1
0.92      1
23.50     1
74.00     1
Name: count, Length: 88, dtype: int64

In [12]:
data["Cabin"]     #we have to deal with NA values

0       NaN
1       C85
2       NaN
3      C123
4       NaN
       ... 
886     NaN
887     B42
888     NaN
889    C148
890     NaN
Name: Cabin, Length: 891, dtype: object

In [14]:
data["Sex"]

0        male
1      female
2      female
3      female
4        male
        ...  
886      male
887    female
888    female
889      male
890      male
Name: Sex, Length: 891, dtype: object

In [13]:
#converting categorical values into numeric
df2 = data.copy()
df2["Sex"] = data["Sex"].apply(lambda x:1 if x == "male" else 0)
df2["Sex"]

0      1
1      0
2      0
3      0
4      1
      ..
886    1
887    0
888    0
889    1
890    1
Name: Sex, Length: 891, dtype: int64

## Dealing with Missing Values

In [15]:
df2 = data.copy() #dataframe copy
df2.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [16]:
int(data["Age"].mean())

29

In [17]:
df2["Age"] = df2["Age"].fillna(np.mean(df2["Age"]))
df2.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [18]:
df2.Embarked.value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [19]:
# we can drop the null values in Embarked columns
# we can fill them with mode of the column i.e. S

In [20]:
df2["Embarked"].mode()

0    S
Name: Embarked, dtype: object

In [22]:
df2["Embarked"].fillna(df2["Embarked"].mode()[0],inplace=True)

In [24]:
df2.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [26]:
df2["Cabin"].value_counts()

Cabin
B96 B98        4
G6             4
C23 C25 C27    4
C22 C26        3
F33            3
              ..
E34            1
C7             1
C54            1
E36            1
C148           1
Name: count, Length: 147, dtype: int64

In [28]:
df2["Cabin"].mode()

0        B96 B98
1    C23 C25 C27
2             G6
Name: Cabin, dtype: object

In [30]:
df2["Cabin"].fillna(df2["Cabin"].mode()[0],inplace=True)

In [31]:
df2.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [35]:
df2.corr

<bound method DataFrame.corr of      PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex        Age  \
0                              Braund, Mr. Owen Harris    male  22.000000   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.000000   
2                               Heikkinen, Miss. Laina  female  26.000000   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.000000   
4                             Allen, Mr. William Henry    male  35.000000   
..                                               