In [1]:
# data analysis and wrangling
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## `read_csv` takes an `encoding` option to handle files saved with different character encodings

In [2]:
# Load the passenger table; the file encoding is given explicitly
# (read_csv would otherwise assume "utf-8").
df = pd.read_csv(
    "Titanic.csv",
    encoding="ISO-8859-1",
)

In [3]:
# Preview the first five rows (5 is head's default n).
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# Distinct values present in the Survived column.
df["Survived"].unique()

array([0, 1])

### std: standard deviation of the observations.

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### Notice that the Age count is less than 891, so that column contains null values

In [6]:
# True if at least one Age entry is missing.
df["Age"].isna().any()

True

In [7]:
# True if at least one Parch entry is missing.
df["Parch"].isna().any()

False

In [8]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


# describe

In [9]:
# Called with no arguments, describe() summarises the numeric columns only.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [10]:
# Summarise the object-dtype (string) columns: count, unique, top, freq.
df.describe(include=["object"])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Salkjelsvik, Miss. Anna Kristine",male,347082,B96 B98,S
freq,1,577,7,4,644


In [11]:
# Select the row(s) whose Name matches this passenger exactly.
df.loc[df["Name"] == "Moor, Mrs. (Beila)"]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
823,824,1,3,"Moor, Mrs. (Beila)",female,27.0,0,1,392096,12.475,E121,S


# describe test

In [12]:
# Toy frame with one categorical, one numeric and one object column, used
# below to show how describe() treats each kind of column.
df_test = pd.DataFrame(
    data={
        "categorical": pd.Categorical(["dead", "alive", "sick", "jawou behi"]),
        "numeric": [1, 0, 2, 3],
        "object": ["d", "a", "a", "c"],
    }
)

In [13]:
# Display the toy frame.
df_test

Unnamed: 0,categorical,numeric,object
0,dead,1,d
1,alive,0,a
2,sick,2,a
3,jawou behi,3,c


In [14]:
# With no arguments, describe() summarises only the numeric column.
df_test.describe()

Unnamed: 0,numeric
count,4.0
mean,1.5
std,1.290994
min,0.0
25%,0.75
50%,1.5
75%,2.25
max,3.0


### `top` is the most common value; `freq` is the number of times that value occurs.

In [15]:
# Restrict the summary to columns of object dtype.
df_test.describe(include=["object"])

Unnamed: 0,object
count,4
unique,3
top,a
freq,2


In [16]:
# include=['category'] restricts the summary to categorical-dtype columns.
df_test.describe(include=['category'])

Unnamed: 0,categorical
count,4
unique,4
top,sick
freq,1


# Drop useless columns

In [17]:
# Remove the columns this notebook treats as useless.
# `errors="ignore"` keeps the cell re-runnable: `df` is rebound here, so on a
# second execution the columns are already gone and the default
# (errors="raise") would fail with a KeyError.
df = df.drop(columns=["Name", "Ticket", "Cabin"], errors="ignore")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


# Drop null values

In [18]:
# Keep only the rows that have no missing value in any remaining column
# (axis="index" and how="any" are dropna's defaults, spelled out here).
df = df.dropna(axis="index", how="any")

# groupby

In [19]:
# Distinct passenger classes present in the data.
df["Pclass"].unique()

array([3, 1, 2])

### Number of survivors in each Pclass (sum of the 0/1 Survived column)

In [20]:
# Sum the 0/1 Survived flag within each Pclass; as_index=False keeps Pclass
# as an ordinary column of the result.
df.groupby("Pclass", as_index=False)["Survived"].sum()

Unnamed: 0,Pclass,Survived
0,1,120
1,2,83
2,3,85


In [21]:
# Count first-class survivors directly with boolean filters; the result
# should match the Pclass 1 row of the grouped sum.
a = df.loc[df["Pclass"] == 1]
len(a.loc[a["Survived"] == 1])

120

### Did more males or more females survive?

In [22]:
# Sum the 0/1 Survived flag per Sex; Sex becomes the index of the result.
df.groupby("Sex")[["Survived"]].sum()

Unnamed: 0_level_0,Survived
Sex,Unnamed: 1_level_1
female,195
male,93


# Evaluating columns