## Import Libraries

In [1]:
import pandas as pd

## Obtain the Raw Data

In [2]:
# Location of the raw Titanic CSV (GitHub raw-content link) that this notebook loads.
URL_TITANIC = (
    'https://raw.githubusercontent.com/'
    'dushyantkhosla/tiny-datasets/master/titanic.csv'
)

In [6]:
# Fetch the CSV at URL_TITANIC and parse it into the raw, unmodified DataFrame.
df_raw = pd.read_csv(filepath_or_buffer=URL_TITANIC)

## Take a backup

In [7]:
# Persist an untouched copy of the download before any cleaning happens.
# index=False keeps the default integer row index out of the file, so the backup
# holds exactly the source columns and re-reading it with pd.read_csv does not
# introduce an extra "Unnamed: 0" column.
# NOTE: the ../data/00-raw/ directory must already exist; to_csv does not create it.
# NOTE(review): if another notebook reads this file with index_col=0, update it.
df_raw.to_csv("../data/00-raw/titanic_0.csv", index=False)

## Inspect the Dataset

- Number of rows, columns
- Column Names
- Data Types
- Missing Values etc.

In [24]:
# Preview the first five rows to get a feel for the columns and their values.
df_raw.head(n=5)

Unnamed: 0,passenger class,survived,name,AGE,Siblings/Spouses Aboard,par&ch,ticket,fare,cabin Number,embarked
0,1,1,"Allen, Miss. Elisabeth Walton",29.0,0,0,24160,211.3375,B5,S
1,1,1,"Allison, Master. Hudson Trevor",0.9167,1,2,113781,151.55,C22 C26,S
2,1,0,"Allison, Miss. Helen Loraine",2.0,1,2,113781,151.55,C22 C26,S
3,1,0,"Allison, Mr. Hudson Joshua Creighton",30.0,1,2,113781,151.55,C22 C26,S
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",25.0,1,2,113781,151.55,C22 C26,S


In [8]:
# (number of rows, number of columns) of the raw frame.
df_raw.shape

(1309, 10)

In [12]:
# List the column labels as a plain Python list so they print one per line.
list(df_raw.columns)

['passenger class',
 'survived',
 'name',
 'AGE',
 'Siblings/Spouses Aboard',
 'par&ch',
 'ticket',
 'fare',
 'cabin Number',
 'embarked']

In [21]:
# Flag rows that repeat an earlier row across *all* columns, then tally the flags.
# A result containing only False means there are no fully duplicated rows.
df_raw.duplicated(subset=None, keep='first').value_counts()

False    1309
dtype: int64

In [15]:
# Fraction of missing values per column (mean of the boolean null mask).
# Rounded to two decimals, so a rate below 0.005 is displayed as 0.00.
df_raw.isna().mean().round(decimals=2)

passenger class            0.00
survived                   0.00
name                       0.00
AGE                        0.20
Siblings/Spouses Aboard    0.00
par&ch                     0.00
ticket                     0.00
fare                       0.00
cabin Number               0.77
embarked                   0.00
dtype: float64

In [19]:
# dtype pandas inferred for each column when parsing the CSV.
df_raw.dtypes

passenger class              int64
survived                     int64
name                        object
AGE                        float64
Siblings/Spouses Aboard      int64
par&ch                       int64
ticket                      object
fare                       float64
cabin Number                object
embarked                    object
dtype: object

In [22]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df_raw.describe()

Unnamed: 0,passenger class,survived,AGE,Siblings/Spouses Aboard,par&ch,fare
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,2.294882,0.381971,29.881135,0.498854,0.385027,33.295479
std,0.837836,0.486055,14.4135,1.041658,0.86556,51.758668
min,1.0,0.0,0.1667,0.0,0.0,0.0
25%,2.0,0.0,21.0,0.0,0.0,7.8958
50%,3.0,0.0,28.0,0.0,0.0,14.4542
75%,3.0,1.0,39.0,1.0,0.0,31.275
max,3.0,1.0,80.0,8.0,9.0,512.3292


In [23]:
# Summary of the object-dtype (text) columns: count, unique, top value and its frequency.
df_raw.describe(include=['object'])

Unnamed: 0,name,ticket,cabin Number,embarked
count,1309,1309,295,1307
unique,1307,929,186,3
top,"Kelly, Mr. James",CA. 2343,C23 C25 C27,S
freq,2,11,6,914


## Takeaways

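- Column names need to be standardized (mixed case, spaces and special characters, e.g. `AGE`, `cabin Number`, `par&ch`, `Siblings/Spouses Aboard`)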


- There are no fully **duplicated** rows (note that `name` has only 1307 unique values across 1309 rows, so a few names repeat)


- There are no `date` columns


- There are some `categoricals`
    - all except _embarked_ have too many unique values
    - we might need to create **dummies**


- There are some `numerics`
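    - there are no obviously invalid values, but `fare` is heavily right-skewed (max 512.33 vs. a 75th percentile of 31.275), so check it for outliers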
    - there are some missing values (`AGE` is about 20% missing; `fare` has one missing value)


- There is an `id` variable
    - it has no predictive power; consider dropping it