<a href="https://colab.research.google.com/github/ergonrizky26/data-loading/blob/main/Data_Loading.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Loading

In [50]:
# Import Libraries

import sys
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

# Display configuration: show every column and round floats to 2 decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.precision', 2)

In [51]:
# Check Version
# Print the interpreter and library versions this notebook was run with.

# Build "major.minor" from sys.version_info: slicing sys.version[0:3] would
# truncate two-digit minor versions (e.g. "3.10.4" -> "3.1").
print('Python:', f'{sys.version_info.major}.{sys.version_info.minor}')
print('Numpy:', np.__version__)
print('Pandas:', pd.__version__)
print('Matplotlib:', matplotlib.__version__)

Python: 3.7
Numpy: 1.21.6
Pandas: 1.3.5
Matplotlib: 3.2.2


### Read Data

In [52]:
# Load the dataset from the CSV file in the working directory

df = pd.read_csv(filepath_or_buffer='titanic.csv')

In [53]:
# Preview the first 5 rows

df.head(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.28
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.92
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [54]:
# Preview the last 5 rows

df.tail(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
882,0,2,Rev. Juozas Montvila,male,27.0,0,0,13.0
883,1,1,Miss. Margaret Edith Graham,female,19.0,0,0,30.0
884,0,3,Miss. Catherine Helen Johnston,female,7.0,1,2,23.45
885,1,1,Mr. Karl Howell Behr,male,26.0,0,0,30.0
886,0,3,Mr. Patrick Dooley,male,32.0,0,0,7.75


In [55]:
# Preview 5 randomly chosen rows
# (no random_state is passed, so the rows shown differ between runs)

df.sample(n=5)

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
73,1,3,Mr. Lee Bing,male,32.0,0,0,56.5
420,0,3,Mrs. Ernst Gilbert (Anna Sigrid Maria Brogren)...,female,28.0,1,1,14.4
100,0,3,Mr. Pastcho Petroff,male,29.0,0,0,7.9
16,0,3,Master. Eugene Rice,male,2.0,4,1,29.12
816,1,1,Mrs. Charles Melville (Clara Jennings Gregg) Hays,female,52.0,1,1,93.5


In [56]:
# Report the dataset dimensions: row count, then column (feature) count

print('Total Rows :', len(df))
print('Total Features :', len(df.columns))

Total Rows : 887
Total Features : 8


### Basic Exploration

In [57]:
# Build a per-column summary table: dtype, missing-value count and percentage,
# number of distinct values, and a small sample of the distinct values.

list_item = []
for col in df.columns:
    column = df[col]
    null_count = column.isna().sum()  # computed once, reused for count and percentage
    list_item.append([
        col,
        column.dtype,
        null_count,
        100 * null_count / len(column),
        column.nunique(),
        column.unique()[:4],  # at most the first four distinct values
    ])

desc_df = pd.DataFrame(
    data=list_item,
    columns='Feature,Data Type,Null Num,Null %,Unique Num,Unique Sample'.split(','),
)
desc_df

Unnamed: 0,Feature,Data Type,Null Num,Null %,Unique Num,Unique Sample
0,Survived,int64,0,0.0,2,"[0, 1]"
1,Pclass,int64,0,0.0,3,"[3, 1, 2]"
2,Name,object,0,0.0,887,"[Mr. Owen Harris Braund, Mrs. John Bradley (Fl..."
3,Sex,object,0,0.0,2,"[male, female]"
4,Age,float64,0,0.0,89,"[22.0, 38.0, 26.0, 35.0]"
5,Siblings/Spouses Aboard,int64,0,0.0,7,"[1, 0, 3, 4]"
6,Parents/Children Aboard,int64,0,0.0,7,"[0, 1, 2, 5]"
7,Fare,float64,0,0.0,248,"[7.25, 71.2833, 7.925, 53.1]"


In [58]:
# Count the rows flagged as duplicates by DataFrame.duplicated()

duplicate_mask = df.duplicated()
print('Total Duplicated Values :', duplicate_mask.sum())

Total Duplicated Values : 0


In [59]:
# Rename the columns to short upper-case labels

# Mapping of original column header -> new column header.
new_columns = dict([
    ('Survived', 'SURVIVED'),
    ('Pclass', 'PCLASS'),
    ('Name', 'NAME'),
    ('Sex', 'GENDER'),
    ('Age', 'AGE'),
    ('Siblings/Spouses Aboard', 'SIBSA'),
    ('Parents/Children Aboard', 'PARCA'),
    ('Fare', 'PRICE'),
])

# rename() returns a new frame, so rebind df to the renamed result.
df = df.rename(new_columns, axis='columns')

In [60]:
# Preview the first 5 rows to confirm the new column names

df.head(n=5)

Unnamed: 0,SURVIVED,PCLASS,NAME,GENDER,AGE,SIBSA,PARCA,PRICE
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.28
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.92
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [61]:
# Absolute frequency of each value of the target column

df.loc[:, 'SURVIVED'].value_counts()

0    545
1    342
Name: SURVIVED, dtype: int64

In [62]:
# Relative frequency of each value of the target column, as a percentage

df.loc[:, 'SURVIVED'].value_counts(normalize=True).mul(100)

0    61.44
1    38.56
Name: SURVIVED, dtype: float64

### Descriptive Statistics

In [63]:
# Summary statistics for the numeric columns, one row per feature

df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
SURVIVED,887.0,0.39,0.49,0.0,0.0,0.0,1.0,1.0
PCLASS,887.0,2.31,0.84,1.0,2.0,3.0,3.0,3.0
AGE,887.0,29.47,14.12,0.42,20.25,28.0,38.0,80.0
SIBSA,887.0,0.53,1.1,0.0,0.0,0.0,1.0,8.0
PARCA,887.0,0.38,0.81,0.0,0.0,0.0,0.0,6.0
PRICE,887.0,32.31,49.78,0.0,7.92,14.45,31.14,512.33


In [64]:
# Summary statistics for the non-numeric columns, one row per feature

df.describe(exclude=[np.number]).transpose()

Unnamed: 0,count,unique,top,freq
NAME,887,887,Mr. Owen Harris Braund,1
GENDER,887,2,male,573


In [65]:
# Write the renamed dataset to a new CSV file
# (index=False leaves the row index out of the file)

df.to_csv(path_or_buf='new_titanic.csv', index=False)