<a href="https://colab.research.google.com/github/ergonrizky26/habis-kerja-data-science-practice/blob/DS001/%5BModul_5_Habis_Kerja%5D_python_data_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Modul 5 - Membahas mengenai Modelling dengan Python
Berikut ini adalah daftar isi dari Notebook ini :
1. Data Preparation
2. Finding Data Insight
3. Data Manipulation
4. Modelling Preparation
5. Modelling
6. Modelling Evaluation

## 1. Data Preparation

### Import Packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# import sklearn libraries
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans

### Data Overview

In [2]:
data_path = os.path.join("drive","MyDrive","Titanic", "train.csv")
print(data_path)

drive/MyDrive/Titanic/train.csv


In [3]:
data = pd.read_csv(data_path)

In [4]:
# Check jumlah baris (records) dan jumlah kolom
data.shape # Outputnya adalah (baris, kolom)

(891, 12)

**Keterangan data : **
1. PassengerId : id dari records/penumpang
2. Survived : Catatan bahwa dia selamat atau meninggal(1=selamat/0=meinggal)
3. Pclass : Kelas dari penumpang
4. Name : Nama penumpang
5. Sex : Jenis kelamin penumpang
6. Age : Usia penumpang
7. SibSp : Jumlah Siblings/Spouse (saudara atau pasangan)
8. Parch : Jumlah Parent/Children (Orangtua atau anak)
9. Ticket : Nomor Tiket
10. Fare : Harga Tiket
11. Cabin : Posisi duduk/kabin penumpang
12. Embarked : Pelabuhan tempat penumpang berangkat

In [5]:
# Melihat overview data (top 5 teratas)
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Melihat overview data (top 5 terbawah)
data.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [7]:
# Melihat overview data (random 5 data)
data.sample(5, random_state=91) # random_state agar ketika di run datanya tidak berubah-ubah

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
535,536,1,2,"Hart, Miss. Eva Miriam",female,7.0,0,2,F.C.C. 13529,26.25,,S
561,562,0,3,"Sivic, Mr. Husein",male,40.0,0,0,349251,7.8958,,S
377,378,0,1,"Widener, Mr. Harry Elkins",male,27.0,0,2,113503,211.5,C82,C
636,637,0,3,"Leinonen, Mr. Antti Gustaf",male,32.0,0,0,STON/O 2. 3101292,7.925,,S
34,35,0,1,"Meyer, Mr. Edgar Joseph",male,28.0,1,0,PC 17604,82.1708,,C


In [8]:
# Melihat tipe data dari masing-masing kolom dan jumlah baris/records yang kosong
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### Data Quality Check

In [9]:
# Check nilai-nilai yang ada di suatu kolom
# data['nama_kolom'].unique()
data["Embarked"].unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [10]:
# Melihat persebaran data (min, max, quantile)
# data.describe() -> untuk data numeric
data.describe(include=['O']) # untuk data categoric

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Braund, Mr. Owen Harris",male,347082,B96 B98,S
freq,1,577,7,4,644


In [11]:
# Melihat apakah ada kolom yang nilainya kosong
data.isnull().sum() # Melihat jumlah variabel yang kosong

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [12]:
data.isna().mean() * 100 # Persentase baris yang kosong dari keseluruhan

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64

### Handling Missing values

In [13]:
# Drop records atau baris
data_dropped = data.dropna()
data_dropped.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [14]:
data.shape, data_dropped.shape

((891, 12), (183, 12))

In [15]:
# Drop variable atau kolom
data_dropped = data.drop(columns=['Cabin'], axis=1)
data_dropped.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64

In [16]:
data.shape, data_dropped.shape

((891, 12), (891, 11))

In [17]:
# Mengisi records atau baris
# Data_dropped = data_dropped.fillna(0)
data_dropped['Embarked_filled'] = data_dropped['Embarked'].fillna('Kosong') # Atau missing

In [18]:
data_dropped.loc[data_dropped['Embarked'].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Embarked_filled
61,62,1,1,"Icard, Miss. Amelie",female,38.0,0,0,113572,80.0,,Kosong
829,830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62.0,0,0,113572,80.0,,Kosong


In [19]:
data_dropped['Age'] = data_dropped['Age'].fillna(method='ffill')
data_dropped['Age'] = data_dropped['Age'].fillna(method='bfill')
data_dropped['Age'] = data_dropped['Age'].fillna(data_dropped['Age'].mean())

data_dropped['Embarked'] = data_dropped['Embarked'].fillna(data_dropped['Embarked'].value_counts().idxmax())

In [20]:
data_dropped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PassengerId      891 non-null    int64  
 1   Survived         891 non-null    int64  
 2   Pclass           891 non-null    int64  
 3   Name             891 non-null    object 
 4   Sex              891 non-null    object 
 5   Age              891 non-null    float64
 6   SibSp            891 non-null    int64  
 7   Parch            891 non-null    int64  
 8   Ticket           891 non-null    object 
 9   Fare             891 non-null    float64
 10  Embarked         891 non-null    object 
 11  Embarked_filled  891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


### Handling Duplicated Values

In [21]:
data = data.append(data.loc[1], ignore_index=True)

  data = data.append(data.loc[1], ignore_index=True)


In [22]:
# Check baris yang duplikat
data.loc[data['Name'].duplicated()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
891,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [23]:
# Membuang baris duplikat semuanya
data.drop_duplicates(keep=False)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [24]:
# Membuang baris duplikat semua, kecuali paling atas
data.drop_duplicates(keep='first')

# Membuang baris duplikat semua, kecuali paling bawah
data.drop_duplicates(keep='last')

# Membuang baris duplikat berdasarkan beberapa kolom
data.drop_duplicates(subset=['Name','Sex'])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


## 2. Finding Data Insight

In [25]:
# aggregat data berdasarkan suatu kolom
# data.groupby('Kelas).mean()[['Nilai]]
data.groupby('Embarked').mean()[['Age']]

  data.groupby('Embarked').mean()[['Age']]


Unnamed: 0_level_0,Age
Embarked,Unnamed: 1_level_1
C,30.869618
Q,28.089286
S,29.445397


In [27]:
# menghitung hubungan antara dua variable
# pd.crosstab(data['Kelas'], data['Ikut Bimbel'])
pd.crosstab(data['Sex'], data['Embarked'], normalize=True) # normalize value data/mengetahui persentasi suatu data
# pd.crosstab(data['Sex'], data['Embarked'])

Embarked,C,Q,S
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.083146,0.040449,0.22809
male,0.106742,0.046067,0.495506


## 3. Data Manipulation