## Import Package

Import Package yang diperlukan untuk : \
Data Split \
Data Transforming \
Data Cleaning \
Encoding

In [323]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

## Membaca Dataset

In [324]:
df = pd.read_csv('vgsales.csv')

df.head()

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1.0,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2.0,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3.0,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4.0,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.0
4,5.0,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.0,31.37


## Data Split

In [325]:
df.shape

(16604, 11)

In [326]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [327]:

print("Dimensi X_train : ", X_train.shape)
print("Dimensi X_test  : ", X_test.shape)
print("Dimensi y_train : ", y_train.shape)
print("Dimensi y_test  : ", y_test.shape)


Dimensi X_train :  (11622, 10)
Dimensi X_test  :  (4982, 10)
Dimensi y_train :  (11622,)
Dimensi y_test  :  (4982,)


## Membuat Copy Dataset

In [328]:
dnormalisasi = df.copy() 
dstandarisasi = df.copy() 


## Normalisasi

In [329]:
min_max_scaler = MinMaxScaler()

In [330]:
normalisasi = min_max_scaler.fit_transform(dnormalisasi[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']])

hasil_normalisasi = pd.DataFrame(normalisasi)
hasil_normalisasi.rename(columns={0 : 'NA_Sales', 1 : 'EU_Sales', 2 : 'JP_Sales', 3 : 'Global_Sales', 4 : 'Other_Sales'}, inplace=True)

hasil_normalisasi.head()

Unnamed: 0,NA_Sales,EU_Sales,JP_Sales,Global_Sales,Other_Sales
0,1.0,1.0,0.368885,0.800378,1.0
1,0.700892,0.123363,0.666341,0.072848,0.486281
2,0.38202,0.443832,0.370841,0.31315,0.432854
3,0.37961,0.379394,0.320939,0.280038,0.398767
4,0.271632,0.30634,1.0,0.094607,0.379064


## Standarisasi

In [331]:
standar_scaler = StandardScaler()

In [332]:
standarisasi = standar_scaler.fit_transform(dstandarisasi[['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']])

np.std(standarisasi)

hasil_standarisasi = pd.DataFrame(standarisasi)
hasil_standarisasi.rename(columns={0 : 'NA_Sales', 1 : 'EU_Sales', 2 : 'JP_Sales', 3 : 'Global_Sales', 4 : 'Other_Sales'}, inplace=True)

hasil_standarisasi.head()

Unnamed: 0,NA_Sales,EU_Sales,JP_Sales,Global_Sales,Other_Sales
0,50.482317,55.615147,11.94017,44.513897,52.646713
1,35.285606,6.609385,21.771072,3.819155,25.426638
2,19.08476,24.524227,12.004847,17.260592,22.595751
3,18.962305,20.921995,10.355583,15.408425,20.789619
4,13.476304,16.838182,32.798499,5.036293,19.745649


## Data Cleaning

In [333]:
dcleaning = df.copy()

Menampilkan nilai NULL pada dataset

In [334]:
dcleaning.isna().sum()

Rank              3
Name              0
Platform          0
Year            271
Genre             0
Publisher        58
NA_Sales          0
EU_Sales          0
JP_Sales          0
Other_Sales       0
Global_Sales      0
dtype: int64

Tipe Data Attribute di Dataset

In [335]:
dcleaning.dtypes

Rank            float64
Name             object
Platform         object
Year            float64
Genre            object
Publisher        object
NA_Sales        float64
EU_Sales        float64
JP_Sales        float64
Other_Sales     float64
Global_Sales    float64
dtype: object

Menghapus Nilai NULL pada Dataset

In [336]:
impute_mean = SimpleImputer(strategy='mean')
impute_median = SimpleImputer(strategy='median')
impute_modus = SimpleImputer(strategy='most_frequent' )

In [342]:
dcleaning['Rank'] = impute_median.fit_transform(dcleaning[['Rank']])
dcleaning['Year'] = impute_mean.fit_transform(dcleaning[['Year']])
dcleaning['Publisher'] = impute_modus.fit_transform(dcleaning[['Publisher']])

print('Jumlah Nilai Null Setelah Dihapus :')
dcleaning.isna().sum()

Jumlah Nilai Null Setelah Dihapus :


Rank            0
Name            0
Platform        0
Year            0
Genre           0
Publisher       0
NA_Sales        0
EU_Sales        0
JP_Sales        0
Other_Sales     0
Global_Sales    0
dtype: int64

Menghapus Data Duplikat pada Dataset

In [343]:
print('Jumlah Data Duplikat Sebelum Dihapus :',dcleaning.duplicated().sum())

Jumlah Data Duplikat Sebelum Dihapus : 5


In [344]:
dcleaning.drop_duplicates(inplace=True)
print('Jumlah Data Duplikat Setelah Dihapus :',dcleaning.duplicated().sum())

Jumlah Data Duplikat Setelah Dihapus : 0


Mengubah Tipe Data pada Salah Satu Attribute

In [345]:
print('Sebelum Tipe Data Year Diubah :')
dcleaning["Year"].dtypes

Sebelum Tipe Data Year Diubah :


dtype('float64')

In [347]:
print('Setelah Tipe Data Year Diubah :')
dcleaning['Year'] = dcleaning['Year'].astype('int64')
dcleaning["Year"].dtypes

Setelah Tipe Data Year Diubah :


dtype('int64')

## Encoding

Menggunakan Metode One Hot Encoding

In [None]:
encoder = OneHotEncoder(sparse=False)

genreEnc = encoder.fit_transform(df[['Genre']])
genreEnc = pd.DataFrame(genreEnc)

dhot = df.join(genreEnc, how='left', lsuffix='left' )

#fungsi dibawah tidak menampilkan attribute 
dhot.loc[:, ~dhot.columns.isin(["Platform","NA_Sales","EU_Sales","JP_Sales","Other_Sales","Global_Sales","Publisher","Rank"])]

Unnamed: 0,Name,Year,Genre,0,1,2,3,4,5,6,7,8,9,10,11
0,Wii Sports,2006.0,Sports,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,Super Mario Bros.,1985.0,Platform,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Mario Kart Wii,2008.0,Racing,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,Wii Sports Resort,2009.0,Sports,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,Pokemon Red/Pokemon Blue,1996.0,Role-Playing,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16599,Woody Woodpecker in Crazy Castle 5,2002.0,Platform,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16600,Men in Black II: Alien Escape,2003.0,Shooter,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
16601,SCORE International Baja 1000: The Official Game,2008.0,Racing,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
16602,Know How 2,2010.0,Puzzle,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
