# Data Preprocessing

#### Import Library yang digunakan :

In [18]:
import pandas as pd
import sklearn as sk
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

#### Read dataset :

In [19]:
df = pd.read_csv("msft.csv")

df

Unnamed: 0,Name,Rating,No of people Rated,Category,Date,Price
0,Dynamic Reader,3.5,268,Books,07-01-2014,Free
1,"Chemistry, Organic Chemistry and Biochemistry-...",3.0,627,Books,08-01-2014,Free
2,BookViewer,3.5,593,Books,29-02-2016,Free
3,Brick Instructions,3.5,684,Books,30-01-2018,Free
4,Introduction to Python Programming by GoLearni...,2.0,634,Books,30-01-2018,Free
...,...,...,...,...,...,...
5318,MQTTSniffer,2.5,500,Developer Tools,10-04-2017,₹ 64.00
5319,"Dev Utils - JSON, CSV and XML",4.0,862,Developer Tools,18-11-2019,₹ 269.00
5320,Simply Text,4.0,386,Developer Tools,23-01-2014,₹ 219.00
5321,,,948,,,


#### Bagi dataset menjadi training set dan testing set dengan proporsi 70:30

In [20]:
data = df[['Rating','No of people Rated']]
label = df['Category']

data_train, data_test, label_train, label_test = train_test_split(data, label, test_size=.3)

#### Lakukan ***Data cleaning*** pada data dengan ***nilai null***
Ganti nilai null sesuai ketentuan. ***(bilangan bulat : median/modus, bilangan desimal : mean, tulisan : modus)***

***Cek Nilai Null***

In [21]:
df.isna().sum()

Name                  1
Rating                1
No of people Rated    0
Category              1
Date                  1
Price                 1
dtype: int64

In [22]:
sim  = SimpleImputer(strategy='mean')
simf = SimpleImputer(strategy='most_frequent')

In [23]:
df['Name'] = simf.fit_transform(df[['Name']])
df['Rating'] = sim.fit_transform(df[['Rating']])
df['Category'] = simf.fit_transform(df[['Category']])
df['Date'] = simf.fit_transform(df[['Date']])
df['Price'] = simf.fit_transform(df[['Price']])

df.isna().sum()

Name                  0
Rating                0
No of people Rated    0
Category              0
Date                  0
Price                 0
dtype: int64

#### Mempersiapkan Copy Dataset :

In [24]:
normalisasi = df.copy()
standarisasi = df.copy()

#### Lakukan ***normalisasi*** data pada salah satu attribute menggunakan ***Min Max scaler***

In [25]:
scaler = MinMaxScaler()

normalize_dataset = MinMaxScaler().fit_transform(normalisasi[['Rating','No of people Rated']])
normalize_dataset = pd.DataFrame(normalize_dataset)
normalize_dataset.rename(columns={0:'Rating',1:'No of People Rated'},inplace=True)
normalize_dataset.head(10)

Unnamed: 0,Rating,No of People Rated
0,0.625,0.186667
1,0.5,0.585556
2,0.625,0.547778
3,0.625,0.648889
4,0.25,0.593333
5,0.875,0.332222
6,0.875,0.517778
7,0.5,0.083333
8,0.75,0.4
9,0.625,0.292222


#### Lakukan ***standarisasi*** pada dataset 

In [26]:
std = StandardScaler()

std_dataset = std.fit_transform(standarisasi[['Rating','No of people Rated']])
print(std_dataset[:6])
print(f'Nilai standar deviasi: {np.std(std_dataset)}')

[[-0.2952848  -1.09261943]
 [-0.79961912  0.28931265]
 [-0.2952848   0.15843329]
 [-0.2952848   0.50872805]
 [-1.80828775  0.3162584 ]
 [ 0.71338383 -0.58834895]]
Nilai standar deviasi: 1.0


#### Lakukan ***Data cleaning*** pada data dengan ***nilai duplikat**

In [27]:
df.duplicated().sum()

1

In [29]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

### ***Ganti tipe data*** salah satu atribute angka

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5322 entries, 0 to 5321
Data columns (total 6 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Name                5322 non-null   object 
 1   Rating              5322 non-null   float64
 2   No of people Rated  5322 non-null   int64  
 3   Category            5322 non-null   object 
 4   Date                5322 non-null   object 
 5   Price               5322 non-null   object 
dtypes: float64(1), int64(1), object(4)
memory usage: 291.0+ KB


In [31]:
df['No of people Rated'] = df['No of people Rated'].astype('float64')

df.head(10)

Unnamed: 0,Name,Rating,No of people Rated,Category,Date,Price
0,Dynamic Reader,3.5,268.0,Books,07-01-2014,Free
1,"Chemistry, Organic Chemistry and Biochemistry-...",3.0,627.0,Books,08-01-2014,Free
2,BookViewer,3.5,593.0,Books,29-02-2016,Free
3,Brick Instructions,3.5,684.0,Books,30-01-2018,Free
4,Introduction to Python Programming by GoLearni...,2.0,634.0,Books,30-01-2018,Free
5,Gurbani Reader,4.5,399.0,Books,18-01-2017,Free
6,NFO Viewer,4.5,566.0,Books,13-11-2012,Free
7,Text to Speech TTS,3.0,175.0,Books,17-10-2013,Free
8,ACK Comics,4.0,460.0,Books,20-12-2012,Free
9,Learn Biology and Human Body Anatomy by GoLear...,3.5,363.0,Books,25-11-2013,Free


### Lakukan ***one hot encoding*** pada dataset yang kalian miliki

In [34]:
temp_dataset = pd.get_dummies(df[['Category']])
temp_dataset = pd.DataFrame(temp_dataset)

df.drop('Category', axis=1, inplace=True)
df = df.join(temp_dataset)
df.head(10)

KeyError: "None of [Index(['Category'], dtype='object')] are in the [columns]"