# 1. Missing Value
Sering kali, data rusak, atau hilang, kita perlu mengurusnya terlebih dahulu karena kedepannya data ini tidak berfungsi saat data hilang atau tidak lengkap.

## Imputing missing values dengan Imputer

In [14]:
import pandas as pd
from sklearn.preprocessing import Imputer

df = pd.read_csv('Data.csv')
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [4]:
# melihat jumlah missing value per kolom

df.isnull().sum()

Country      0
Age          1
Salary       1
Purchased    0
dtype: int64

In [5]:
df.dropna()

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [7]:
# drop kolom spesifik yang mengandung NaN 
df.dropna(subset=['Age'])

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [8]:
df.iloc[:, 1:3]

Unnamed: 0,Age,Salary
0,44.0,72000.0
1,27.0,48000.0
2,30.0,54000.0
3,38.0,61000.0
4,40.0,
5,35.0,58000.0
6,,52000.0
7,48.0,79000.0
8,50.0,83000.0
9,37.0,67000.0


In [15]:
# replace every occurrence of missing_values to one defined by strategy
# which can be mean, median, mode. Axis = 0 means rows, 1 means column
# replace with simpleImputer

imputer = Imputer(missing_values='NaN', strategy='most_frequent', axis = 0)
df.iloc[:, 1:3] = imputer.fit_transform(df.iloc[:, 1:3])
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,No
1,Spain,27.0,48000.0,Yes
2,Germany,30.0,54000.0,No
3,Spain,38.0,61000.0,No
4,Germany,40.0,48000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,27.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# 2. Encoding Data Kategori

In [16]:
# Label Encoder will replace every categorical variable with number. Useful for replacing yes by 1, no by 0.
# One Hot Encoder will create a separate column for every variable and give a value of 1 where the variable is present
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [21]:
lable_encoder = LabelEncoder()
temp = df.copy()
temp.iloc[:, 0] = lable_encoder.fit_transform(df.iloc[:, 0])
print(lable_encoder.classes_)
temp.iloc[:, 3] = lable_encoder.fit_transform(df.iloc[:, 3])
print(lable_encoder.classes_)
print(temp)

['France' 'Germany' 'Spain']
['No' 'Yes']
   Country   Age   Salary  Purchased
0        0  44.0  72000.0          0
1        2  27.0  48000.0          1
2        1  30.0  54000.0          0
3        2  38.0  61000.0          0
4        1  40.0  48000.0          1
5        0  35.0  58000.0          1
6        2  27.0  52000.0          0
7        0  48.0  79000.0          1
8        1  50.0  83000.0          0
9        0  37.0  67000.0          1


In [19]:
df.iloc[:, :-1]

Unnamed: 0,Country,Age,Salary
0,France,44.0,72000.0
1,Spain,27.0,48000.0
2,Germany,30.0,54000.0
3,Spain,38.0,61000.0
4,Germany,40.0,48000.0
5,France,35.0,58000.0
6,Spain,27.0,52000.0
7,France,48.0,79000.0
8,Germany,50.0,83000.0
9,France,37.0,67000.0


In [18]:
# you can pass an array of indices of categorical features
# one_hot_encoder = OneHotEncoder(categorical_features=[0])
# temp = df.copy()
# temp.iloc[:, 0] = one_hot_encoder.fit_transform(df.iloc[:, :0])
# temp
# you can achieve the same thing using get_dummies
pd.get_dummies(df.iloc[:, :-1])

Unnamed: 0,Age,Salary,Country_France,Country_Germany,Country_Spain
0,44.0,72000.0,1,0,0
1,27.0,48000.0,0,0,1
2,30.0,54000.0,0,1,0
3,38.0,61000.0,0,0,1
4,40.0,48000.0,0,1,0
5,35.0,58000.0,1,0,0
6,27.0,52000.0,0,0,1
7,48.0,79000.0,1,0,0
8,50.0,83000.0,0,1,0
9,37.0,67000.0,1,0,0


In [27]:
# you can pass an array of indices of categorical features
# one_hot_encoder = OneHotEncoder(categorical_features=[0])
# temp = df.copy()
# temp.iloc[:, 1] = one_hot_encoder.fit_transform(df.iloc[:, :])
# temp

# 3. Binarizing
Mengubah Data menjadi 0 dan 1. Kita akan mencoba dataset lain, yaitu dataset iris yang ada pada library scikit-learn. (https://archive.ics.uci.edu/ml/datasets/iris)

In [28]:
from sklearn.datasets import load_iris

iris_dataset = load_iris()
X = iris_dataset.data
y = iris_dataset.target
feature_names = iris_dataset.feature_names
print(feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [32]:
X[:, 1]
# X[:, 1:2]

array([1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 1., 0.,
       0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       1., 1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0.])

Kita akan mengubah 0 jika dibawah rata-rata, dan 1 jika diatas rata-rata

In [30]:
from sklearn.preprocessing import Binarizer
binarizer_obj = Binarizer(threshold=X[:, 1].mean())
X[:, 1:2] = binarizer_obj.fit_transform(X[:, 1].reshape(-1, 1))
X[:, 1]

array([1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 0., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1., 0., 1., 0.,
       0., 1., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       1., 1., 0., 1., 1., 1., 0., 1., 1., 0., 0., 0., 1., 0.])

# 4. Fitur Scaling

In [33]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler

df = pd.read_csv('Data.csv').dropna()
print(df)
X = df[["Age", "Salary"]].values.astype(np.float64)
print(X)

   Country   Age   Salary Purchased
0   France  44.0  72000.0        No
1    Spain  27.0  48000.0       Yes
2  Germany  30.0  54000.0        No
3    Spain  38.0  61000.0        No
5   France  35.0  58000.0       Yes
7   France  48.0  79000.0       Yes
8  Germany  50.0  83000.0        No
9   France  37.0  67000.0       Yes
[[4.4e+01 7.2e+04]
 [2.7e+01 4.8e+04]
 [3.0e+01 5.4e+04]
 [3.8e+01 6.1e+04]
 [3.5e+01 5.8e+04]
 [4.8e+01 7.9e+04]
 [5.0e+01 8.3e+04]
 [3.7e+01 6.7e+04]]


In [34]:
standard_scaler = StandardScaler()
normalizer = Normalizer()
min_max_scaler = MinMaxScaler()

print("Standardization")
print(standard_scaler.fit_transform(X))

print("Normalizing")
print(normalizer.fit_transform(X))

print("MinMax Scaling")
print(min_max_scaler.fit_transform(X))

Standardization
[[ 0.69985807  0.58989097]
 [-1.51364653 -1.50749915]
 [-1.12302807 -0.98315162]
 [-0.08137885 -0.37141284]
 [-0.47199731 -0.6335866 ]
 [ 1.22068269  1.20162976]
 [ 1.48109499  1.55119478]
 [-0.211585    0.1529347 ]]
Normalizing
[[6.11110997e-04 9.99999813e-01]
 [5.62499911e-04 9.99999842e-01]
 [5.55555470e-04 9.99999846e-01]
 [6.22950699e-04 9.99999806e-01]
 [6.03448166e-04 9.99999818e-01]
 [6.07594825e-04 9.99999815e-01]
 [6.02409529e-04 9.99999819e-01]
 [5.52238722e-04 9.99999848e-01]]
MinMax Scaling
[[0.73913043 0.68571429]
 [0.         0.        ]
 [0.13043478 0.17142857]
 [0.47826087 0.37142857]
 [0.34782609 0.28571429]
 [0.91304348 0.88571429]
 [1.         1.        ]
 [0.43478261 0.54285714]]


# 5. Ekstraksi Fitur
Pada pertemuan sebelumnya kalian telah mencoba membuat program WordCount. WordCount merupakan sebuah teknik dalam melakukan ekstraksi Fitur. Namun, kalian tidak perlu membuat sendiri. Scikit-Learn telah menyediakan librarynya. Ekstraksi Fitur ini nantinya akan berguna dalam pemrosesan klasifikasi, clustering, maupun teknik pembelajaran mesin lainnya.

## 5.1 Count Vectorizer

In [37]:
from sklearn.feature_extraction.text import CountVectorizer

docs = ["Mayur is a nice boy mayur.", "Mayur rock! wohooo!", "My name is Mayur, and I am a Pythonista!"]
cv = CountVectorizer()
X = cv.fit_transform(docs)
print(X)
print(cv.vocabulary_)
print(X.todense())

  (0, 2)	1
  (0, 7)	1
  (0, 3)	1
  (0, 4)	2
  (1, 10)	1
  (1, 9)	1
  (1, 4)	1
  (2, 8)	1
  (2, 0)	1
  (2, 1)	1
  (2, 6)	1
  (2, 5)	1
  (2, 3)	1
  (2, 4)	1
{'mayur': 4, 'is': 3, 'nice': 7, 'boy': 2, 'rock': 9, 'wohooo': 10, 'my': 5, 'name': 6, 'and': 1, 'am': 0, 'pythonista': 8}
[[0 0 1 1 2 0 0 1 0 0 0]
 [0 0 0 0 1 0 0 0 0 1 1]
 [1 1 0 1 1 1 1 0 1 0 0]]


## Dict Vectorizer
DictVectorizer melakukan mapping dari dictionry wordcount ke Vektor

In [38]:
from sklearn.feature_extraction import DictVectorizer

docs = [{"Aku": 1, "suka": 1, "makan": 2}, {"Aku": 1, "tidak": 1, "suka": 2, "makan": 3, "kambing": 1, "bakar": 2, "madu": 3}]
dv = DictVectorizer()
X = dv.fit_transform(docs)
print(X)
print(dv.vocabulary_)
print(X.todense())

  (0, 0)	1.0
  (0, 4)	2.0
  (0, 5)	1.0
  (1, 0)	1.0
  (1, 1)	2.0
  (1, 2)	1.0
  (1, 3)	3.0
  (1, 4)	3.0
  (1, 5)	2.0
  (1, 6)	1.0
{'Aku': 0, 'suka': 5, 'makan': 4, 'tidak': 6, 'kambing': 2, 'bakar': 1, 'madu': 3}
[[1. 0. 0. 0. 2. 1. 0.]
 [1. 2. 1. 3. 3. 2. 1.]]


## TfIdf Vectorizer:
Word Count (Term Frekuensi dikali dengan Inverse Dokumen Frekuensi),

Tutorial dapat dilihat pada link berikut: https://datascience.mipa.ugm.ac.id/id/representasi-teks-dalam-vektor-part-1/ https://datascience.mipa.ugm.ac.id/id/representasi-teks-dalam-vektor-part-2/

kata yang muncul di tiap dokumen nilai vektor kecil, yang jarang muncul di setiap dokumen, nilai vektor besar
untuk membandingkan/mencari dokumen yang berbeda


In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

tfidf_vectorizer = TfidfVectorizer()
cv_vectorizer = CountVectorizer()
docs = ["Mayur is a Guitarist Guitarist", "Mayur is Cooker", "Mayur is Musician", "Mayur is also a programmer"]
X_idf = tfidf_vectorizer.fit_transform(docs)
X_cv = cv_vectorizer.fit_transform(docs)
print(X_idf.todense())
print(tfidf_vectorizer.vocabulary_)
print(X_cv.todense())

[[0.         0.         0.93816752 0.24478737 0.24478737 0.
  0.        ]
 [0.         0.8046125  0.         0.41988018 0.41988018 0.
  0.        ]
 [0.         0.         0.         0.41988018 0.41988018 0.8046125
  0.        ]
 [0.62688384 0.         0.         0.32713399 0.32713399 0.
  0.62688384]]
{'mayur': 4, 'is': 3, 'guitarist': 2, 'cooker': 1, 'musician': 5, 'also': 0, 'programmer': 6}
[[0 0 2 1 1 0 0]
 [0 1 0 1 1 0 0]
 [0 0 0 1 1 1 0]
 [1 0 0 1 1 0 1]]


In [48]:
import requests
import lxml.html as lh

gdelt_base_url = 'http://data.gdeltproject.org/events/'

# get the list of all the links on the gdelt file page
page = requests.get(gdelt_base_url+'index.html')
doc = lh.fromstring(page.content)
link_list = doc.xpath("//*/ul/li/a/@href")

# separate out those links that begin with four digits 
file_list = [x for x in link_list if x[0:6] == '201810']

file_list

['20181027.export.CSV.zip',
 '20181026.export.CSV.zip',
 '20181025.export.CSV.zip',
 '20181024.export.CSV.zip',
 '20181023.export.CSV.zip',
 '20181022.export.CSV.zip',
 '20181021.export.CSV.zip',
 '20181020.export.CSV.zip',
 '20181019.export.CSV.zip',
 '20181018.export.CSV.zip',
 '20181017.export.CSV.zip',
 '20181016.export.CSV.zip',
 '20181015.export.CSV.zip',
 '20181014.export.CSV.zip',
 '20181013.export.CSV.zip',
 '20181012.export.CSV.zip',
 '20181011.export.CSV.zip',
 '20181010.export.CSV.zip',
 '20181009.export.CSV.zip',
 '20181008.export.CSV.zip',
 '20181007.export.CSV.zip',
 '20181006.export.CSV.zip',
 '20181005.export.CSV.zip',
 '20181004.export.CSV.zip',
 '20181003.export.CSV.zip',
 '20181002.export.CSV.zip',
 '20181001.export.CSV.zip']

In [49]:
infilecounter = 0
outfilecounter = 0

# D:\digitalent\GDELT_Data

In [57]:
import os.path
import urllib.request
import zipfile
import glob
import operator

local_path = 'D:/digitalent/GDELT_Data/'

fips_country_code = 'ID'

for compressed_file in file_list[infilecounter:]:
    print(compressed_file)
    
    # if we dont have the compressed file stored locally, go get it. Keep trying if necessary.
    while not os.path.isfile(local_path+compressed_file): 
        print('downloading,')
        urllib.request.urlretrieve(url=gdelt_base_url+compressed_file, filename=local_path+compressed_file)
        
    # extract the contents of the compressed file to a temporary directory    
    print('extracting,')
    z = zipfile.ZipFile(file=local_path+compressed_file, mode='r')    
    z.extractall(path=local_path+'tmp/')
    
    # parse each of the csv files in the working directory, 
    print('parsing,')
    for infile_name in glob.glob(local_path+'tmp/*'):
        outfile_name = local_path+'country/'+fips_country_code+'%04i.tsv'%outfilecounter
        
        # open the infile and outfile
        with open(infile_name, mode='r') as infile, open(outfile_name, mode='w') as outfile:
            for line in infile:
                # extract lines with our interest country code
                if fips_country_code in operator.itemgetter(51, 37, 44)(line.split('\t')):    
                    outfile.write(line)
            outfilecounter +=1
            
        # delete the temporary file
        os.remove(infile_name)
    infilecounter +=1
    print('done')

20181027.export.CSV.zip
downloading,
extracting,
parsing,


UnicodeDecodeError: 'charmap' codec can't decode byte 0x90 in position 3748: character maps to <undefined>