# Data Preprocessing Tools

## Importing the libraries

In [9]:
# Import every package the notebook relies on.
# The "as" keyword binds the imported module to a shorter alias (np, plt, pd).
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [17]:
# Load the raw table into a pandas DataFrame.
dataset = pd.read_csv('Data.csv')

# Separate the table by column position with .iloc[rows, columns].
# A bare ":" selects everything along that axis. In a slice "a:b" the
# start is included and the stop is excluded (1:9 covers positions 1-8);
# a missing start means "from the first position", a missing stop means
# "through the last position".
#   [:, :-1] -> every row, every column except the last (the features)
#   [:, -1]  -> every row, only the last column (the dependent variable)
feature_frame = dataset.iloc[:, :-1]
target_column = dataset.iloc[:, -1]

# .values hands back plain NumPy arrays, which the later cells index directly.
country_data = feature_frame.values
answers = target_column.values

In [18]:
# Sanity check: show the feature matrix right after loading.
# Missing entries are displayed as nan.
print(country_data)

[['France' 44.0 71000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 nan]
 ['France' 35.0 58000.0]
 ['Spain' nan 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


In [19]:
# Show the dependent variable; at this point it still holds 'Yes'/'No' strings.
print(answers)

['No' 'Yes' 'No' 'No' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes']


## Taking care of missing data

In [20]:
# Imports may appear in any cell; this one is placed next to its first use.
from sklearn.impute import SimpleImputer

# Configure the imputer: every np.nan entry is replaced by the mean of
# the column it sits in.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# Column 0 holds the country names, so only columns 1 onward (the numeric
# ones) are imputed. fit_transform learns the column means and returns the
# filled-in copy of that slice in a single call; the result has to be
# written back into country_data because it is not modified in place.
country_data[:, 1:] = imputer.fit_transform(country_data[:, 1:])

In [21]:
# Confirm that the nan entries have been replaced by their column means.
print(country_data)

[['France' 44.0 71000.0]
 ['Spain' 27.0 48000.0]
 ['Germany' 30.0 54000.0]
 ['Spain' 38.0 61000.0]
 ['Germany' 40.0 63666.666666666664]
 ['France' 35.0 58000.0]
 ['Spain' 38.77777777777778 52000.0]
 ['France' 48.0 79000.0]
 ['Germany' 50.0 83000.0]
 ['France' 37.0 67000.0]]


## Encoding categorical data

### Encoding the Independent Variable

In [22]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Build a ColumnTransformer named ct.
#   transformers: a list of (name, transformer, column indexes) tuples; here
#       a OneHotEncoder labelled 'encoder' is applied to column 0 (country).
#   remainder='passthrough': keep every other column unchanged and append it
#       after the encoded columns.
#   sparse_threshold=0: always return a dense array. With the default (0.3)
#       the result turns into a SciPy sparse matrix once the country column
#       has many distinct values, and np.array() would then wrap that matrix
#       in a 0-d object array instead of producing a 2-D table.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)

# Replace the feature matrix with its encoded version.
country_data = np.array(ct.fit_transform(country_data))

In [23]:
# The first three columns are the one-hot encoding of the country:
#   France  = [1, 0, 0]
#   Spain   = [0, 0, 1]
#   Germany = [0, 1, 0]
# The remaining two columns are the untouched numeric features.

print(country_data)

[[1.0 0.0 0.0 44.0 71000.0]
 [0.0 0.0 1.0 27.0 48000.0]
 [0.0 1.0 0.0 30.0 54000.0]
 [0.0 0.0 1.0 38.0 61000.0]
 [0.0 1.0 0.0 40.0 63666.666666666664]
 [1.0 0.0 0.0 35.0 58000.0]
 [0.0 0.0 1.0 38.77777777777778 52000.0]
 [1.0 0.0 0.0 48.0 79000.0]
 [0.0 1.0 0.0 50.0 83000.0]
 [1.0 0.0 0.0 37.0 67000.0]]


### Encoding the Dependent Variable

In [24]:
from sklearn.preprocessing import LabelEncoder

# LabelEncoder is created without arguments: it encodes the single array it
# is given, so there are no columns to select.
le = LabelEncoder()
# Learn the distinct labels and replace each one with its integer code.
answers = le.fit_transform(answers)

In [25]:
# Encoded dependent variable:
#   'No'  -> 0
#   'Yes' -> 1
print(answers)

[0 1 0 0 1 1 0 1 0 1]


## Splitting the dataset into the Training set and Test set

## Feature Scaling