## Data Processing in Python

### Importing the libraries

In [None]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing the dataset

In [None]:
# Load the raw table; "Data.csv" is resolved relative to the working directory.
dataset = pd.read_csv("Data.csv")

# Independent variables (features): every column except the rightmost one.
# to_numpy() is used instead of .values because pandas documents it as the
# recommended accessor and it always yields a NumPy ndarray.
X = dataset.iloc[:, :-1].to_numpy()
# Dependent variable (the value to be predicted): the rightmost column.
y = dataset.iloc[:, -1].to_numpy()

In [None]:
# Inspect the feature matrix as loaded from the CSV.
print(X)

In [None]:
# Inspect the target vector as loaded from the CSV.
print(y)

### Taking care of missing data

In [None]:

from sklearn.impute import SimpleImputer

# Mean imputation: every NaN in columns 1 and 2 is replaced by the mean of
# the non-missing entries in the same column. Fitting and transforming are
# done in one call because both operate on the same slice.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])


In [None]:
# Inspect X after imputation: columns 1 and 2 should no longer contain NaN.
print(X)

### Encoding categorical data

In [None]:
# Encode the categorical column at index 0 (the country, per the original note).
# Plain integer codes would suggest an ordering between categories that the
# model could pick up on, so each category becomes its own 0/1 indicator
# column instead, e.g. [0, 0, 1] and [0, 1, 0].
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# remainder="passthrough" keeps the other columns unchanged after the encoded
# ones. sparse_threshold=0 forces a dense result: with many categories the
# transformer could otherwise return a SciPy sparse matrix, which np.array()
# would wrap in a 0-d object array instead of converting to a 2-D array.
ct = ColumnTransformer(
    transformers=[("encoder", OneHotEncoder(), [0])],
    remainder="passthrough",
    sparse_threshold=0,
)
X = np.array(ct.fit_transform(X))

In [None]:
# Inspect X after one-hot encoding of column 0.
print(X)

### Encoding the dependent variable (the yes/no)

In [None]:

from sklearn.preprocessing import LabelEncoder

# Learn the set of distinct labels in y, then replace each label with its
# integer code. The fitted encoder stays available as `le`.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [None]:
# Inspect the integer-encoded target vector.
print(y)

### Splitting the dataset into the Training set and the Test set

In [None]:

from sklearn.model_selection import train_test_split

# Split before any feature scaling so that scaling statistics are computed
# from the training rows only. 20% of the rows are held out as the test set;
# random_state=1 fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Inspect the training features produced by the split.
print(X_train)

In [None]:
# Inspect the held-out test features produced by the split.
print(X_test)

In [None]:
# Inspect the training labels produced by the split.
print(y_train)

In [None]:
# Inspect the held-out test labels produced by the split.
print(y_test)

### Feature Scaling

In [None]:

# Standardization rescales each column to zero mean and unit variance, so most
# values land roughly between -3 and +3; min-max normalization would map them
# into [0, 1] instead.
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# The one-hot indicator columns come first in X (the encoder is the first
# transformer in `ct`) and must not be scaled: they are already 0/1. Their
# count is read from the fitted encoder rather than hard-coded as 3, so the
# slice stays correct if the number of categories changes.
n_onehot_cols = len(ct.named_transformers_["encoder"].categories_[0])
# Fit on the training rows only, then reuse the same mean/std for the test rows
# so no information from the test set leaks into the scaling.
X_train[:, n_onehot_cols:] = sc.fit_transform(X_train[:, n_onehot_cols:])
X_test[:, n_onehot_cols:] = sc.transform(X_test[:, n_onehot_cols:])


In [None]:
# Inspect the training features after scaling the columns from index 3 onward.
print(X_train)

In [None]:
# Inspect the test features after applying the scaler fitted on the training set.
print(X_test)