# Day 1: Data Preprocessing
Updated to current scikit-learn APIs and runnable in VS Code.

## Setup in VS Code

1) Install Python 3.x and ensure it is on PATH.
2) Create and activate a venv in the repo root:

```powershell
python -m venv .venv
.\.venv\Scripts\Activate.ps1
```

3) Install dependencies in the same environment:

```powershell
python -m pip install numpy pandas scikit-learn jupyter ipykernel
```

Then select the `.venv` kernel in VS Code and run the cells.

## Step 1: Importing the libraries

In [9]:
import numpy as np
import pandas as pd


## Step 2: Importing the dataset

In [10]:
from pathlib import Path

# Locate datasets/Data.csv by checking the current working directory first and
# then each of its ancestors, so the notebook also runs from a subfolder.
cwd = Path.cwd()
data_path = None
for parent in [cwd, *cwd.parents]:
    candidate = parent / "datasets" / "Data.csv"
    if candidate.exists():
        data_path = candidate
        break
if data_path is None:
    # Fail early with a clear message instead of letting read_csv fail later.
    raise FileNotFoundError("Could not find datasets/Data.csv. Run the notebook from the repo root.")

dataset = pd.read_csv(data_path)

# Features are every column except the last; the target is the last column.
# The target is indexed with -1 (not a fixed position) so that X and Y stay
# consistent with each other if the number of columns in the CSV changes.
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, -1].values
X, Y


(array([['France', 44.0, 72000.0],
        ['Spain', 27.0, 48000.0],
        ['Germany', 30.0, 54000.0],
        ['Spain', 38.0, 61000.0],
        ['Germany', 40.0, nan],
        ['France', 35.0, 58000.0],
        ['Spain', nan, 52000.0],
        ['France', 48.0, 79000.0],
        ['Germany', 50.0, 83000.0],
        ['France', 37.0, 67000.0]], dtype=object),
 array(['No', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes'],
       dtype=object))

## Step 3: Handling the missing data

In [11]:
from sklearn.impute import SimpleImputer

# Columns 1 and 2 are the numeric columns that contain NaN in the output above;
# column 0 holds strings and is left alone.
numeric_cols = slice(1, 3)

# Replace each missing value with the mean of its column.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X[:, numeric_cols])
X[:, numeric_cols] = imputer.transform(X[:, numeric_cols])
X


array([['France', 44.0, 72000.0],
       ['Spain', 27.0, 48000.0],
       ['Germany', 30.0, 54000.0],
       ['Spain', 38.0, 61000.0],
       ['Germany', 40.0, 63777.77777777778],
       ['France', 35.0, 58000.0],
       ['Spain', 38.77777777777778, 52000.0],
       ['France', 48.0, 79000.0],
       ['Germany', 50.0, 83000.0],
       ['France', 37.0, 67000.0]], dtype=object)

## Step 4: Encoding categorical data

In [12]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode column 0 and pass every other column through unchanged.
# NOTE: X and Y are overwritten below, so re-run the notebook from Step 2
# before running this cell a second time.
country_step = ('country', OneHotEncoder(), [0])
ct = ColumnTransformer(transformers=[country_step], remainder='passthrough')
X = ct.fit_transform(X)

# Convert the string target labels into integer codes.
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)

X, Y


(array([[1.0, 0.0, 0.0, 44.0, 72000.0],
        [0.0, 0.0, 1.0, 27.0, 48000.0],
        [0.0, 1.0, 0.0, 30.0, 54000.0],
        [0.0, 0.0, 1.0, 38.0, 61000.0],
        [0.0, 1.0, 0.0, 40.0, 63777.77777777778],
        [1.0, 0.0, 0.0, 35.0, 58000.0],
        [0.0, 0.0, 1.0, 38.77777777777778, 52000.0],
        [1.0, 0.0, 0.0, 48.0, 79000.0],
        [0.0, 1.0, 0.0, 50.0, 83000.0],
        [1.0, 0.0, 0.0, 37.0, 67000.0]], dtype=object),
 array([0, 1, 0, 0, 1, 1, 0, 1, 0, 1]))

## Step 5: Splitting the dataset into a training set and a test set

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
split_parts = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train, X_test, Y_train, Y_test = split_parts

# Show the shape of each part as a sanity check.
tuple(part.shape for part in split_parts)


((8, 5), (2, 5), (8,), (2,))

## Step 6: Feature Scaling

In [14]:
from sklearn.preprocessing import StandardScaler

# with_mean=False: each feature is divided by its standard deviation but is
# not centered around zero.
sc_X = StandardScaler(with_mean=False)

# Learn the scaling from the training rows only, then apply the same
# scaling to both the training and the test rows.
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

X_train, X_test


(array([[0.        , 3.02371578, 0.        , 6.88758737, 6.68891171],
        [2.        , 0.        , 0.        , 6.37101832, 7.02685324],
        [0.        , 0.        , 2.06559112, 4.64912147, 5.03416351],
        [0.        , 0.        , 2.06559112, 6.67713331, 5.45367714],
        [2.        , 0.        , 0.        , 8.26510484, 8.28539412],
        [0.        , 0.        , 2.06559112, 6.543208  , 6.3975828 ],
        [2.        , 0.        , 0.        , 7.57634611, 7.55124527],
        [2.        , 0.        , 0.        , 6.02663895, 6.08294758]]),
 array([[0.        , 3.02371578, 0.        , 5.16569053, 5.66343395],
        [0.        , 3.02371578, 0.        , 8.60948421, 8.70490774]]))