In [1]:
import pandas as pd

In [2]:
# Load the adult census dataset from a CSV file; the path is relative to the
# directory the notebook is run from.
adult_census = pd.read_csv("datasets/adult-census.csv")

In [3]:
# Remove the "education-num" column (presumably a numeric duplicate of
# "education" -- TODO confirm against the dataset description).
# errors="ignore" keeps this cell idempotent: re-running it after the column
# is already gone no longer raises a KeyError.
adult_census = adult_census.drop(columns="education-num", errors="ignore")

In [4]:
# Preview the first rows to check which columns remain after the drop.
adult_census.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,Some-college,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [6]:
# "class" is the column to predict; every other column is kept as a feature.
target = adult_census["class"]
data = adult_census.drop(columns=["class"])

In [7]:
# The feature frame should no longer contain the "class" column.
data.head()

Unnamed: 0,age,workclass,education,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,25,Private,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States
1,38,Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States
2,28,Local-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States
3,44,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States
4,18,?,Some-college,Never-married,?,Own-child,White,Female,0,0,30,United-States


In [8]:
# The target is a Series of string labels ("<=50K" / ">50K" in the rows shown).
target.head()

0     <=50K
1     <=50K
2      >50K
3      >50K
4     <=50K
Name: class, dtype: object

In [9]:
# Names of the numeric feature columns used by the models in this notebook.
numerical_columns = [
    "age",
    "capital-gain",
    "capital-loss",
    "hours-per-week",
]

In [10]:
# Preview only the numeric columns selected above.
data[numerical_columns].head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week
0,25,0,0,40
1,38,0,0,50
2,28,0,0,40
3,44,7688,0,40
4,18,0,0,30


In [11]:
from sklearn.model_selection import train_test_split

In [14]:
# Restrict the features to the numeric columns; all models below are trained
# on this subset only.
data_numeric = data[numerical_columns]

In [15]:
# Hold out a quarter of the rows as a test set; the fixed seed makes the
# split reproducible from one run to the next.
data_train, data_test, target_train, target_test = train_test_split(
    data_numeric,
    target,
    test_size=0.25,
    random_state=42,
)

In [16]:
# (rows, columns) of the training split.
data_train.shape

(36631, 4)

In [17]:
# (rows, columns) of the held-out test split.
data_test.shape

(12211, 4)

## Fitting a logistic regression model

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
# Logistic regression classifier with scikit-learn's default hyperparameters.
model = LogisticRegression()

In [21]:
# Train on the training split only; the test split stays unseen by the model.
model.fit(data_train, target_train)

In [22]:
# Evaluate on the held-out split; for a classifier, score() returns the mean
# accuracy of the predictions.
accuracy = model.score(data_test, target_test)

In [23]:
# Display the test-set score computed in the previous cell.
accuracy

0.8070592089099992

## Scaling numerical features

In [24]:
from sklearn.preprocessing import StandardScaler

In [25]:
# Scaler with default settings: centers each feature and scales it to unit
# variance.
scaler = StandardScaler()

In [26]:
# Learn the scaling statistics from the training split only, so nothing from
# the test split influences the preprocessing.
scaler.fit(data_train)

In [27]:
# Apply the fitted scaling to the training split.
# NOTE(review): data_train_scaled is not used by any later cell in this
# notebook; it appears to be kept for illustration only.
data_train_scaled = scaler.transform(data_train)

## Machine learning pipeline

In [28]:
from sklearn.pipeline import make_pipeline

In [29]:
# Chain scaling and classification into a single estimator.
# NOTE: this rebinds `model`, replacing the plain LogisticRegression fitted
# earlier with a new, unfitted pipeline.
model = make_pipeline(StandardScaler(), LogisticRegression())

In [30]:
# Display the pipeline to inspect its steps.
model

## Cross-validation

In [31]:
from sklearn.model_selection import cross_validate

In [32]:
# 5-fold cross-validation of the pipeline on the full numeric dataset.
cv_result = cross_validate(
    estimator=model,
    X=data_numeric,
    y=target,
    cv=5,
)

In [33]:
# Show the per-fold fit times, score times, and test scores.
cv_result

{'fit_time': array([0.04013991, 0.0355022 , 0.03595114, 0.03469896, 0.03459096]),
 'score_time': array([0.0083909 , 0.00807095, 0.0081718 , 0.00822878, 0.00805402]),
 'test_score': array([0.79557785, 0.80049135, 0.79965192, 0.79873055, 0.80456593])}

### Exercise: Recap fitting a scikit-learn model on numerical data [Sven]
#### 1. Why do we need two sets: a train set and a test set?

- a) to train the model faster

- b) to validate the model on unseen data

- c) to improve the accuracy of the model

Select all answers that apply

#### 2. The generalization performance of a scikit-learn model can be evaluated by:

- a) calling fit to train the model on the training set, predict on the test set to get the predictions, and compute the score by passing the predictions and the true target values to some metric function
- b) calling fit to train the model on the training set and score to compute the score on the test set
- c) calling cross_validate by passing the model, the data and the target
- d) calling fit_transform on the data and then score to compute the score on the test set

Select all answers that apply

#### 3. When calling `cross_validate(estimator, X, y, cv=5)`, the following happens:

- a) X and y are internally split five times with non-overlapping test sets
- b) estimator.fit is called 5 times on the full X and y
- c) estimator.fit is called 5 times, each time on a different training set
- d) a Python dictionary is returned with a key/value pair holding a NumPy array of 5 scores computed on the train sets
- e) a Python dictionary is returned with a key/value pair holding a NumPy array of 5 scores computed on the test sets

Select all answers that apply

#### 4. (optional) Scaling
We define a 2-dimensional dataset represented graphically as follows:
![](https://i.imgur.com/muvSbI6.png)

Question

If we process the dataset using a StandardScaler with the default parameters, which of the following results do you expect?

![](https://i.imgur.com/t5mTlVG.png)


- a) Preprocessing A
- b) Preprocessing B
- c) Preprocessing C
- d) Preprocessing D

Select a single answer

#### 5. (optional) Cross-validation allows us to:

- a) train the model faster
- b) measure the generalization performance of the model
- c) reach better generalization performance
- d) estimate the variability of the generalization score

Select all answers that apply