# Census without Modin and Intel® oneAPI Data Analytics Library (oneDAL) Accelerated Scikit-Learn 

In [1]:
# =============================================================
# Copyright © 2020 Intel Corporation
# 
# SPDX-License-Identifier: MIT
# =============================================================

In this example we build an end-to-end machine learning workload on US census data from 1970 to 2010.
It uses pandas for ETL and Ridge Regression from the stock scikit-learn library
to train a model and predict US total income from education information.

Let's start by downloading census data to your local disk.

In [2]:
!wget https://storage.googleapis.com/intel-optimized-tensorflow/datasets/ipums_education2income_1970-2010.csv.gz

--2021-04-20 13:17:40--  https://storage.googleapis.com/intel-optimized-tensorflow/datasets/ipums_education2income_1970-2010.csv.gz
Resolving proxy-iind.intel.com (proxy-iind.intel.com)... 10.224.224.80
Connecting to proxy-iind.intel.com (proxy-iind.intel.com)|10.224.224.80|:912... connected.
Proxy request sent, awaiting response... 200 OK
Length: 379658563 (362M) [text/csv]
Saving to: ‘ipums_education2income_1970-2010.csv.gz.6’


2021-04-20 13:17:48 (52.9 MB/s) - ‘ipums_education2income_1970-2010.csv.gz.6’ saved [379658563/379658563]



### Without DAAL-accelerated Scikit-Learn, using the stock Python kernel

Import required modules

In [1]:
import os
import numpy as np
import pandas as pd

from sklearn import config_context
from sklearn.metrics import mean_squared_error, r2_score

Load sklearn and import packages from sklearn

In [2]:
import sklearn
from sklearn.model_selection import train_test_split
import sklearn.linear_model as lm2

Read the data from the downloaded archive file

In [3]:
# Gzip-compressed CSV archive fetched by the download cell above.
CENSUS_ARCHIVE = "ipums_education2income_1970-2010.csv.gz"

# Load the whole archive into a pandas DataFrame.
df = pd.read_csv(CENSUS_ARCHIVE, compression="gzip")

ETL

In [4]:
# ETL: keep the modelling columns, drop invalid samples, adjust income for
# inflation and store every column as float64.
#
# The steps form a single method chain so that each one returns a new frame.
# Assigning columns onto the result of `query` (as a sequence of separate
# statements would) is the chained-assignment pattern that can trigger pandas'
# SettingWithCopyWarning.
#
# NOTE: `df` is rebound to the cleaned frame. Re-running this cell without
# re-running the load cell first would apply the CPI99 adjustment a second time.

# clean up features
keep_cols = [
    "YEAR", "DATANUM", "SERIAL", "CBSERIAL", "HHWT",
    "CPI99", "GQ", "PERNUM", "SEX", "AGE",
    "INCTOT", "EDUC", "EDUCD", "EDUC_HEAD", "EDUC_POP",
    "EDUC_MOM", "EDUCD_MOM2", "EDUCD_POP2", "INCTOT_MOM", "INCTOT_POP",
    "INCTOT_MOM2", "INCTOT_POP2", "INCTOT_HEAD", "SEX_HEAD",
]

df = (
    df[keep_cols]
    # clean up samples with invalid income, education, etc.
    .query("INCTOT != 9999999")
    .query("EDUC != -1")
    .query("EDUCD != -1")
    # normalize income for inflation
    .assign(INCTOT=lambda frame: frame["INCTOT"] * frame["CPI99"])
    # remaining missing values become -1, then every kept column is cast to
    # float64 in one frame-wide pass instead of a per-column Python loop
    .fillna(-1)
    .astype("float64")
)

# target: inflation-adjusted total income
y = df["INCTOT"]
# features: everything except the target and the CPI99 factor used to build it
X = df.drop(columns=["INCTOT", "CPI99"])

Train the model and predict the income

In [5]:
# ML - training and inference
import time
time_start = time.time()
clf = lm2.Ridge()

mse_values, cod_values = [], []
N_RUNS = 50
TRAIN_SIZE = 0.9
random_state = 777

X = np.ascontiguousarray(X, dtype=np.float64)
y = np.ascontiguousarray(y, dtype=np.float64)

# cross validation
for i in range(N_RUNS):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=TRAIN_SIZE,
                                                        random_state=random_state)
    random_state += 777

    # training
    with config_context(assume_finite=True):
        model = clf.fit(X_train, y_train)

    # inference
    y_pred = model.predict(X_test)

    mse_values.append(mean_squared_error(y_test, y_pred))
    cod_values.append(r2_score(y_test, y_pred))
time_nodaal = time.time()-time_start
time_start = time.time()        
print(time_nodaal)      
%store time_nodaal

358.6722595691681
Stored 'time_nodaal' (float)


In [6]:
def _mean_and_sample_deviation(values):
    """Return (mean, sample standard deviation) of a sequence of numbers.

    The deviation uses the n - 1 denominator, so `values` must hold at least
    two entries.
    """
    count = len(values)
    mean = sum(values) / count
    squared_error_total = sum([(value - mean) ** 2 for value in values])
    deviation = (squared_error_total / (count - 1)) ** 0.5
    return mean, deviation


# aggregate the per-run scores collected by the training cell
mean_mse, mse_dev = _mean_and_sample_deviation(mse_values)
mean_cod, cod_dev = _mean_and_sample_deviation(cod_values)

print("mean MSE ± deviation: {:.9f} ± {:.9f}".format(mean_mse, mse_dev))
print("mean COD ± deviation: {:.9f} ± {:.9f}".format(mean_cod, cod_dev))


mean MSE ± deviation: 0.032564569 ± 0.000041799
mean COD ± deviation: 0.995367533 ± 0.000005869
