# Installations

In [None]:
!pip install eurostatapiclient

Collecting eurostatapiclient
  Downloading eurostatapiclient-0.2.7-py3-none-any.whl (12 kB)
Installing collected packages: eurostatapiclient
Successfully installed eurostatapiclient-0.2.7


# Libraries

In [None]:
from matplotlib import pyplot as plt
import numpy as np
import scipy
import pandas as pd

import os
from requests import get
from eurostatapiclient import EurostatAPIClient

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# Data

In [None]:
import sys

# True when the notebook runs inside Google Colab (the module is preloaded there).
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
  from google.colab import drive
  drive.mount('/content/gdrive', force_remount=True)
  # Absolute path below the mount point, so the location does not depend on
  # the kernel's current working directory.
  # NOTE: `dir` shadows the builtin of the same name; the name is kept because
  # other cells may refer to it.
  dir = os.path.join('/content', 'gdrive', 'My Drive', 'Eurostat', '02 - Data Science for Structured Data')
else:
  dir = "."

# Folder that every download in this notebook is written to. `makedirs`
# creates it (and any missing parent) and is a no-op when it already exists,
# so no separate shell `mkdir` is required.
data_dir = os.path.join(dir, 'data')
os.makedirs(data_dir, exist_ok=True)

True

In [None]:
def download_save(url, filename):
  """Download `url` and store the response body as `filename` in `data_dir`.

  Parameters
  ----------
  url : str
      Address fetched with an HTTP GET request.
  filename : str
      Name of the file created (or overwritten) inside `data_dir`.

  Returns
  -------
  None
      When the response status is not 200 a message is printed and no file
      is written.
  """
  res = get(url)
  if res.status_code != 200:
    print(f"Couldn't fetch data from {url}")
    return

  # Binary mode keeps the payload byte-for-byte; the context manager closes
  # the file even when the write fails.
  with open(os.path.join(data_dir, filename), 'wb') as out_file:
    out_file.write(res.content)

## Iris - Flower Classification

In [None]:
%%html
<iframe src="https://archive.ics.uci.edu/ml/datasets/iris" width="1100" height="600"></iframe>

In [None]:
# Fetch the raw Iris file and keep a local copy named iris.csv.
download_save(
    url='https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
    filename='iris.csv')

In [None]:
# The file is read without a header row, so the column names are supplied here.
iris_columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class']
iris_path = os.path.join(data_dir, 'iris.csv')

df_iris = pd.read_csv(iris_path, names=iris_columns, header=None)
df_iris.sample(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
121,5.6,2.8,4.9,2.0,Iris-virginica
143,6.8,3.2,5.9,2.3,Iris-virginica
91,6.1,3.0,4.6,1.4,Iris-versicolor
122,7.7,2.8,6.7,2.0,Iris-virginica
53,5.5,2.3,4.0,1.3,Iris-versicolor
30,4.8,3.1,1.6,0.2,Iris-setosa
101,5.8,2.7,5.1,1.9,Iris-virginica
45,4.8,3.0,1.4,0.3,Iris-setosa
108,6.7,2.5,5.8,1.8,Iris-virginica
56,6.3,3.3,4.7,1.6,Iris-versicolor


## Pima Indians - Diabetes Classification

https://www.kaggle.com/uciml/pima-indians-diabetes-database

In [None]:
# Fetch the Pima Indians diabetes file and keep a local copy named pima.csv.
download_save(
    url='https://gist.githubusercontent.com/ktisha/c21e73a1bd1700294ef790c56c8aec1f/raw/819b69b5736821ccee93d05b51de0510bea00294/pima-indians-diabetes.csv',
    filename='pima.csv')

In [None]:
pima_columns = ['preg', 'gluc', 'pres', 'skin' ,'insu', 'bmi', 'pedi', 'age', 'class']
pima_path = os.path.join(data_dir, 'pima.csv')

# header=8 makes pandas treat the ninth line as the header row and ignore the
# lines before it; `names` then replaces that header with the short names.
# NOTE(review): presumably the file starts with nine descriptive lines - verify.
df_pima = pd.read_csv(pima_path, names=pima_columns, header=8)
df_pima.sample(10)

Unnamed: 0,preg,gluc,pres,skin,insu,bmi,pedi,age,class
722,1,149,68,29,127,29.3,0.349,42,1
563,6,99,60,19,54,26.9,0.497,32,0
595,0,188,82,14,185,32.0,0.682,22,1
364,4,147,74,25,293,34.9,0.385,30,0
700,2,122,76,27,200,35.9,0.483,26,0
289,5,108,72,43,75,36.1,0.263,33,0
81,2,74,0,0,0,0.0,0.102,22,0
369,1,133,102,28,140,32.8,0.234,45,1
20,3,126,88,41,235,39.3,0.704,27,0
166,3,148,66,25,0,32.5,0.256,22,0


## Wine - Quality Regression

In [None]:
%%html
<iframe src="https://archive.ics.uci.edu/ml/datasets/wine+quality" width="1100" height="600"></iframe>

In [None]:
# Fetch the red-wine quality file and keep a local copy named wine.csv.
download_save(
    url='https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv',
    filename='wine.csv')

In [None]:
# This file separates its fields with semicolons rather than commas.
wine_path = os.path.join(data_dir, 'wine.csv')
df_wine = pd.read_csv(wine_path, sep=';')
df_wine.sample(10)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
785,9.9,0.35,0.41,2.3,0.083,11.0,61.0,0.9982,3.21,0.5,9.5,5
362,11.9,0.58,0.66,2.5,0.072,6.0,37.0,0.9992,3.05,0.56,10.0,5
994,10.0,0.35,0.45,2.5,0.092,20.0,88.0,0.99918,3.15,0.43,9.4,5
1159,10.2,0.41,0.43,2.2,0.11,11.0,37.0,0.99728,3.16,0.67,10.8,5
649,6.7,0.42,0.27,8.6,0.068,24.0,148.0,0.9948,3.16,0.57,11.3,6
78,6.7,0.75,0.12,2.0,0.086,12.0,80.0,0.9958,3.38,0.52,10.1,5
212,11.6,0.44,0.64,2.1,0.059,5.0,15.0,0.998,3.21,0.67,10.2,6
1233,10.2,0.23,0.37,2.2,0.057,14.0,36.0,0.99614,3.23,0.49,9.3,4
619,11.3,0.37,0.41,2.3,0.088,6.0,16.0,0.9988,3.09,0.8,9.3,5
348,9.6,0.56,0.31,2.8,0.089,15.0,46.0,0.9979,3.11,0.92,10.0,6


## Housing - Price Regression

In [None]:
%%html
<iframe src="https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html" width="1100" height="600"></iframe>

In [None]:
# Fetch the Boston housing file and keep a local copy named housing.csv.
download_save(
    url='https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv',
    filename='housing.csv')

In [None]:
# Plain comma-separated file whose first row holds the column names.
housing_path = os.path.join(data_dir, 'housing.csv')
df_housing = pd.read_csv(housing_path)
df_housing.sample(10)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
441,9.72418,0.0,18.1,0,0.74,6.406,97.2,2.0651,24,666,20.2,385.96,19.52,17.1
464,7.83932,0.0,18.1,0,0.655,6.209,65.4,2.9634,24,666,20.2,396.9,13.22,21.4
120,0.06899,0.0,25.65,0,0.581,5.87,69.7,2.2577,2,188,19.1,389.15,14.37,22.0
466,3.77498,0.0,18.1,0,0.655,5.952,84.7,2.8715,24,666,20.2,22.01,17.15,19.0
271,0.16211,20.0,6.96,0,0.464,6.24,16.3,4.429,3,223,18.6,396.9,6.59,25.2
336,0.03427,0.0,5.19,0,0.515,5.869,46.3,5.2311,5,224,20.2,396.9,9.8,19.5
269,0.09065,20.0,6.96,1,0.464,5.92,61.5,3.9175,3,223,18.6,391.34,13.65,20.7
161,1.46336,0.0,19.58,0,0.605,7.489,90.8,1.9709,5,403,14.7,374.43,1.73,50.0
314,0.3692,0.0,9.9,0,0.544,6.567,87.3,3.6023,4,304,18.4,395.69,9.28,23.8
174,0.08447,0.0,4.05,0,0.51,5.859,68.7,2.7019,5,296,16.6,393.23,9.64,22.6


## Happiness - Satisfaction Regression

In [None]:
%%html
<iframe src="https://ec.europa.eu/eurostat/databrowser/view/ilc_pw01$DV_528/default/table?lang=en" width="1100" height="600"></iframe>

In [None]:
%%html
<iframe src="https://ec.europa.eu/eurostat/statistics-explained/index.php?title=EU_statistics_on_income_and_living_conditions_(EU-SILC)_methodology_-_2013_personal_well-being_indicators" width="1100" height="600"></iframe>

In [None]:
client = EurostatAPIClient('v2.1', 'json', 'en')

# One request is sent per satisfaction indicator of the `ilc_pw01` dataset;
# the partial results are collected here and concatenated once after the loop.
indicator_frames = []
for sat_cat in ['ACCSAT', 'COMSAT', 'FINSAT', 'GREENSAT', 'JOBSAT', 'LIFESAT', 'LIVENVSAT', 'MEANLIFE', 'RELSAT', 'TIMESAT']:

  # Query filters; only `indic_wb` differs between iterations.
  # Each age band is listed exactly once.
  par_df = {
      'geo': ['AT', 'BE', 'BG', 'CH', 'CY', 'CZ', 'DE', 'DK', 'EE', 'EL', 'ES', 'FI', 'FR', 'HR', 'HU', 'IE', 'IS', 'IT', 'LT', 'LU', 'LV', 'ME', 'MK', 'MT', 'NL', 'NO', 'PL', 'PT', 'RO', 'RS', 'SE', 'SI', 'SK', 'TR', 'UK', 'AL', 'XK'],
      'indic_wb': sat_cat,
      'isced11': ['ED0-2', 'ED3_4', 'ED5-8', 'ED5_6'],
      'age': ['Y16-24', 'Y25-34', 'Y35-49', 'Y50-64', 'Y65-74'],
      'unit': 'RTG',
      'sex': ['F', 'M'],
      'time': ['2013', '2018'],
      'precision': 1}

  print(sat_cat)  # progress marker: one line per indicator requested
  indicator_frames.append(client.get_dataset('ilc_pw01', params=par_df).to_dataframe())

# Long format: one row per (group, indicator) with the rating in `values`.
df_long = pd.concat(indicator_frames, axis=0, ignore_index=True)

# Wide format: every column except the indicator, its value and the unit
# identifies a group; each indicator becomes a column of its own.
id_columns = list(df_long.columns.drop(['indic_wb', 'values', 'unit']))
df = df_long.pivot(values='values', columns='indic_wb', index=id_columns).reset_index()

# Rows without a LIFESAT value are discarded before saving.
df = df.dropna(subset=['LIFESAT'], axis=0)
df.to_csv(os.path.join(data_dir, 'happiness.csv'), index=False)

ACCSAT
COMSAT
FINSAT
GREENSAT
JOBSAT
LIFESAT
LIVENVSAT
MEANLIFE
RELSAT
TIMESAT


In [None]:
# Reload the table from the CSV written to the data folder.
happiness_path = os.path.join(data_dir, 'happiness.csv')
df_happiness = pd.read_csv(happiness_path)
df_happiness.sample(10)

Unnamed: 0,isced11,sex,age,geo,time,ACCSAT,COMSAT,FINSAT,GREENSAT,JOBSAT,LIFESAT,LIVENVSAT,MEANLIFE,RELSAT,TIMESAT
1228,ED3_4,M,Y35-49,IE,2018,,,6.4,,7.7,7.8,,,8.5,6.8
1175,ED3_4,M,Y25-34,NL,2018,,,6.7,,7.4,7.5,,,7.8,6.8
1742,ED5-8,F,Y65-74,SK,2013,8.2,,6.2,6.9,,7.0,7.0,7.6,7.9,8.1
1706,ED5-8,F,Y65-74,HU,2013,7.6,,6.4,6.3,,6.5,6.6,7.5,7.8,7.2
1777,ED5-8,M,Y16-24,PL,2013,7.8,7.5,6.9,7.9,7.7,8.3,7.4,8.2,8.5,7.1
426,ED0-2,M,Y25-34,BG,2013,4.7,5.1,2.5,4.2,4.9,4.0,4.3,5.2,5.1,5.1
1106,ED3_4,M,Y16-24,NO,2018,,,7.4,,8.6,8.1,,,8.8,7.7
911,ED3_4,F,Y35-49,UK,2018,,,6.3,,7.1,7.4,,,8.4,6.1
523,ED0-2,M,Y35-49,LT,2013,6.8,7.5,4.4,7.0,6.9,5.8,7.4,6.2,7.2,6.3
1149,ED3_4,M,Y25-34,FI,2018,,,7.0,,8.0,7.9,,,8.2,7.3


# Missing Values

In [None]:
df_happiness

Unnamed: 0,isced11,sex,age,geo,time,ACCSAT,COMSAT,FINSAT,GREENSAT,JOBSAT,LIFESAT,LIVENVSAT,MEANLIFE,RELSAT,TIMESAT
0,ED0-2,F,Y16-24,AL,2018,,,4.9,,5.6,6.1,,,7.2,5.9
1,ED0-2,F,Y16-24,AT,2013,8.6,8.4,7.5,8.1,8.5,8.2,8.2,7.8,8.8,7.2
2,ED0-2,F,Y16-24,AT,2018,,,7.5,,8.1,8.0,,,8.9,7.3
3,ED0-2,F,Y16-24,BE,2013,7.8,,6.7,6.7,,7.7,7.4,8.0,8.1,7.7
4,ED0-2,F,Y16-24,BE,2018,,,6.8,,,7.8,,,8.0,7.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2140,ED5_6,M,Y65-74,MT,2013,8.2,,7.5,6.7,,7.9,7.1,8.4,8.9,7.6
2141,ED5_6,M,Y65-74,RO,2013,8.0,,7.1,7.4,,7.5,7.3,7.6,7.9,7.9
2142,ED5_6,M,Y65-74,SE,2013,8.9,8.6,8.8,8.9,8.7,8.4,8.3,8.3,8.7,8.6
2143,ED5_6,M,Y65-74,TR,2013,7.2,6.8,6.1,6.4,7.0,6.2,7.3,7.2,7.7,6.5


## Detect

In [None]:
# Number of missing entries in every column.
df_happiness.isna().sum()

isced11         0
sex             0
age             0
geo             0
time            0
ACCSAT       1084
COMSAT       1336
FINSAT          0
GREENSAT     1085
JOBSAT        464
LIFESAT         0
LIVENVSAT    1084
MEANLIFE     1085
RELSAT          0
TIMESAT         1
dtype: int64

In [None]:
# Share of missing entries per column, expressed in percent of all rows.
df_happiness.isna().sum().div(len(df_happiness)).mul(100)

isced11       0.000000
sex           0.000000
age           0.000000
geo           0.000000
time          0.000000
ACCSAT       50.536131
COMSAT       62.284382
FINSAT        0.000000
GREENSAT     50.582751
JOBSAT       21.631702
LIFESAT       0.000000
LIVENVSAT    50.536131
MEANLIFE     50.582751
RELSAT        0.000000
TIMESAT       0.046620
dtype: float64

## Drop

In [None]:
# Keep only the rows that have no missing value; the result is a new frame
# and `df_happiness` itself is left untouched.
df_happiness.dropna(axis='index')

Unnamed: 0,isced11,sex,age,geo,time,ACCSAT,COMSAT,FINSAT,GREENSAT,JOBSAT,LIFESAT,LIVENVSAT,MEANLIFE,RELSAT,TIMESAT
1,ED0-2,F,Y16-24,AT,2013,8.6,8.4,7.5,8.1,8.5,8.2,8.2,7.8,8.8,7.2
12,ED0-2,F,Y16-24,DE,2013,7.6,6.8,6.3,7.2,7.2,7.4,7.2,7.2,8.0,6.5
16,ED0-2,F,Y16-24,EE,2013,7.7,8.0,6.0,6.9,7.8,7.3,7.1,7.7,8.0,7.0
20,ED0-2,F,Y16-24,ES,2013,7.1,7.3,5.7,6.5,6.9,7.3,7.0,7.8,8.0,7.2
28,ED0-2,F,Y16-24,HU,2013,6.4,6.6,4.9,5.9,6.6,7.0,5.9,7.5,8.1,6.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2136,ED5_6,M,Y50-64,UK,2013,8.1,7.4,6.8,8.0,7.2,7.4,7.9,7.8,8.5,6.8
2139,ED5_6,M,Y65-74,IT,2013,7.7,7.3,6.8,6.3,7.6,7.2,6.5,7.6,7.5,7.4
2142,ED5_6,M,Y65-74,SE,2013,8.9,8.6,8.8,8.9,8.7,8.4,8.3,8.3,8.7,8.6
2143,ED5_6,M,Y65-74,TR,2013,7.2,6.8,6.1,6.4,7.0,6.2,7.3,7.2,7.7,6.5


## Impute

### with constant

In [None]:
# Replace every missing entry with the constant 0 (returns a new frame).
df_happiness.fillna(value=0)

Unnamed: 0,isced11,sex,age,geo,time,ACCSAT,COMSAT,FINSAT,GREENSAT,JOBSAT,LIFESAT,LIVENVSAT,MEANLIFE,RELSAT,TIMESAT
0,ED0-2,F,Y16-24,AL,2018,0.0,0.0,4.9,0.0,5.6,6.1,0.0,0.0,7.2,5.9
1,ED0-2,F,Y16-24,AT,2013,8.6,8.4,7.5,8.1,8.5,8.2,8.2,7.8,8.8,7.2
2,ED0-2,F,Y16-24,AT,2018,0.0,0.0,7.5,0.0,8.1,8.0,0.0,0.0,8.9,7.3
3,ED0-2,F,Y16-24,BE,2013,7.8,0.0,6.7,6.7,0.0,7.7,7.4,8.0,8.1,7.7
4,ED0-2,F,Y16-24,BE,2018,0.0,0.0,6.8,0.0,0.0,7.8,0.0,0.0,8.0,7.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2140,ED5_6,M,Y65-74,MT,2013,8.2,0.0,7.5,6.7,0.0,7.9,7.1,8.4,8.9,7.6
2141,ED5_6,M,Y65-74,RO,2013,8.0,0.0,7.1,7.4,0.0,7.5,7.3,7.6,7.9,7.9
2142,ED5_6,M,Y65-74,SE,2013,8.9,8.6,8.8,8.9,8.7,8.4,8.3,8.3,8.7,8.6
2143,ED5_6,M,Y65-74,TR,2013,7.2,6.8,6.1,6.4,7.0,6.2,7.3,7.2,7.7,6.5


In [None]:
help(df_happiness.fillna)

Help on method fillna in module pandas.core.frame:

fillna(value=None, method=None, axis=None, inplace=False, limit=None, downcast=None) -> Union[ForwardRef('DataFrame'), NoneType] method of pandas.core.frame.DataFrame instance
    Fill NA/NaN values using the specified method.
    
    Parameters
    ----------
    value : scalar, dict, Series, or DataFrame
        Value to use to fill holes (e.g. 0), alternately a
        dict/Series/DataFrame of values specifying which value to use for
        each index (for a Series) or column (for a DataFrame).  Values not
        in the dict/Series/DataFrame will not be filled. This value cannot
        be a list.
    method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
        Method to use for filling holes in reindexed Series
        pad / ffill: propagate last valid observation forward to next valid
        backfill / bfill: use next valid observation to fill gap.
    axis : {0 or 'index', 1 or 'columns'}
        Axis along whi

### with preceding

In [None]:
# Forward fill: a missing entry takes the last valid value above it in the
# same column; entries with no valid value above them stay NaN.
# `DataFrame.ffill()` is the dedicated method for this and replaces
# `fillna(method='ffill')`, which recent pandas releases deprecate.
df_happiness.ffill()

Unnamed: 0,isced11,sex,age,geo,time,ACCSAT,COMSAT,FINSAT,GREENSAT,JOBSAT,LIFESAT,LIVENVSAT,MEANLIFE,RELSAT,TIMESAT
0,ED0-2,F,Y16-24,AL,2018,,,4.9,,5.6,6.1,,,7.2,5.9
1,ED0-2,F,Y16-24,AT,2013,8.6,8.4,7.5,8.1,8.5,8.2,8.2,7.8,8.8,7.2
2,ED0-2,F,Y16-24,AT,2018,8.6,8.4,7.5,8.1,8.1,8.0,8.2,7.8,8.9,7.3
3,ED0-2,F,Y16-24,BE,2013,7.8,8.4,6.7,6.7,8.1,7.7,7.4,8.0,8.1,7.7
4,ED0-2,F,Y16-24,BE,2018,7.8,8.4,6.8,6.7,8.1,7.8,7.4,8.0,8.0,7.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2140,ED5_6,M,Y65-74,MT,2013,8.2,7.3,7.5,6.7,7.6,7.9,7.1,8.4,8.9,7.6
2141,ED5_6,M,Y65-74,RO,2013,8.0,7.3,7.1,7.4,7.6,7.5,7.3,7.6,7.9,7.9
2142,ED5_6,M,Y65-74,SE,2013,8.9,8.6,8.8,8.9,8.7,8.4,8.3,8.3,8.7,8.6
2143,ED5_6,M,Y65-74,TR,2013,7.2,6.8,6.1,6.4,7.0,6.2,7.3,7.2,7.7,6.5


### with average

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
help(SimpleImputer)

Help on class SimpleImputer in module sklearn.impute._base:

class SimpleImputer(_BaseImputer)
 |  SimpleImputer(missing_values=nan, strategy='mean', fill_value=None, verbose=0, copy=True, add_indicator=False)
 |  
 |  Imputation transformer for completing missing values.
 |  
 |  Read more in the :ref:`User Guide <impute>`.
 |  
 |  Parameters
 |  ----------
 |  missing_values : number, string, np.nan (default) or None
 |      The placeholder for the missing values. All occurrences of
 |      `missing_values` will be imputed.
 |  
 |  strategy : string, default='mean'
 |      The imputation strategy.
 |  
 |      - If "mean", then replace missing values using the mean along
 |        each column. Can only be used with numeric data.
 |      - If "median", then replace missing values using the median along
 |        each column. Can only be used with numeric data.
 |      - If "most_frequent", then replace missing using the most frequent
 |        value along each column. Can be used wi

In [None]:
# Count the gaps per column once, then keep the names whose count is non-zero
# (column order is preserved).
missing_per_column = df_happiness.isnull().sum()
columns_with_missing_values = list(missing_per_column[missing_per_column > 0].index)
columns_with_missing_values

['ACCSAT', 'COMSAT', 'GREENSAT', 'JOBSAT', 'LIVENVSAT', 'MEANLIFE', 'TIMESAT']

In [None]:
# strategy='mean' fills every gap with the mean of its own column.
my_imputer = SimpleImputer(strategy='mean')

# Work on a copy so that `df_happiness` keeps its missing values.
df_imputed = df_happiness.copy()
incomplete_values = df_imputed[columns_with_missing_values].to_numpy()
df_imputed[columns_with_missing_values] = my_imputer.fit_transform(incomplete_values)

df_imputed.sample(10)

Unnamed: 0,isced11,sex,age,geo,time,ACCSAT,COMSAT,FINSAT,GREENSAT,JOBSAT,LIFESAT,LIVENVSAT,MEANLIFE,RELSAT,TIMESAT
782,ED3_4,F,Y25-34,CZ,2018,7.406315,7.372188,6.7,7.034811,7.4,7.9,7.119887,7.522264,8.5,6.0
2077,ED5_6,F,Y25-34,IE,2013,7.6,8.2,5.5,7.4,7.2,7.7,7.9,7.8,8.9,6.3
1433,ED5-8,F,Y16-24,HU,2018,7.406315,7.372188,5.8,7.034811,7.204045,7.5,7.119887,7.522264,7.9,7.8
1192,ED3_4,M,Y25-34,TR,2013,6.0,6.7,5.2,5.8,6.4,5.8,6.3,6.6,7.6,5.7
2082,ED5_6,F,Y25-34,TR,2013,6.4,6.8,5.9,5.7,6.7,6.4,6.5,6.9,7.6,5.6
2136,ED5_6,M,Y50-64,UK,2013,8.1,7.4,6.8,8.0,7.2,7.4,7.9,7.8,8.5,6.8
679,ED0-2,M,Y65-74,NO,2013,8.6,7.372188,7.9,9.0,7.204045,8.2,8.9,8.3,8.8,8.6
1267,ED3_4,M,Y35-49,XK,2018,7.406315,7.372188,5.2,7.034811,6.7,6.0,7.119887,7.522264,8.6,6.1
667,ED0-2,M,Y65-74,LU,2013,7.6,7.372188,6.7,7.8,7.204045,6.8,7.8,7.8,7.6,7.7
1183,ED3_4,M,Y25-34,RO,2018,7.406315,7.372188,6.8,7.034811,7.2,7.7,7.119887,7.522264,8.0,7.0


In [None]:
df_happiness

Unnamed: 0,isced11,sex,age,geo,time,ACCSAT,COMSAT,FINSAT,GREENSAT,JOBSAT,LIFESAT,LIVENVSAT,MEANLIFE,RELSAT,TIMESAT
0,ED0-2,F,Y16-24,AL,2018,,,4.9,,5.6,6.1,,,7.2,5.9
1,ED0-2,F,Y16-24,AT,2013,8.6,8.4,7.5,8.1,8.5,8.2,8.2,7.8,8.8,7.2
2,ED0-2,F,Y16-24,AT,2018,,,7.5,,8.1,8.0,,,8.9,7.3
3,ED0-2,F,Y16-24,BE,2013,7.8,,6.7,6.7,,7.7,7.4,8.0,8.1,7.7
4,ED0-2,F,Y16-24,BE,2018,,,6.8,,,7.8,,,8.0,7.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2140,ED5_6,M,Y65-74,MT,2013,8.2,,7.5,6.7,,7.9,7.1,8.4,8.9,7.6
2141,ED5_6,M,Y65-74,RO,2013,8.0,,7.1,7.4,,7.5,7.3,7.6,7.9,7.9
2142,ED5_6,M,Y65-74,SE,2013,8.9,8.6,8.8,8.9,8.7,8.4,8.3,8.3,8.7,8.6
2143,ED5_6,M,Y65-74,TR,2013,7.2,6.8,6.1,6.4,7.0,6.2,7.3,7.2,7.7,6.5


In [None]:
# One boolean `<column>_nan` flag per incomplete column records which entries
# were originally missing; the flags are appended to a copy of the data.
nan_flags = df_happiness[columns_with_missing_values].isna().add_suffix('_nan')
df_imputed = pd.concat([df_happiness, nan_flags], axis=1)

# The gaps themselves are then filled with the mean of their column.
my_imputer = SimpleImputer(strategy='mean')
incomplete_values = df_imputed[columns_with_missing_values].to_numpy()
df_imputed[columns_with_missing_values] = my_imputer.fit_transform(incomplete_values)

df_imputed.sample(10)

Unnamed: 0,isced11,sex,age,geo,time,ACCSAT,COMSAT,FINSAT,GREENSAT,JOBSAT,LIFESAT,LIVENVSAT,MEANLIFE,RELSAT,TIMESAT,ACCSAT_nan,COMSAT_nan,GREENSAT_nan,JOBSAT_nan,LIVENVSAT_nan,MEANLIFE_nan,TIMESAT_nan
1757,ED5-8,M,Y16-24,EL,2013,6.6,7.372188,3.3,4.9,7.204045,6.9,5.1,7.1,7.6,6.2,False,True,False,True,False,False,False
893,ED3_4,F,Y35-49,NO,2018,7.406315,7.372188,7.1,7.034811,8.0,7.7,7.119887,7.522264,8.3,6.7,True,True,True,False,True,True,False
1206,ED3_4,M,Y35-49,CY,2018,7.406315,7.372188,5.8,7.034811,7.3,7.0,7.119887,7.522264,8.5,5.8,True,True,True,False,True,True,False
1414,ED5-8,F,Y16-24,BE,2018,7.406315,7.372188,7.5,7.034811,7.8,7.9,7.119887,7.522264,8.3,7.4,True,True,True,False,True,True,False
1832,ED5-8,M,Y25-34,MK,2013,6.6,6.6,5.9,6.8,7.0,6.3,6.7,7.9,8.2,7.3,False,False,False,False,False,False,False
1098,ED3_4,M,Y16-24,ME,2018,7.406315,7.372188,6.3,7.034811,7.1,7.5,7.119887,7.522264,8.5,7.3,True,True,True,False,True,True,False
345,ED0-2,F,Y65-74,SK,2018,7.406315,7.372188,4.8,7.034811,7.204045,5.9,7.119887,7.522264,8.1,7.1,True,True,True,True,True,True,False
1264,ED3_4,M,Y35-49,TR,2018,7.406315,7.372188,4.9,7.034811,6.5,5.7,7.119887,7.522264,6.8,4.9,True,True,True,False,True,True,False
2075,ED5_6,F,Y16-24,UK,2013,7.5,7.8,6.4,7.4,6.8,7.6,7.9,7.7,8.4,7.1,False,False,False,False,False,False,False
42,ED0-2,F,Y16-24,ME,2013,7.2,7.372188,5.4,6.8,7.204045,7.1,6.2,7.1,6.4,7.5,False,True,False,True,False,False,False


# Categorical Variables

In [None]:
df_happiness.nunique()

isced11       4
sex           2
age           5
geo          37
time          2
ACCSAT       49
COMSAT       46
FINSAT       66
GREENSAT     54
JOBSAT       52
LIFESAT      57
LIVENVSAT    51
MEANLIFE     41
RELSAT       47
TIMESAT      50
dtype: int64

In [None]:
# Columns with fewer than 40 distinct values are treated as categorical.
# NOTE(review): the threshold is a hand-picked cut-off; MEANLIFE has 41
# distinct values and geo has 37, so the split is fragile - confirm.
distinct_counts = df_happiness.nunique()
categorical_columns = list(distinct_counts[distinct_counts < 40].index)
categorical_columns

['isced11', 'sex', 'age', 'geo', 'time']

## Ordinal Encoding

In [None]:
from sklearn.preprocessing import OrdinalEncoder

In [None]:
help(OrdinalEncoder)

Help on class OrdinalEncoder in module sklearn.preprocessing._encoders:

class OrdinalEncoder(_BaseEncoder)
 |  OrdinalEncoder(categories='auto', dtype=<class 'numpy.float64'>)
 |  
 |  Encode categorical features as an integer array.
 |  
 |  The input to this transformer should be an array-like of integers or
 |  strings, denoting the values taken on by categorical (discrete) features.
 |  The features are converted to ordinal integers. This results in
 |  a single column of integers (0 to n_categories - 1) per feature.
 |  
 |  Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
 |  
 |  .. versionchanged:: 0.20.1
 |  
 |  Parameters
 |  ----------
 |  categories : 'auto' or a list of array-like, default='auto'
 |      Categories (unique values) per feature:
 |  
 |      - 'auto' : Determine categories automatically from the training data.
 |      - list : ``categories[i]`` holds the categories expected in the ith
 |        column. The passed categories should n

In [None]:
# With default settings the encoder derives the categories of every column
# from the data and maps each one to a number.
df_happiness_ordinal = df_happiness.copy()

ordinal_encoder = OrdinalEncoder()
ordinal_codes = ordinal_encoder.fit_transform(df_happiness[categorical_columns])
df_happiness_ordinal[categorical_columns] = ordinal_codes

df_happiness_ordinal.sample(10)

Unnamed: 0,isced11,sex,age,geo,time,ACCSAT,COMSAT,FINSAT,GREENSAT,JOBSAT,LIFESAT,LIVENVSAT,MEANLIFE,RELSAT,TIMESAT
680,0.0,1.0,4.0,26.0,1.0,,,7.8,,,7.8,,,8.8,8.5
1429,2.0,0.0,0.0,12.0,1.0,,,7.1,,,8.2,,,8.6,7.1
1809,2.0,1.0,1.0,11.0,1.0,,,7.1,,7.7,7.8,,,8.6,6.9
53,0.0,0.0,0.0,27.0,1.0,,,6.5,,,8.2,,,8.6,7.4
281,0.0,0.0,4.0,1.0,0.0,8.5,,6.6,8.6,,7.5,8.8,8.1,8.9,8.3
60,0.0,0.0,0.0,31.0,0.0,8.3,,7.7,7.9,,7.7,7.4,7.4,8.2,7.4
1359,1.0,1.0,4.0,11.0,0.0,7.5,,6.3,6.9,,7.2,7.8,7.8,8.3,8.0
1094,1.0,1.0,0.0,20.0,1.0,,,6.3,,7.8,7.3,,,7.7,7.2
1065,1.0,1.0,0.0,6.0,0.0,7.7,7.1,6.2,7.5,6.9,7.6,7.6,7.6,8.2,7.1
901,1.0,0.0,2.0,30.0,1.0,,,4.2,,5.9,5.6,,,7.4,5.1


In [None]:
ordinal_encoder.categories_

[array(['ED0-2', 'ED3_4', 'ED5-8', 'ED5_6'], dtype=object),
 array(['F', 'M'], dtype=object),
 array(['Y16-24', 'Y25-34', 'Y35-49', 'Y50-64', 'Y65-74'], dtype=object),
 array(['AL', 'AT', 'BE', 'BG', 'CH', 'CY', 'CZ', 'DE', 'DK', 'EE', 'EL',
        'ES', 'FI', 'FR', 'HR', 'HU', 'IE', 'IS', 'IT', 'LT', 'LU', 'LV',
        'ME', 'MK', 'MT', 'NL', 'NO', 'PL', 'PT', 'RO', 'RS', 'SE', 'SI',
        'SK', 'TR', 'UK', 'XK'], dtype=object),
 array([2013, 2018])]

In [None]:
ordinal_encoder.inverse_transform(df_happiness_ordinal[categorical_columns])

array([['ED0-2', 'F', 'Y16-24', 'AL', 2018],
       ['ED0-2', 'F', 'Y16-24', 'AT', 2013],
       ['ED0-2', 'F', 'Y16-24', 'AT', 2018],
       ...,
       ['ED5_6', 'M', 'Y65-74', 'SE', 2013],
       ['ED5_6', 'M', 'Y65-74', 'TR', 2013],
       ['ED5_6', 'M', 'Y65-74', 'UK', 2013]], dtype=object)

In [None]:
# Explicit category order per column; the position inside each list becomes
# the numeric code. `age` and `geo` keep the order in which their values
# first appear in the data.
category_order = {
    'isced11': ['ED0-2', 'ED3_4', 'ED5_6', 'ED5-8'],
    'sex': ['M', 'F'],
    'age': list(df_happiness['age'].unique()),
    'geo': list(df_happiness['geo'].unique()),
    'time': [2013, 2018],
}
ordinal_encoder = OrdinalEncoder(categories=[category_order[col] for col in categorical_columns])

df_happiness_ordinal = df_happiness.copy()
ordinal_codes = ordinal_encoder.fit_transform(df_happiness[categorical_columns])
df_happiness_ordinal[categorical_columns] = ordinal_codes

df_happiness_ordinal.sample(10)

Unnamed: 0,isced11,sex,age,geo,time,ACCSAT,COMSAT,FINSAT,GREENSAT,JOBSAT,LIFESAT,LIVENVSAT,MEANLIFE,RELSAT,TIMESAT
948,1.0,1.0,3.0,18.0,1.0,,,6.6,,7.3,7.3,,,7.7,6.7
1379,1.0,0.0,4.0,21.0,0.0,6.7,7.4,5.0,7.7,7.5,6.0,7.5,7.2,8.1,7.1
1936,3.0,0.0,3.0,5.0,0.0,7.9,7.7,5.9,6.3,7.4,6.4,6.7,7.5,8.2,6.9
12,0.0,1.0,0.0,7.0,0.0,7.6,6.8,6.3,7.2,7.2,7.4,7.2,7.2,8.0,6.5
1107,1.0,0.0,0.0,27.0,0.0,7.4,7.4,6.1,7.5,7.0,8.0,7.5,7.8,8.3,7.5
1094,1.0,0.0,0.0,20.0,1.0,,,6.3,,7.8,7.3,,,7.7,7.2
55,0.0,1.0,0.0,28.0,1.0,,,6.7,,7.9,8.3,,,8.9,7.2
1055,1.0,0.0,0.0,0.0,1.0,,,5.0,,5.3,5.8,,,7.2,5.9
2042,3.0,0.0,4.0,23.0,0.0,7.5,,6.2,6.9,,6.6,7.1,7.8,8.7,7.9
1918,3.0,0.0,2.0,31.0,0.0,7.8,7.6,7.6,8.2,7.5,8.0,7.8,7.9,7.8,6.3


In [None]:
ordinal_encoder.categories_

[array(['ED0-2', 'ED3_4', 'ED5_6', 'ED5-8'], dtype=object),
 array(['M', 'F'], dtype=object),
 array(['Y16-24', 'Y25-34', 'Y35-49', 'Y50-64', 'Y65-74'], dtype=object),
 array(['AL', 'AT', 'BE', 'BG', 'CH', 'CY', 'CZ', 'DE', 'DK', 'EE', 'EL',
        'ES', 'FI', 'FR', 'HR', 'HU', 'IE', 'IS', 'IT', 'LT', 'LU', 'LV',
        'ME', 'MK', 'MT', 'NL', 'NO', 'PL', 'PT', 'RO', 'RS', 'SE', 'SI',
        'SK', 'TR', 'UK', 'XK'], dtype=object),
 array([2013, 2018])]

## OneHot Encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
help(OneHotEncoder)

Help on class OneHotEncoder in module sklearn.preprocessing._encoders:

class OneHotEncoder(_BaseEncoder)
 |  OneHotEncoder(categories='auto', drop=None, sparse=True, dtype=<class 'numpy.float64'>, handle_unknown='error')
 |  
 |  Encode categorical features as a one-hot numeric array.
 |  
 |  The input to this transformer should be an array-like of integers or
 |  strings, denoting the values taken on by categorical (discrete) features.
 |  The features are encoded using a one-hot (aka 'one-of-K' or 'dummy')
 |  encoding scheme. This creates a binary column for each category and
 |  returns a sparse matrix or dense array (depending on the ``sparse``
 |  parameter)
 |  
 |  By default, the encoder derives the categories based on the unique values
 |  in each feature. Alternatively, you can also specify the `categories`
 |  manually.
 |  
 |  This encoding is needed for feeding categorical data to many scikit-learn
 |  estimators, notably linear models and SVMs with the standard kernel

In [None]:
# The encoder returns a SciPy sparse matrix by default. Densifying it with
# `.toarray()` avoids the `sparse=False` keyword, which newer scikit-learn
# releases renamed to `sparse_output`, so this cell runs on old and new versions.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
onehot_values = onehot_encoder.fit_transform(df_happiness[categorical_columns]).toarray()

# One 0/1 column per category, labelled 0..n-1 and aligned with the rows of
# the original frame.
OH_categorical = pd.DataFrame(onehot_values, index=df_happiness.index)

# Swap the categorical columns for their one-hot counterparts without
# mutating any frame in place.
df_happiness_onehot = pd.concat(
    [df_happiness.drop(columns=categorical_columns), OH_categorical], axis=1)

df_happiness_onehot

Unnamed: 0,ACCSAT,COMSAT,FINSAT,GREENSAT,JOBSAT,LIFESAT,LIVENVSAT,MEANLIFE,RELSAT,TIMESAT,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
0,,,4.9,,5.6,6.1,,,7.2,5.9,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,8.6,8.4,7.5,8.1,8.5,8.2,8.2,7.8,8.8,7.2,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,,,7.5,,8.1,8.0,,,8.9,7.3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,7.8,,6.7,6.7,,7.7,7.4,8.0,8.1,7.7,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,,,6.8,,,7.8,,,8.0,7.6,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2140,8.2,,7.5,6.7,,7.9,7.1,8.4,8.9,7.6,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2141,8.0,,7.1,7.4,,7.5,7.3,7.6,7.9,7.9,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2142,8.9,8.6,8.8,8.9,8.7,8.4,8.3,8.3,8.7,8.6,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2143,7.2,6.8,6.1,6.4,7.0,6.2,7.3,7.2,7.7,6.5,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [None]:
onehot_encoder.categories_

[array(['ED0-2', 'ED3_4', 'ED5-8', 'ED5_6'], dtype=object),
 array(['F', 'M'], dtype=object),
 array(['Y16-24', 'Y25-34', 'Y35-49', 'Y50-64', 'Y65-74'], dtype=object),
 array(['AL', 'AT', 'BE', 'BG', 'CH', 'CY', 'CZ', 'DE', 'DK', 'EE', 'EL',
        'ES', 'FI', 'FR', 'HR', 'HU', 'IE', 'IS', 'IT', 'LT', 'LU', 'LV',
        'ME', 'MK', 'MT', 'NL', 'NO', 'PL', 'PT', 'RO', 'RS', 'SE', 'SI',
        'SK', 'TR', 'UK', 'XK'], dtype=object),
 array([2013, 2018])]

In [None]:
onehot_encoder.categories_

[array(['ED0-2', 'ED3_4', 'ED5-8', 'ED5_6'], dtype=object),
 array(['F', 'M'], dtype=object),
 array(['Y16-24', 'Y25-34', 'Y35-49', 'Y50-64', 'Y65-74'], dtype=object),
 array(['AL', 'AT', 'BE', 'BG', 'CH', 'CY', 'CZ', 'DE', 'DK', 'EE', 'EL',
        'ES', 'FI', 'FR', 'HR', 'HU', 'IE', 'IS', 'IT', 'LT', 'LU', 'LV',
        'ME', 'MK', 'MT', 'NL', 'NO', 'PL', 'PT', 'RO', 'RS', 'SE', 'SI',
        'SK', 'TR', 'UK', 'XK'], dtype=object),
 array([2013, 2018])]

In [None]:
# The encoder returns a SciPy sparse matrix by default. Densifying it with
# `.toarray()` avoids the `sparse=False` keyword, which newer scikit-learn
# releases renamed to `sparse_output`, so this cell runs on old and new versions.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')
onehot_values = onehot_encoder.fit_transform(df_happiness[categorical_columns]).toarray()

# Label every one-hot column with the category it stands for, in the order
# the encoder emits them (all categories of the first column, then the second, ...).
# NOTE(review): the labels carry no column prefix, so two columns sharing a
# category value would produce duplicate labels, and the `time` labels are
# integers while the others are strings - confirm this is acceptable.
column_names = [val for cat in onehot_encoder.categories_ for val in cat]

OH_categorical = pd.DataFrame(onehot_values, columns=column_names, index=df_happiness.index)

# Swap the categorical columns for their one-hot counterparts without
# mutating any frame in place.
df_happiness_onehot = pd.concat(
    [df_happiness.drop(columns=categorical_columns), OH_categorical], axis=1)

df_happiness_onehot

Unnamed: 0,ACCSAT,COMSAT,FINSAT,GREENSAT,JOBSAT,LIFESAT,LIVENVSAT,MEANLIFE,RELSAT,TIMESAT,ED0-2,ED3_4,ED5-8,ED5_6,F,M,Y16-24,Y25-34,Y35-49,Y50-64,Y65-74,AL,AT,BE,BG,CH,CY,CZ,DE,DK,EE,EL,ES,FI,FR,HR,HU,IE,IS,IT,LT,LU,LV,ME,MK,MT,NL,NO,PL,PT,RO,RS,SE,SI,SK,TR,UK,XK,2013,2018
0,,,4.9,,5.6,6.1,,,7.2,5.9,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,8.6,8.4,7.5,8.1,8.5,8.2,8.2,7.8,8.8,7.2,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,,,7.5,,8.1,8.0,,,8.9,7.3,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,7.8,,6.7,6.7,,7.7,7.4,8.0,8.1,7.7,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,,,6.8,,,7.8,,,8.0,7.6,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2140,8.2,,7.5,6.7,,7.9,7.1,8.4,8.9,7.6,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2141,8.0,,7.1,7.4,,7.5,7.3,7.6,7.9,7.9,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2142,8.9,8.6,8.8,8.9,8.7,8.4,8.3,8.3,8.7,8.6,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2143,7.2,6.8,6.1,6.4,7.0,6.2,7.3,7.2,7.7,6.5,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [None]:
df_happiness_onehot.shape

(2145, 60)

## Label Encoding


In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Replace the class names of the target column with integer labels; the
# fitted encoder is kept so the labels can be mapped back to names.
df_iris_encoded = df_iris.copy()

label_encoder = LabelEncoder()
label_encoder.fit(df_iris['class'])
df_iris_encoded['class'] = label_encoder.transform(df_iris['class'])

df_iris_encoded.sample(10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
63,6.1,2.9,4.7,1.4,1
121,5.6,2.8,4.9,2.0,2
10,5.4,3.7,1.5,0.2,0
24,4.8,3.4,1.9,0.2,0
109,7.2,3.6,6.1,2.5,2
61,5.9,3.0,4.2,1.5,1
88,5.6,3.0,4.1,1.3,1
4,5.0,3.6,1.4,0.2,0
20,5.4,3.4,1.7,0.2,0
35,5.0,3.2,1.2,0.2,0


In [None]:
label_encoder.classes_

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [None]:
label_encoder.inverse_transform(df_iris_encoded['class'])

array(['Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
       'Iris-setosa', 'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versic