In [1]:
import os

# The star import stays ahead of the explicit imports so that, as before,
# the explicit names below win over anything my_functions happens to export.
from ipynb.fs.full.my_functions import *
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import chi2

In [15]:
# Location of the input table. The original absolute path is kept as the
# default so existing runs behave exactly as before; set the TFM_DATA_PATH
# environment variable to load the file from another machine or folder
# without editing the notebook.
data_path = os.environ.get('TFM_DATA_PATH', r'C:\Users\dvale\TFM\Data\variables.csv')
data = pd.read_csv(data_path)

In [16]:
# Print the column names, non-null counts and dtypes of the loaded table.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32514 entries, 0 to 32513
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   id                         32514 non-null  int64  
 1   weight                     32514 non-null  float64
 2   material_deprivation       32514 non-null  object 
 3   sex                        32514 non-null  object 
 4   age                        32514 non-null  int64  
 5   civil_status               32514 non-null  object 
 6   familial_status            32514 non-null  object 
 7   region                     32514 non-null  object 
 8   population_density         32514 non-null  object 
 9   citizenship                32514 non-null  object 
 10  tenure_status              32514 non-null  object 
 11  education_level            32514 non-null  object 
 12  working_status             32514 non-null  object 
 13  occupation                 32514 non-null  obj

In [17]:
# Rescale the numerical variables with MinMaxScaler. The categorical
# variables are one-hot encoded later on, so after this step every feature
# takes values between 0 and 1.

numeric_columns = [
    'age',
    'years_worked',
    'hours_week_worked',
    'adjusted_income',
    'proportion_social_welfare',
]
num_var = data[numeric_columns]

scaler = MinMaxScaler()
scaled_var = pd.DataFrame(scaler.fit_transform(num_var), columns=num_var.columns)
scaled_var.head()

Unnamed: 0,age,years_worked,hours_week_worked,adjusted_income,proportion_social_welfare
0,0.761194,0.723077,0.0,0.222241,0.0
1,0.731343,0.030769,0.0,0.222241,0.0
2,0.791045,0.307692,0.0,0.258452,0.0
3,0.61194,0.538462,0.0,0.258452,0.0
4,0.522388,0.461538,0.0,0.124813,0.0


In [18]:
# Weight the scaled numeric features before applying the hypothesis tests.
#
# The survey weight and a 0/1 recoding of the target ('Yes' -> 1, 'No' -> 0)
# are attached first; weighted_df (from my_functions) then takes that
# dataframe and returns a weighted version of it.

deprivation_flag = data['material_deprivation'].map({'Yes': 1, 'No': 0})
scaled_var = scaled_var.assign(
    weight=data['weight'],
    material_deprivation=deprivation_flag,
)

weighted_scaled_var = weighted_df(scaled_var)

In [19]:
# Analysis of variance: score every weighted, scaled numeric feature against
# the binary target with the ANOVA F-test (f_classif). k='all' keeps every
# feature; only the scores and p-values are used afterwards.
#
# The target is separated by name rather than by position, so the split stays
# correct even if the column order of weighted_scaled_var ever changes.

y = weighted_scaled_var['material_deprivation']
X = weighted_scaled_var.drop(columns='material_deprivation')

fs = SelectKBest(score_func=f_classif, k='all').fit(X, y)

In [20]:
# Rank the scaled numeric variables by their ANOVA p-value.

# Maps (p-value, column position) -> variable name. The position is part of
# the key so that two variables with exactly the same p-value (for instance
# when both underflow to 0.0) are both kept instead of overwriting each other.
# The values are still the variable names, in column order.
scaled_pvalues = {
    (pvalue, position): scaled_var.columns[position]
    for position, pvalue in enumerate(fs.pvalues_)
}

# Keys sort by p-value first, so a single sort gives the list of all
# variables from smallest to largest p-value.
for pvalue, position in sorted(scaled_pvalues):
    print('{}: {}'.format(scaled_pvalues[(pvalue, position)], pvalue))

proportion_social_welfare: 3.305819271e-314
adjusted_income: 4.620093380549222e-269
hours_week_worked: 2.3759435328523007e-65
years_worked: 3.6405602596470017e-47
age: 4.5604098647039964e-29


In [43]:
# Weight the categorical columns.
#
# As with the numeric features, the survey weight and the 0/1 recoded target
# are attached before handing the frame to weighted_df (from my_functions).

categorical_columns = [
    'sex',
    'civil_status',
    'familial_status',
    'region',
    'population_density',
    'citizenship',
    'tenure_status',
    'education_level',
    'working_status',
    'occupation',
    'bad_health',
]

cat_var = data[categorical_columns].reset_index(drop=True)
cat_var = cat_var.assign(
    weight=data['weight'],
    material_deprivation=data['material_deprivation'].map({'Yes': 1, 'No': 0}),
)

weighted_cat_var = weighted_df(cat_var)

In [44]:
# Set the weighted target (dependent variable) aside and remove it from the
# categorical frame: it must not be one-hot encoded with the predictors.

weighted_material_deprivation = weighted_cat_var['material_deprivation'].copy()
weighted_cat_var = weighted_cat_var.drop(columns=['material_deprivation'])

In [55]:
# Build the value for OneHotEncoder's `categories` parameter: one list per
# column holding that column's distinct values in order of first appearance.

categories = [
    list(weighted_cat_var[column].unique())
    for column in weighted_cat_var.columns
]

for column_categories in categories:
    print(column_categories)

['Male', 'Female']
['Married', 'Divorced', 'Never married', 'Widowed', 'Separated', "Married 'de facto'"]
['No', 'Yes']
['Basque Country', 'Navarre', 'Castile–La Mancha', 'Valencian Community', 'Andalusia', 'Castile and Leon', 'Extremadura', 'Balearic Islands', 'Catalonia', 'Galicia', 'Aragon', 'Comunity of Madrid', 'La Rioja', 'Region of Murcia', 'Asturias', 'Canary Islands', 'Cantabria', 'Ceuta', 'Melilla']
['Thinly-populated area', 'Intermediate area', 'Densely-populated area']
['Spain', 'Spain (naturalized)', 'Other (outside EU)', 'Other (EU)']
['Outright owner', 'Tenancy at reduced rate', 'Owner paying mortgage', 'Tenancy at market rate', 'Free tenancy']
['Upper secondary education', 'Lower secondary education', 'Higher education', 'Primary education', 'Pre-primary education']
['Retired', 'Unpaid carer/domestic worker', 'Disabled/unfit to work', 'Unemployed', 'Student', 'Employed']
['Clerical Support Workers', 'Elementary Occupations', 'Services and Sales Workers', 'Professionals'

In [56]:
# One-hot encode the categorical frame, using the explicit category lists so
# the output columns follow the order in `categories`. The sparse result is
# converted to a dense array.

ohe = OneHotEncoder(categories=categories)
onehot_var = ohe.fit_transform(weighted_cat_var).toarray()

onehot_var

array([[1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 0., 1.],
       ...,
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.]])

In [37]:
# Chi-square test of every one-hot column against the weighted target.
# k='all' keeps every column; only the p-values are used afterwards.

X, y = onehot_var, weighted_material_deprivation

chi2_selector = SelectKBest(score_func=chi2, k='all')
ch2 = chi2_selector.fit(X, y)

In [76]:
# Name each encoded column "<original column>_<category>", with the category
# lower-cased and its spaces replaced by underscores. Names are produced in
# the same order as the encoder's output columns.

onehot_var_names = [
    column + '_' + value.lower().replace(' ', '_')
    for column, column_categories in zip(weighted_cat_var.columns, categories)
    for value in column_categories
]

onehot_var_names[:5]

['sex_male',
 'sex_female',
 'civil_status_married',
 'civil_status_divorced',
 'civil_status_never_married']

In [85]:
# As with the numeric variables, rank the encoded columns by p-value.

# Maps (p-value, column position) -> encoded column name. Keying on the bare
# p-value would silently drop a column whenever two columns share the same
# p-value, so the position is included to keep every column.
one_hot_pvalues = {
    (pvalue, position): onehot_var_names[position]
    for position, pvalue in enumerate(ch2.pvalues_)
}

# Keep only the columns significant at the 99.9% level (p-value < 0.001).
significance_threshold = 0.001
one_hot_pvalues_filtered = {
    key: name
    for key, name in one_hot_pvalues.items()
    if key[0] < significance_threshold
}

# List of all filtered columns from smallest to largest p-value; the keys
# sort by p-value first, so one sort is enough.
for pvalue, position in sorted(one_hot_pvalues_filtered):
    print('{}: {}'.format(one_hot_pvalues_filtered[(pvalue, position)], pvalue))

working_status_unemployed: 2.727955678038347e-173
citizenship_other_(outside_eu): 1.0139705248596816e-132
tenure_status_tenancy_at_market_rate: 2.3105180371697858e-127
tenure_status_tenancy_at_reduced_rate: 1.764439769952361e-80
tenure_status_outright_owner: 2.9771517382632965e-47
education_level_higher_education: 3.775663616756461e-42
civil_status_separated: 2.676768005874168e-40
occupation_elementary_occupations: 6.857205692560805e-36
citizenship_spain_(naturalized): 2.269266215402648e-35
working_status_employed: 3.282685508336681e-27
occupation_professionals: 7.648078221060657e-26
education_level_pre-primary_education: 5.750583787068679e-25
citizenship_spain: 1.3633267512711312e-24
working_status_disabled/unfit_to_work: 3.474451317250032e-22
working_status_retired: 1.0957312233932373e-20
occupation_clerical_support_workers: 2.027698232311139e-18
bad_health_yes: 8.584112049883995e-17
civil_status_married: 4.294148595131259e-15
region_andalusia: 2.858940305340912e-12
civil_status_neve

In [117]:
# Resulting dataframe after weighting and feature engineering: the one-hot
# columns (under their new names) side by side with the weighted numeric
# frame, which also carries the target column.
# NOTE(review): the two halves are joined on their row index, so this assumes
# weighted_df produced the same rows in the same order for both frames - confirm.

one_hot_df = pd.DataFrame(dict(zip(onehot_var_names, onehot_var.T)))

all_features = pd.concat([one_hot_df, weighted_scaled_var], axis=1)

all_features.head()

Unnamed: 0,sex_male,sex_female,civil_status_married,civil_status_divorced,civil_status_never_married,civil_status_widowed,civil_status_separated,civil_status_married_'de_facto',familial_status_no,familial_status_yes,...,occupation_food_preparation_assistants,"occupation_skilled_agricultural,_forestry_and_fishery_workers",bad_health_yes,bad_health_no,age,years_worked,hours_week_worked,adjusted_income,proportion_social_welfare,material_deprivation
0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.761194,0.723077,0.0,0.222241,0.0,0
1,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.731343,0.030769,0.0,0.222241,0.0,0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.791045,0.307692,0.0,0.258452,0.0,0
3,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.61194,0.538462,0.0,0.258452,0.0,0
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.522388,0.461538,0.0,0.124813,0.0,0


In [124]:
# Selected feature list: the encoded columns that passed the p-value filter,
# every scaled numeric variable, and finally the target column.

selected_cat_features = list(one_hot_pvalues_filtered.values())
selected_num_features = list(scaled_pvalues.values())
selected_features = selected_cat_features + selected_num_features + ['material_deprivation']

In [125]:
# Restrict the full feature table to the selected columns (target included).
selected_features_df = all_features.loc[:, selected_features]
selected_features_df.head()

Unnamed: 0,civil_status_married,civil_status_never_married,civil_status_separated,region_basque_country,region_castile–la_mancha,region_andalusia,region_castile_and_leon,region_cantabria,population_density_thinly-populated_area,citizenship_spain,...,occupation_non-defined,occupation_food_preparation_assistants,bad_health_yes,bad_health_no,age,years_worked,hours_week_worked,adjusted_income,proportion_social_welfare,material_deprivation
0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.761194,0.723077,0.0,0.222241,0.0,0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.731343,0.030769,0.0,0.222241,0.0,0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.791045,0.307692,0.0,0.258452,0.0,0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,1.0,0.61194,0.538462,0.0,0.258452,0.0,0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.522388,0.461538,0.0,0.124813,0.0,0


In [126]:
# Export the final table for modeling, without the row index.

output_path = 'to_model.csv'
selected_features_df.to_csv(output_path, index=False)