In [3]:
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns


## Data Exploration

In [6]:
# Read the welfare dataset and discard every row that has a missing value.
data = pd.read_csv('data/data_welfare.csv').dropna()
print(data.shape)
print(data.columns)

(4999, 27)
Index([&#39;Sex&#39;, &#39;Age&#39;, &#39;Occupation&#39;, &#39;Education&#39;, &#39;Elderly&#39;, &#39;Disabled&#39;,
       &#39;Chronic_Patient&#39;, &#39;Self_Reliance&#39;, &#39;Province_ID&#39;, &#39;Amphur_ID&#39;,
       &#39;District_Type&#39;, &#39;Main&#39;, &#39;Accommodation_Own&#39;, &#39;Accommodation_Rent&#39;,
       &#39;Accommodation_Public&#39;, &#39;Accommodation_Other&#39;,
       &#39;Accommodation_Other_Name&#39;, &#39;Accommodation_Doc_Type&#39;, &#39;Occupation_Own&#39;,
       &#39;Occupation_Rent&#39;, &#39;Occupation_Public&#39;, &#39;Occupation_Other&#39;,
       &#39;Occupation_Other_Name&#39;, &#39;Occupation_Doc_Type&#39;, &#39;Happiness&#39;,
       &#39;Owner_Income&#39;, &#39;Label&#39;],
      dtype=&#39;object&#39;)


In [7]:
# Preview the first few rows to sanity-check how the columns were parsed.
data.head()

Unnamed: 0,Sex,Age,Occupation,Education,Elderly,Disabled,Chronic_Patient,Self_Reliance,Province_ID,Amphur_ID,...,Accommodation_Doc_Type,Occupation_Own,Occupation_Rent,Occupation_Public,Occupation_Other,Occupation_Other_Name,Occupation_Doc_Type,Happiness,Owner_Income,Label
0,ญ,43,เกษตร-ทำนา,"ประถมฯ (ป.4, ป.7, ป.6)",0,0,0,0,34,3407,...,"โฉนดที่ดิน (น.ส.4, น.ส.4 ก, น.ส.4 ข, น.ส.4 ค, ...",10,0,0,0,0,"โฉนดที่ดิน (น.ส.4, น.ส.4 ก, น.ส.4 ข, น.ส.4 ค, ...",10,100000.0,1
1,ญ,54,ธุรกิจส่วนตัว,อนุปริญญา หรือเทียบเท่า หรือ ปวส.,0,0,0,0,34,3401,...,"หนังสือรับรองการทำประโยชน์ (น.ส.3, น.ส.3 ก หรื...",0,0,0,0,0,0,10,100000.0,1
2,ช,22,รับจ้างทั่วไป,ม.ปลาย (มศ.4-5 หรือ ม.4-6 หรือ ปวช.),0,0,0,0,57,5701,...,"โฉนดที่ดิน (น.ส.4, น.ส.4 ก, น.ส.4 ข, น.ส.4 ค, ...",0,0,0,0,0,0,9,100000.0,1
3,ญ,49,รับจ้างทั่วไป,ม.ปลาย (มศ.4-5 หรือ ม.4-6 หรือ ปวช.),0,0,0,0,71,7105,...,0,0,0,0,0,0,0,7,100000.0,1
4,ช,45,เกษตร-ทำนา,"ประถมฯ (ป.4, ป.7, ป.6)",0,0,0,0,40,4019,...,แบบยื่นภาษีบำรุงท้องที่ (ภ.บ.ท.5),29,0,0,0,0,"หนังสือรับรองการทำประโยชน์ (น.ส.3, น.ส.3 ก หรื...",7,100000.0,1


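Check for missing values in each column. Rows containing missing values were already removed with `dropna()` when the data was loaded, so every count below is expected to be 0.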

In [8]:
# Count the missing values in each column.
data.isnull().sum()

Sex                         0
Age                         0
Occupation                  0
Education                   0
Elderly                     0
Disabled                    0
Chronic_Patient             0
Self_Reliance               0
Province_ID                 0
Amphur_ID                   0
District_Type               0
Main                        0
Accommodation_Own           0
Accommodation_Rent          0
Accommodation_Public        0
Accommodation_Other         0
Accommodation_Other_Name    0
Accommodation_Doc_Type      0
Occupation_Own              0
Occupation_Rent             0
Occupation_Public           0
Occupation_Other            0
Occupation_Other_Name       0
Occupation_Doc_Type         0
Happiness                   0
Owner_Income                0
Label                       0
dtype: int64

In [9]:
# Number of rows for each value of the target column, to check class balance.
data['Label'].value_counts()

0    2500
1    2499
Name: Label, dtype: int64

In [10]:
# Per-class mean of every numeric column.
# numeric_only=True is required because the frame still holds text columns
# (Sex, Occupation, Education, ...): pandas >= 2.0 raises a TypeError instead of
# silently dropping them.
data.groupby('Label').mean(numeric_only=True)

Unnamed: 0_level_0,Age,Elderly,Disabled,Chronic_Patient,Self_Reliance,Province_ID,Amphur_ID,District_Type,Main,Accommodation_Own,Accommodation_Rent,Accommodation_Public,Accommodation_Other,Occupation_Own,Occupation_Rent,Occupation_Public,Occupation_Other,Happiness,Owner_Income
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,48.2208,0.0,0.0,0.0,0.0,47.906,4796.1432,5.3084,394595.8704,0.672,0.0024,0.002,0.0172,5.068,0.8676,0.0028,0.0568,8.27,236748.7344
1,54.136855,0.0,0.0,0.0,0.0,50.763505,5082.332533,5.569428,179911.078031,0.543017,0.012005,0.003201,0.016407,4.756703,0.72429,0.061625,0.19928,8.113645,81374.855819


ผลลัพธ์ข้างต้นชี้ให้เห็นว่า คอลัมน์ Elderly, Disabled, Chronic_Patient และ Self_Reliance มีค่าเฉลี่ยเป็น 0 ทั้งในกลุ่ม Label = 0 และ Label = 1 จึงไม่ช่วยแยกกลุ่มผู้ได้รับสวัสดิการครัวเรือน และจะไม่ถูกนำไปใช้เป็นตัวแปรในโมเดล

In [11]:
# Mean of every numeric column (including Label) for each education level.
# numeric_only=True is required because the frame still holds text columns:
# pandas >= 2.0 raises a TypeError instead of silently dropping them.
data.groupby('Education').mean(numeric_only=True)

Unnamed: 0_level_0,Age,Elderly,Disabled,Chronic_Patient,Self_Reliance,Province_ID,Amphur_ID,District_Type,Main,Accommodation_Own,Accommodation_Rent,Accommodation_Public,Accommodation_Other,Occupation_Own,Occupation_Rent,Occupation_Public,Occupation_Other,Happiness,Owner_Income,Label
Education,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
"ต่ำกว่าชั้นประถมฯ (ป.4, ป.7, ป.6)",62.5,0.0,0.0,0.0,0.0,52.972222,5302.861111,5.462963,303987.12963,0.546296,0.009259,0.0,0.018519,7.694444,4.175926,0.0,0.0,7.981481,164989.175926,0.731481
ป.ตรี หรือเทียบเท่า,45.610478,0.0,0.0,0.0,0.0,40.876993,4092.940774,5.143508,411908.187927,0.462415,0.0,0.001139,0.001139,3.061503,0.189066,0.0,0.035308,8.307517,271120.457099,0.036446
"ประถมฯ (ป.4, ป.7, ป.6)",56.388614,0.0,0.0,0.0,0.0,52.730198,5279.023927,5.563531,250557.452145,0.681931,0.014439,0.002475,0.023927,6.074257,0.79538,0.044967,0.166254,8.129125,112666.759872,0.739274
ม.ต้น (มศ.1-3 หรือ ม.1-3),45.498952,0.0,0.0,0.0,0.0,55.861635,5592.450734,5.477987,221935.161426,0.872117,0.0,0.002096,0.010482,5.631027,1.9413,0.0,0.1174,8.12369,131310.621244,0.57652
ม.ปลาย (มศ.4-5 หรือ ม.4-6 หรือ ปวช.),45.67128,0.0,0.0,0.0,0.0,49.83564,4989.224913,5.323529,272908.650519,0.513841,0.0,0.0,0.010381,4.252595,0.716263,0.022491,0.100346,8.219723,160230.713215,0.349481
สูงกว่าปริญญาตรี,49.339286,0.0,0.0,0.0,0.0,46.044643,4609.651786,4.830357,520362.767857,0.321429,0.0,0.044643,0.0,3.178571,0.0,0.044643,0.0,8.696429,306125.035714,0.0
อนุบาล/ศูนย์เด็กเล็ก,73.0,0.0,0.0,0.0,0.0,20.0,2007.0,6.0,168000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,84000.0,1.0
อนุปริญญา หรือเทียบเท่า หรือ ปวส.,39.03,0.0,0.0,0.0,0.0,30.773333,3081.986667,5.553333,279428.783333,0.183333,0.0,0.0,0.003333,0.97,0.12,0.016667,0.006667,8.336667,210987.153333,0.086667
ไม่เคยศึกษา,57.429752,0.0,0.0,0.0,0.0,60.61157,6067.661157,5.735537,234451.735537,0.950413,0.0,0.0,0.090909,4.322314,0.479339,0.239669,0.743802,8.115702,110298.461629,0.760331


## Categorizing Columns

In [12]:
# Turn each text column into a pandas categorical and store its integer codes
# in a companion "<column>_Code" column. The order below fixes the order in
# which the new columns are appended to the frame.
for column in ['Sex', 'Occupation', 'Education', 'Occupation_Doc_Type', 'Accommodation_Doc_Type']:
    data[column] = data[column].astype('category')
    data[column + '_Code'] = data[column].cat.codes

# Feature columns used for modelling. Elderly, Disabled, Chronic_Patient and
# Self_Reliance are not included.
parameters = [
    'Sex_Code', 'Age', 'Occupation_Code', 'Education_Code',
    'Province_ID', 'Amphur_ID', 'District_Type', 'Main',
    'Accommodation_Own', 'Accommodation_Rent', 'Accommodation_Public', 'Accommodation_Other',
    'Accommodation_Doc_Type_Code',
    'Occupation_Own', 'Occupation_Rent', 'Occupation_Public', 'Occupation_Other',
    'Occupation_Doc_Type_Code',
    'Happiness', 'Owner_Income',
]

# Feature matrix and target; the target is kept as a one-column DataFrame.
adjusted_data = data[parameters]
adjusted_label = data[['Label']]

print(adjusted_data.columns.values)
adjusted_data.head()


[&#39;Sex_Code&#39; &#39;Age&#39; &#39;Occupation_Code&#39; &#39;Education_Code&#39; &#39;Province_ID&#39;
 &#39;Amphur_ID&#39; &#39;District_Type&#39; &#39;Main&#39; &#39;Accommodation_Own&#39;
 &#39;Accommodation_Rent&#39; &#39;Accommodation_Public&#39; &#39;Accommodation_Other&#39;
 &#39;Accommodation_Doc_Type_Code&#39; &#39;Occupation_Own&#39; &#39;Occupation_Rent&#39;
 &#39;Occupation_Public&#39; &#39;Occupation_Other&#39; &#39;Occupation_Doc_Type_Code&#39;
 &#39;Happiness&#39; &#39;Owner_Income&#39;]


Unnamed: 0,Sex_Code,Age,Occupation_Code,Education_Code,Province_ID,Amphur_ID,District_Type,Main,Accommodation_Own,Accommodation_Rent,Accommodation_Public,Accommodation_Other,Accommodation_Doc_Type_Code,Occupation_Own,Occupation_Rent,Occupation_Public,Occupation_Other,Occupation_Doc_Type_Code,Happiness,Owner_Income
0,1,43,8,2,34,3407,6,100000,0,0,0,0,8,10,0,0,0,8,10,100000.0
1,1,54,2,7,34,3401,3,300000,0,0,0,0,2,0,0,0,0,0,10,100000.0
2,0,22,6,4,57,5701,6,100000,1,0,0,0,8,0,0,0,0,0,9,100000.0
3,1,49,6,4,71,7105,5,100000,0,0,0,0,0,0,0,0,0,0,7,100000.0
4,0,45,8,2,40,4019,5,200000,0,0,0,0,6,29,0,0,0,2,7,100000.0


In [13]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
train_data, test_data, train_label, test_label = train_test_split(
    adjusted_data,
    adjusted_label,
    test_size=0.25,
    random_state=1972)

# Recursive feature elimination with a logistic-regression estimator, keeping two features.
# n_features_to_select is passed by keyword: scikit-learn >= 1.0 rejects it as a
# positional argument with a TypeError.
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=2)
# train_label is a one-column DataFrame; flatten it to the 1-D target scikit-learn
# expects so the fit does not emit a DataConversionWarning.
rfe = rfe.fit(train_data, train_label.values.ravel())
print(rfe.support_)   # boolean mask of the selected features
print(rfe.ranking_)   # rank per feature; selected features have rank 1

[False False False False False False  True False False  True False False
 False False False False False False False False]
[ 5 13  4  6 16 17  1 19 11  1  3  7 10 14 15  2  8 12  9 18]


In [14]:
# Columns used to fit the statsmodels logit below. The list currently names
# every entry of `parameters`, in the same order; trim it here to refit on a subset.
impacted_param = [
    'Sex_Code',
    'Age',
    'Occupation_Code',
    'Education_Code',
    'Province_ID',
    'Amphur_ID',
    'District_Type',
    'Main',
    'Accommodation_Own',
    'Accommodation_Rent',
    'Accommodation_Public',
    'Accommodation_Other',
    'Accommodation_Doc_Type_Code',
    'Occupation_Own',
    'Occupation_Rent',
    'Occupation_Public',
    'Occupation_Other',
    'Occupation_Doc_Type_Code',
    'Happiness',
    'Owner_Income',
]

# Training design matrix and target taken from the train/test split.
X = train_data[impacted_param]
y = train_label

In [16]:
import statsmodels.api as sm

# Fit a logistic regression of y on the columns of X and print the coefficient table.
# NOTE(review): X is passed as-is and contains no constant column, so the model is
# fitted without an intercept term; consider sm.add_constant(X) — confirm this is intended.
logit_model = sm.Logit(y, X)
result = logit_model.fit()
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.042608
         Iterations 15
                               Results: Logit
Model:                    Logit                Pseudo R-squared:     0.939   
Dependent Variable:       Label                AIC:                  359.4762
Date:                     2020-10-06 22:24     BIC:                  484.0611
No. Observations:         3749                 Log-Likelihood:       -159.74 
Df Model:                 19                   LL-Null:              -2598.5 
Df Residuals:             3729                 LLR p-value:          0.0000  
Converged:                1.0000               Scale:                1.0000  
No. Iterations:           15.0000                                            
-----------------------------------------------------------------------------
                             Coef.  Std.Err.    z     P&gt;|z|   [0.025   0.975]
--------------------------------------------------------------------