#  Average Household Income in Thailand
## Import required libraries

In [1]:
import pandas as pd
import numpy as np
import plotly.plotly as py
import plotly.graph_objs as go
import feather
import re

pd.options.display.max_columns = 999
pd.options.display.max_rows = 999

from IPython.core.display import display
from requests import get
from bs4 import BeautifulSoup

from plotly.offline import init_notebook_mode, iplot

init_notebook_mode(connected=True)

## Load & Clean Data

### 1. ข้อมูลรายได้เฉลี่ยต่อครัวเรือนแต่ละจังหวัด

In [2]:
# Data source: https://data.go.th/default.aspx
df = pd.read_excel('./data/รายได้เฉลี่ยต่อเดือนต่อครัวเรือน 41-58.xls')

# Sheet row 2, columns 2-11 hold the year labels; cast them to int and use
# them as column names next to the region/province name column.
name_col = 'ภาค และจังหวัด'
year_labels = df.iloc[2, 2:12].values.tolist()
col_names = [name_col] + [int(label) for label in year_labels]

# Keep the name column (0) and the ten yearly columns (2-11) of sheet rows 5-87.
kept_columns = [0] + list(range(2, 12))
df_cut = pd.DataFrame(df.iloc[5:88, kept_columns].values, columns=col_names)

# Discard every row whose name contains 'ภาค'
# (presumably the regional subtotal rows -- confirm against the sheet).
is_kept = df_cut[name_col].apply(lambda label: 'ภาค' not in label)
df_cut = df_cut[is_kept]

# Remove the rows still labelled 0 and 1, then renumber the index from zero.
# NOTE(review): these look like aggregate rows rather than provinces -- confirm.
df_cut = df_cut.drop(index=[0, 1]).reset_index(drop=True)

# Strip all whitespace from the names; they are the join key for the
# coordinate table later on.
df_cut[name_col] = df_cut[name_col].apply(lambda label: re.sub(r'\s+', '', label))

### 2. ข้อมูล Lat Lon แต่ละจังหวัด
จากเว็บไซต์ [https://tha.timegenie.com/latitude_longitude/country/th](https://tha.timegenie.com/latitude_longitude/country/th)

![](docs/household-income-2.png) 

In [3]:
url = 'https://tha.timegenie.com/latitude_longitude/country/th'

# Bound the request so a stalled server cannot hang the notebook, and fail
# loudly on an HTTP error instead of parsing an error page as coordinates.
response = get(url, timeout=30)
response.raise_for_status()

html_soup = BeautifulSoup(response.text, 'html.parser')

# Latitude values and province names are read from the <h5> tags; the last 11
# tags are discarded (presumably page chrome -- confirm against the page).
lat = html_soup.find_all('h5')
lat = lat[:-11]

# Longitude values are read from the <span> tags; the first tag and the last
# 9 tags are discarded (presumably page chrome -- confirm against the page).
lon = html_soup.find_all('span')
lon = lon[1:-9]

# Reduce the tags to their text and drop entries containing "N" / "E"
# (presumably the hemisphere-suffixed duplicates of each coordinate).
lat = [tag.text for tag in lat]
lat = [text for text in lat if "N" not in text]

lon = [tag.text for tag in lon]
lon = [text for text in lon if "E" not in text]

# Keep only the entries that look numeric.
# NOTE(review): the '.' in the pattern is unescaped, so it matches any
# character; left as-is because tightening it could change which entries
# are kept.
lat_num = [text for text in lat if re.search(r'\d+.\d+', text)]

lon_num = [text for text in lon if re.search(r'\d+.\d+', text)]

# Province names are the digit-free <h5> texts, minus the first such entry.
prov = [text for text in lat if not re.search(r'\d+', text)][1:]

# Sanity check: the three lists must have equal length to be zipped into a table.
print(len(lat_num), len(lon_num), len(prov))

87 87 87


In [4]:
# Assemble the scraped names and coordinates column-wise, then preview the table.
coordinate_columns = {'province': prov, 'lat': lat_num, 'lon': lon_num}
df_coor = pd.DataFrame(coordinate_columns)
df_coor.head()

Unnamed: 0,lat,lon,province
0,13.72917,100.52389,กรุงเทพมหานคร
1,13.59556,100.60722,สมุทรปราการ
2,17.53917,102.78444,อุดรธานี
3,13.85083,100.52222,นนทบุรี
4,13.36222,100.98333,ชลบุรี


### 3. Merge ข้อมูลจากข้อ 1 & 2

In [5]:
# Left join: every income row is kept and receives lat/lon wherever its name
# equals a scraped province name.
df_all = pd.merge(
    df_cut,
    df_coor,
    how='left',
    left_on='ภาค และจังหวัด',
    right_on='province',
)

# The right-hand join key is not needed once the rows are matched.
df_all = df_all.drop(columns='province')

# The year column labels are ints; store every column label as a string.
df_all.columns = list(map(str, df_all.columns))
df_all.head()

Unnamed: 0,ภาค และจังหวัด,2541,2543,2545,2547,2549,2550,2552,2554,2556,2558,lat,lon
0,กรุงเทพมหานคร,26054,26909,29589,29843,36658,39020,42380,48951.0,49190.8,45571.7,13.72917,100.52389
1,สมุทรปราการ,18100,15745,19680,19946,20382,21302,23359,23797.9,29575.3,25457.2,13.59556,100.60722
2,นนทบุรี,24211,24566,29119,26658,31152,32743,34626,35119.7,30663.6,36884.0,13.85083,100.52222
3,ปทุมธานี,21793,19282,22838,21530,25143,26107,26686,21615.5,33461.3,41056.9,14.05,100.48333
4,พระนครศรีอยุธยา,12918,14904,13319,14980,19676,21676,25820,22301.7,26481.5,28379.4,14.35361,100.56917


### 4. Save & load from a checkpoint

In [6]:
# Optional checkpoint (disabled). Uncomment the first statement to write df_all
# to the feather file 'raw-df_all'; uncomment the remaining statements to read
# it back, drop rows containing missing values, and preview the result.
# df_all.to_feather('raw-df_all')

# df_all = feather.read_dataframe('raw-df_all')
# df_all.dropna(inplace=True)
# df_all.head()

Unnamed: 0,ภาค และจังหวัด,2541,2543,2545,2547,2549,2550,2552,2554,2556,2558,lat,lon
0,กรุงเทพมหานคร,26054.0,26909.0,29589.0,29843.0,36658.0,39020.0,42380.0,48951.0,49190.8,45571.7,13.72917,100.52389
1,สมุทรปราการ,18100.0,15745.0,19680.0,19946.0,20382.0,21302.0,23359.0,23797.9,29575.3,25457.2,13.59556,100.60722
2,นนทบุรี,24211.0,24566.0,29119.0,26658.0,31152.0,32743.0,34626.0,35119.7,30663.6,36884.0,13.85083,100.52222
3,ปทุมธานี,21793.0,19282.0,22838.0,21530.0,25143.0,26107.0,26686.0,21615.5,33461.3,41056.9,14.05,100.48333
4,พระนครศรีอยุธยา,12918.0,14904.0,13319.0,14980.0,19676.0,21676.0,25820.0,22301.7,26481.5,28379.4,14.35361,100.56917


### 5. จัดรูปข้อมูลใหม่ ก่อนเข้า plotly

In [7]:
# Year columns of df_all, addressed by their string labels.
years = [str(y) for y in (2541, 2543, 2545, 2547, 2549, 2550, 2552, 2554, 2556, 2558)]

# Wide -> long: every row of df_all contributes one record per year column,
# carrying the row's name and coordinates along with that year's income.
prov, yr, inc = [], [], []
lat, lon = [], []
for row_label, record in df_all.iterrows():
    n_years = len(years)
    prov.extend([record['ภาค และจังหวัด']] * n_years)
    yr.extend(years)
    lat.extend([record['lat']] * n_years)
    lon.extend([record['lon']] * n_years)
    inc.extend(record[y] for y in years)

# The year labels form the index, so one year's rows can be selected with .loc[year].
long_columns = {'prov': prov, 'lat': lat, 'lon': lon, 'inc': inc}
df_all_1 = pd.DataFrame(long_columns, index=yr)
df_all_1.head(10)

Unnamed: 0,inc,lat,lon,prov
2541,26054.0,13.72917,100.52389,กรุงเทพมหานคร
2543,26909.0,13.72917,100.52389,กรุงเทพมหานคร
2545,29589.0,13.72917,100.52389,กรุงเทพมหานคร
2547,29843.0,13.72917,100.52389,กรุงเทพมหานคร
2549,36658.0,13.72917,100.52389,กรุงเทพมหานคร
2550,39020.0,13.72917,100.52389,กรุงเทพมหานคร
2552,42380.0,13.72917,100.52389,กรุงเทพมหานคร
2554,48951.0,13.72917,100.52389,กรุงเทพมหานคร
2556,49190.8,13.72917,100.52389,กรุงเทพมหานคร
2558,45571.7,13.72917,100.52389,กรุงเทพมหานคร


### 6. สร้าง scatter plot รายได้แต่ละจังหวัด ในแต่ละปี บนแผนที่ด้วย plotly

สร้าง list ของแต่ละ legend (ใช้ช่วงรายได้) และ แต่ละปี

In [8]:
# String labels of the years; they select rows of df_all_1 and name the
# animation frames and slider steps.
years = list(map(str, (2541, 2543, 2545, 2547, 2549, 2550, 2552, 2554, 2556, 2558)))

# Five consecutive 10,000-wide income brackets from 0 to 50,000. Each bracket
# excludes its left edge and includes its right edge.
_edges = [0, 1e4, 2e4, 3e4, 4e4, 5e4]
bins = pd.IntervalIndex.from_tuples(list(zip(_edges[:-1], _edges[1:])))


def get_income_range(x):
    """Return the label of the income bracket in ``bins`` that contains ``x``.

    The label has the form '<left> - <right>' with both edges written as
    integers, e.g. '20000 - 30000'. Returns None when ``x`` falls in none of
    the brackets.
    """
    bracket = next((candidate for candidate in bins if x in candidate), None)
    if bracket is None:
        return None
    return '{} - {}'.format(int(bracket.left), int(bracket.right))

# Tag every record with the label of the income bracket it falls into.
df_all_1['income_range'] = df_all_1['inc'].map(get_income_range)

# Distinct bracket labels, sorted as strings; each one becomes a legend entry.
distinct_labels = df_all_1['income_range'].unique()
income_range = sorted(distinct_labels)
income_range

['0 - 10000',
 '10000 - 20000',
 '20000 - 30000',
 '30000 - 40000',
 '40000 - 50000']

In [9]:
# Preview the first rows of the long table, now including the income_range column.
df_all_1.head()

Unnamed: 0,inc,lat,lon,prov,income_range
2541,26054.0,13.72917,100.52389,กรุงเทพมหานคร,20000 - 30000
2543,26909.0,13.72917,100.52389,กรุงเทพมหานคร,20000 - 30000
2545,29589.0,13.72917,100.52389,กรุงเทพมหานคร,20000 - 30000
2547,29843.0,13.72917,100.52389,กรุงเทพมหานคร,20000 - 30000
2549,36658.0,13.72917,100.52389,กรุงเทพมหานคร,30000 - 40000


### Visualization

![](docs/household-income.gif) 

In [10]:
# Figure skeleton: the initial traces ('data') and the animation 'frames' are
# appended further down; 'layout' is filled in here.
figure = {
    'data': [],
    'layout': {},
    'frames': [],
}

# Static layout: title, hover behaviour and the map viewport.
figure['layout']['title'] = "Thailand's income by province"
figure['layout']['hovermode'] = 'closest'
figure['layout']['geo'] = dict(
    scope='asia',
    showland=True,
    landcolor='rgb(217, 217, 217)',
    subunitwidth=1,
    countrywidth=1,
    subunitcolor="rgb(255, 255, 255)",
    countrycolor="rgb(255, 255, 255)",
    # Visible window of the map, in degrees of longitude / latitude.
    lonaxis=dict(range=[90.0, 110.0]),
    lataxis=dict(range=[5.0, 21.0]),
)
# No placeholder is stored in figure['layout']['sliders'] here: the slider is
# described by sliders_dict and attached to the layout once its steps exist,
# so an earlier assignment would only be overwritten.

# Play / Pause buttons that drive the animation.
figure['layout']['updatemenus'] = [
    {
        'buttons': [
            {
                # None as the frame list plays through all frames;
                # 'quad-in-out' is the plotly.js name for quadratic easing.
                'args': [None, {'frame': {'duration': 500, 'redraw': False},
                                'fromcurrent': True,
                                'transition': {'duration': 300, 'easing': 'quad-in-out'}}],
                'label': 'Play',
                'method': 'animate'
            },
            {
                # Animating to [None] with zero durations halts the running animation.
                'args': [[None], {'frame': {'duration': 0, 'redraw': False},
                                  'mode': 'immediate',
                                  'transition': {'duration': 0}}],
                'label': 'Pause',
                'method': 'animate'
            }
        ],
        'direction': 'left',
        'pad': {'r': 10, 't': 87},
        'showactive': False,
        'type': 'buttons',
        'x': 0.1,
        'xanchor': 'right',
        'y': 0,
        'yanchor': 'top'
    }
]

# Year slider; one entry per year is appended to 'steps' while the frames are built.
sliders_dict = {
    'active': 0,  # the first step is selected initially
    'yanchor': 'top',
    'xanchor': 'left',
    'currentvalue': {
        'font': {'size': 20},
        'prefix': 'Year:',
        'visible': True,
        'xanchor': 'right'
    },
    'transition': {'duration': 300, 'easing': 'cubic-in-out'},
    'pad': {'b': 10, 't': 50},
    'len': 0.9,
    'x': 0.1,
    'y': 0,
    'steps': []
}

def _make_trace(year_rows, income_label, **extra):
    """Build one scattergeo trace for the rows of a single income range.

    Parameters:
        year_rows: rows of df_all_1 belonging to one year.
        income_label: value of the 'income_range' column to keep; also used
            as the trace (legend) name.
        **extra: additional trace attributes merged into the result.

    Returns:
        A dict describing the trace, one marker per remaining row.
    """
    rows = year_rows[year_rows['income_range'] == income_label]
    trace = {
        'type': 'scattergeo',
        'lon': list(rows['lon']),
        'lat': list(rows['lat']),
        'mode': 'markers',
        'text': list(rows['prov']),
        'marker': {
            'sizemode': 'area',
            # Marker area grows by 3 units per 1,000 of income.
            'size': list(rows['inc'] / 1e3 * 3)
        },
        'name': income_label
    }
    trace.update(extra)
    return trace


# make data: the initial view shows the first year, i.e. the first slider step
year = years[0]
dataset_by_year = df_all_1.loc[year]
for ra in income_range:
    figure['data'].append(
        _make_trace(dataset_by_year, ra, locationmode='country names'))

# make frames: one animation frame and one slider step per year
for year in years:
    # Slice the year once instead of once per income range.
    dataset_by_year = df_all_1.loc[year]
    frame = {
        'data': [_make_trace(dataset_by_year, ra) for ra in income_range],
        'name': str(year)
    }
    figure['frames'].append(frame)

    # Selecting this step jumps to the frame named after the year.
    slider_step = {
        'args': [
            [year],
            {'frame': {'duration': 300, 'redraw': False},
             'mode': 'immediate',
             'transition': {'duration': 300}}
        ],
        'label': year,
        'method': 'animate'
    }
    sliders_dict['steps'].append(slider_step)

# Attach the completed slider to the layout and render the figure.
figure['layout']['sliders'] = [sliders_dict]

iplot(figure)

### 7. ลอง visualize mean & standard deviation ของรายได้แต่ละปี

In [11]:
# Income column of each year, in the order given by `years`.
income_by_year = [df_all_1.loc[label]['inc'] for label in years]

# NOTE(review): np.std is applied to a pandas Series here, which presumably
# yields the sample standard deviation (ddof=1) rather than NumPy's
# population default -- confirm which one is intended.
mean = [np.mean(incomes) for incomes in income_by_year]
std = [np.std(incomes) for incomes in income_by_year]

# One summary row per year (this rebinds `df`, replacing the raw Excel frame).
df = pd.DataFrame({'year': years, 'mean income': mean, 'std dev': std})

![](docs/household-income-3.png) 

In [12]:
# Bar chart of the mean income per year; the error bars take their lengths
# from the 'std dev' column.
error_bars = dict(type='data', array=df['std dev'], visible=True)
trace1 = go.Bar(x=df['year'], y=df['mean income'], error_y=error_bars)

data = [trace1]
layout = go.Layout(title="Thailand's mean income by year", barmode='group')

# Rendered through plotly.plotly (`py`) under the file name 'error-bar-bar',
# unlike the map, which is drawn with the offline iplot.
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='error-bar-bar')