## Setup

In [6]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd 
import bamboolib as bam 
import plotly.express as px 
from scipy.stats import ttest_ind
DATA_PATH = 'dataset/clean/poverty_analysis.xlsx'




## Read Data

In [2]:
poverty_analysis_data = pd.read_excel(DATA_PATH)

In [3]:
poverty_analysis_data

Unnamed: 0,PROVINSI,unemployment_avg_2022,growth_ctz_20_22,poverty_pct_avg_2022,REGIONAL,ZONE
0,ACEH,6.07,1.43293,14.695,SUMATERA,WEST
1,SUMATERA UTARA,5.815,1.21401,8.375,SUMATERA,WEST
2,SUMATERA BARAT,6.225,1.091596,5.98,SUMATERA,WEST
3,RIAU,4.385,1.954455,6.81,SUMATERA,WEST
4,JAMBI,4.645,1.32859,7.66,SUMATERA,WEST
5,SUMATERA SELATAN,4.685,1.273289,11.925,SUMATERA,WEST
6,BENGKULU,3.49,1.397251,14.48,SUMATERA,WEST
7,LAMPUNG,4.415,1.065908,11.505,SUMATERA,WEST
8,KEP. BANGKA BELITUNG,4.475,1.52006,4.53,SUMATERA,WEST
9,KEP. RIAU,8.125,3.152867,6.135,SUMATERA,WEST


## Exploratory (First) Data Analysis

### Unemployment

In [10]:
poverty_analysis_data.columns

Index(['PROVINSI', 'unemployment_avg_2022', 'growth_ctz_20_22',
       'poverty_pct_avg_2022', 'REGIONAL', 'ZONE'],
      dtype='object')

In [24]:
#create scatter --> 
avg_unemployment_hist = px.histogram(poverty_analysis_data, 
                                     x="unemployment_avg_2022",
                                    template='seaborn')
avg_unemployment_hist.update_layout(title='Histogram of Average Unemployment from 34 Province in 2022')

In global level the unemployment rate in 2022 centered around 4 to 5 %

In [25]:
#create scatter --> 
avg_unemployment_zone= px.histogram(poverty_analysis_data, x="unemployment_avg_2022",
                                 facet_col='ZONE',color='ZONE',
                                   template='seaborn')
avg_unemployment_zone.update_layout(title='Average Unemployment in 2022 in 3 Zones')

However, if we conduct zoning the unemployment rate in west has higher center in 4 to 5% while others in 3 to 4%

### Population Growth

In [27]:
#create scatter --> 
pop_growth = px.histogram(poverty_analysis_data, 
                                     x="growth_ctz_20_22",
                                    template='seaborn')
pop_growth.update_layout(title='Histogram of Population Growth  from 34 Province in 2020 to 2022')

In global level the population growth  in 2022 centered around 1 to 2%

In [28]:
#create scatter --> 
pop_growth_zone = px.histogram(poverty_analysis_data, x="growth_ctz_20_22",
                                 facet_col='ZONE',color='ZONE',
                                   template='seaborn')
pop_growth_zone.update_layout(title='Population Growth  in 2020 to 2022 in 3 Zones')

   In three zones we observe only west zone centers around 1 to 1.5% the other in 1.5 to 2%

### Poverty Percentage

In [29]:
#create scatter --> 
poverty_ = px.histogram(poverty_analysis_data, 
                                     x="poverty_pct_avg_2022",
                                    template='seaborn')
poverty_.update_layout(title='Histogram of Poverty Percent from 34 Province in 2020 to 2022')

for 34 provinces the poverty percent mostly in 5 to 15 percent

In [30]:
#create scatter --> 
poverty_zone = px.histogram(poverty_analysis_data, x="poverty_pct_avg_2022",
                                 facet_col='ZONE',color='ZONE',
                                   template='seaborn')
poverty_zone.update_layout(title='Poverty Percent in  2022 in 3 Zones')

by zoning we have clear glimpse, west zone prone to center at 5 to 10 percent poverty , while middle zone is in 10 to 15% , and the east seems to offset quite far by having tail up to 25-30% poverty

## Statistical Test
Goal : Compare Mean of Two Samples (Different Zones) , to check if given mean has difference

sample size = 5 province from each region 



alpha = 0.05 



In [10]:
## Divide data into west middle and east 

west = poverty_analysis_data.loc[poverty_analysis_data['ZONE']=='WEST']
middle = poverty_analysis_data.loc[poverty_analysis_data['ZONE']=='MIDDLE']
east = poverty_analysis_data.loc[poverty_analysis_data['ZONE']=='EAST']


In [19]:
east.sample(5).PROVINSI.unique()

array(['SULAWESI SELATAN', 'GORONTALO', 'PAPUA', 'SULAWESI UTARA',
       'SULAWESI TENGAH'], dtype=object)

In [20]:
west.sample(5).PROVINSI.unique()

array(['KALIMANTAN SELATAN', 'SUMATERA BARAT', 'KALIMANTAN TIMUR',
       'KEP. BANGKA BELITUNG', 'BENGKULU'], dtype=object)

In [21]:
middle.sample(5).PROVINSI.unique()

array(['JAWA TENGAH', 'DKI JAKARTA', 'JAWA BARAT', 'BANTEN',
       'NUSA TENGGARA TIMUR'], dtype=object)

### Draw Sample from Each Region

In [23]:
sample_east = ['SULAWESI SELATAN', 'GORONTALO', 'PAPUA', 'SULAWESI UTARA',
       'SULAWESI TENGAH']
sample_west = ['KALIMANTAN SELATAN', 'SUMATERA BARAT', 'KALIMANTAN TIMUR',
       'KEP. BANGKA BELITUNG', 'BENGKULU']
sample_middle = ['JAWA TENGAH', 'DKI JAKARTA', 'JAWA BARAT', 'BANTEN',
       'NUSA TENGGARA TIMUR']

In [24]:
#calculate each mean 
west_filter = poverty_analysis_data['PROVINSI'].isin(sample_west)
east_filter = poverty_analysis_data['PROVINSI'].isin(sample_east)
middle_filter = poverty_analysis_data['PROVINSI'].isin(sample_middle)
west_mean = poverty_analysis_data.loc[west_filter]
east_mean = poverty_analysis_data.loc[east_filter]
middle_mean = poverty_analysis_data.loc[middle_filter]

### 1. West Indonesia vs East Indonesia 

Hypothesis : 
$$ H_0 : \mu_{west} = \mu_{east}$$
$$ H_1 : \mu_{west} \neq  \mu_{east}$$

In [32]:
col = 'growth_ctz_20_22'
result_west_east = ttest_ind(west_mean[col], east_mean[col],alternative='two-sided',
                random_state=20)

result_west_east

Ttest_indResult(statistic=1.30408781597145, pvalue=0.22847234226006846)

In [30]:
west_mean[col].mean()

1.3874343349690355

Hypothesis : 
$$ H_0 : \mu_{a} = \mu_{b}$$
$$ H_1 : \mu_{a} \neq  \mu_{b}$$

### 2. Middle Indonesia vs East Indonesia 

Hypothesis : 
$$ H_0 : \mu_{middle} = \mu_{east}$$
$$ H_1 : \mu_{middle} \neq  \mu_{east}$$

In [33]:
result_middle_east = ttest_ind(middle_mean[col], east_mean[col],alternative='two-sided',
                random_state=20)

result_middle_east

Ttest_indResult(statistic=0.0733200151548432, pvalue=0.9433515378732877)

### 3. West Indonesia vs Middle Indonesia

Hypothesis : 
$$ H_0 : \mu_{west} = \mu_{middle}$$
$$ H_1 : \mu_{west} \neq  \mu_{middle}$$



In [34]:
result_west_middle = ttest_ind(middle_mean[col], east_mean[col],alternative='two-sided',
                random_state=20)

result_middle_east

Ttest_indResult(statistic=0.0733200151548432, pvalue=0.9433515378732877)

## 4 Regression

In [2]:
#import ols for regression 
import statsmodels.api as sm

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\Fakhri Robi Aulia\AppData\Roaming\Python\Python38\site-packages\IPython\core\interactiveshell.py", line 3398, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\Fakhri Robi Aulia\AppData\Local\Temp\ipykernel_4180\3463688347.py", line 2, in <cell line: 2>
    import statsmodels.api as sm
  File "c:\users\fakhri robi aulia\appdata\local\programs\python\python38\lib\site-packages\statsmodels\api.py", line 73, in <module>
    from .__init__ import test
  File "c:\users\fakhri robi aulia\appdata\local\programs\python\python38\lib\site-packages\statsmodels\__init__.py", line 1, in <module>
    from statsmodels._version import __version__, __version_tuple__
ImportError: cannot import name '__version__' from 'statsmodels._version' (c:\users\fakhri robi aulia\appdata\local\programs\python\python38\lib\site-packages\statsmodels\_version.py)

During handling of the above exception, another exception occurred:

Tra

### Single Model

#### using unemployment_avg_2022 as predictor

In [4]:
poverty_analysis_data

Unnamed: 0,PROVINSI,unemployment_avg_2022,growth_ctz_20_22,poverty_pct_avg_2022,REGIONAL,ZONE
0,ACEH,6.07,1.43293,14.695,SUMATERA,WEST
1,SUMATERA UTARA,5.815,1.21401,8.375,SUMATERA,WEST
2,SUMATERA BARAT,6.225,1.091596,5.98,SUMATERA,WEST
3,RIAU,4.385,1.954455,6.81,SUMATERA,WEST
4,JAMBI,4.645,1.32859,7.66,SUMATERA,WEST
5,SUMATERA SELATAN,4.685,1.273289,11.925,SUMATERA,WEST
6,BENGKULU,3.49,1.397251,14.48,SUMATERA,WEST
7,LAMPUNG,4.415,1.065908,11.505,SUMATERA,WEST
8,KEP. BANGKA BELITUNG,4.475,1.52006,4.53,SUMATERA,WEST
9,KEP. RIAU,8.125,3.152867,6.135,SUMATERA,WEST
