# EDA kernel - Costa Rica poverty



![title](img/costa.jpg)

Many social programs have a hard time making sure the right people are given enough aid. It's especially tricky when a program focuses on the poorest segment of the population. The world's poorest typically can't provide the necessary income and expense records to prove that they qualify. Beyond Costa Rica, many countries face this same problem of inaccurately assessing social need.

A popular algorithm called the Proxy Means Test (or PMT) is used to verify income qualification. It considers a family's observable household attributes, like the material of their walls and ceiling or the assets found in the home, to classify them and predict their level of need. This is an improvement, but accuracy remains a problem as the region's population grows and poverty declines.

In [None]:
import pandas as pd
import seaborn as sns
import numpy
import missingno as msno
import matplotlib.pyplot as plt
import squarify
import plotly.offline as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

from plotly import tools

import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [None]:
# Load the competition train/test tables. read_csv already returns fresh,
# independent DataFrames, so the former self-copies (train=train.copy()) were
# no-ops that only duplicated the data in memory for a moment.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")


In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Print pandas' summary of the training frame.
train.info()

In [None]:
# Show the (rows, columns) dimensions of the training frame.
train.shape


The dataset contains a large set of features (143). We must do some feature engineering to achieve our
task.

# Missing values

In [None]:
# Percentage of missing values per column, sorted from most to least missing.
pct_col = "percent of missing values"
missing_pct = 100 * train.isnull().sum() / len(train)
missing = pd.DataFrame(missing_pct, columns=[pct_col]).sort_values(by=pct_col, ascending=False)

# Only the three columns with the largest share of missing values are drawn.
top_missing = missing.head(3)
plt.figure(figsize=(7, 6))
plt.style.use("fivethirtyeight")
sns.barplot(x=top_missing.index, y=top_missing[pct_col])
plt.show()

We will inspect the columns.  
rez_esc indicates the number of years behind in school.  

v18q indicates whether he/she owns a tablet, so NaN in the related v18q1 column means 0.


In [None]:
# Number of rows whose `escolari` value is not null.
len(train[train['escolari'].notnull()])
# NOTE(review): the surrounding text discusses `rez_esc`; the matching check
# below is disabled - confirm which column was meant to be inspected.
#len(train[train['rez_esc'].isnull()])

Now for v2a1, which indicates the rent of their house.
The empty entries might mean that they own the house and pay no rent. We will check anyway.

In [None]:
# Report both sides of the v2a1 column. The message used to say "missing" while
# the value printed was the notnull() count; isnull() is the missing count.
print("Number of missing values in v2a1 is ",train['v2a1'].isnull().sum())
# The non-missing count is the figure to compare against the number of
# rent-paying households.
print("Number of non-missing values in v2a1 is ",train['v2a1'].notnull().sum())

In [None]:
# Rows flagged tipovivi3 or tipovivi2. The house-condition chart in this
# notebook labels tipovivi2 as 'installement paying' and tipovivi3 as 'rented',
# so these are the households that make a payment - not the "owned and
# precarious" ones the message used to claim.
pays_for_home = (train['tipovivi3']==1) | (train['tipovivi2']==1)
print("Number of rented and installment-paying houses is",len(train[pays_for_home]))

So filling those positions with 0 is accurate, as there
is no rent for owned houses.

In [None]:
# Replace NaN with 0 in the three columns inspected above.
for col in ('v2a1', 'v18q1', 'rez_esc'):
    train[col] = train[col].fillna(0)


# Visualization

In [None]:
# Number of rows per Target class; reused by later cells as the denominator
# when turning per-class sums into percentages.
Target = train.groupby('Target')['Id'].count()

# Display labels for the four Target classes, in index order.
x1 = [
    'extreme poverty',
    'moderate poverty',
    'vulnerable households ',
    'non vulnerable households ',
]

fig = go.Figure(
    data=[go.Bar(x=x1, y=Target)],
    layout=go.Layout(title="Population", yaxis=dict(title='Number of people')),
)
py.iplot(fig)

# What do they own?

In [None]:
# Percentage of rows in each Target class with each ownership flag set.
owned_cols = ['refrig', 'v18q', 'television', 'mobilephone']
level = train.groupby('Target')[owned_cols].agg('sum')
level = level.div(Target, axis="rows") * 100

class_labels = ['extreme poverty', 'moderate poverty',
                'vulnerable households ', 'non vulnerable households ']

# (column, legend name) pairs in the order the traces are drawn.
items = [
    ('refrig', 'refrigerator'),
    ('mobilephone', 'mobiles'),
    ('television', 'Television'),
    ('v18q', 'tablets'),
]
data = [go.Bar(x=class_labels, y=level[col], name=label) for col, label in items]

# The four items are not mutually exclusive, so stacking their percentages
# yields bar heights far above 100% on an axis titled "percent of users".
# Side-by-side bars keep every value readable against that axis.
layout = go.Layout(barmode='group', yaxis=dict(title="percent of users"), title="what do they own")

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

In [None]:
%matplotlib notebook
avg=train.groupby('Target')[['hogar_total','r4h3','r4m3','hogar_adul']].agg("mean")
fig,axes=plt.subplots(nrows=2,ncols=2,sharex=True)
sns.barplot(x=avg.index,
           y=avg['hogar_total'],ax=axes[0][0])


sns.barplot(x=avg.index,
           y=avg['r4h3'],ax=axes[0][1])

sns.barplot(x=avg.index,
           y=avg['r4m3'],ax=axes[1][0])

sns.barplot(x=avg.index,
           y=avg['hogar_adul'],ax=axes[1][1])

plt.show()

In [None]:
# Per-class totals of the age-group head-count columns.
per_12 = train.groupby('Target')[['r4h2', 'r4m2', 'r4t2', 'r4t1']].agg('sum')

class_labels = ['extreme poverty', 'moderate poverty',
                'vulnerable households ', 'non vulnerable households ']

# (column, legend name) pairs in drawing order.
age_groups = [
    ('r4t2', "persons 12 years of age and older"),
    ('r4m2', "Females 12 years of age and older"),
    ('r4h2', "Males 12 years of age and older"),
    ('r4t1', " persons younger than 12 years of age"),
]
data = [go.Bar(x=class_labels, y=per_12[col], name=label) for col, label in age_groups]

# "persons 12 years of age and older" is the total of the male and female
# series, so stacking all of them counted those people twice in the bar
# height. Grouped bars show the total next to its parts instead.
layout = go.Layout(barmode="group", yaxis=dict(title="number of people"))

fig = go.Figure(layout=layout, data=data)
py.iplot(fig)

In [None]:
# Percentage of rows per Target class for each water-provision flag.
water_cols = ['abastaguadentro', 'abastaguafuera', 'abastaguano']
water = train.groupby("Target")[water_cols].sum().div(Target, axis="rows") * 100

class_labels = ['extreme poverty', 'moderate poverty',
                'vulnerable households ', 'non vulnerable households ']
legend_names = [
    "water provision inside dwelling",
    'water provision outside dwelling',
    'no water provision',
]

data = [
    go.Bar(x=class_labels, y=water[col], name=label)
    for col, label in zip(water_cols, legend_names)
]
layout = go.Layout(barmode="stack", title="water provision", yaxis=dict(title="percent of users"))

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)



## How are their bathrooms?

In [None]:
# Percentage of rows per Target class for each toilet-type flag.
toilet_cols = ['sanitario1', 'sanitario2', 'sanitario3', 'sanitario6', 'sanitario5']
# The aggregation is passed by name, as in every other cell of this notebook.
# Handing pandas the builtin `sum` is deprecated in recent releases (it emits
# a FutureWarning asking for the string "sum" instead).
toilet = train.groupby('Target')[toilet_cols].agg("sum")
toilet = toilet.div(Target, axis='rows') * 100

class_labels = ['extreme poverty', 'moderate poverty',
                'vulnerable households ', 'non vulnerable households ']

# Legend names, positionally matched to toilet_cols.
toilet_names = [
    "no toilet",
    "toilet connected to sewer",
    "toilet connected to  septic tank",
    "toilet connected to other system",
    "toilet connected to black hole",
]
toilet_traces = [
    go.Bar(x=class_labels, y=toilet.iloc[:, pos], name=label)
    for pos, label in enumerate(toilet_names)
]

# Percentage of rows per class with the v14a flag set.
bathroom = train.groupby("Target")['v14a'].agg("sum").div(Target, axis="rows") * 100
bathroom_trace = go.Bar(x=class_labels, y=bathroom, name='bathroom')

fig = tools.make_subplots(rows=1, cols=2,
                          subplot_titles=['percent of people having bathrooms', 'Bathroom condition'])

# Left panel: bathroom share. Right panel: stacked toilet types.
fig.append_trace(bathroom_trace, 1, 1)
for toilet_trace in toilet_traces:
    fig.append_trace(toilet_trace, 1, 2)

fig['layout'].update(height=500, title="Bathrooms", barmode="stack", showlegend=True)
py.iplot(fig)

# How do they cook?

In [None]:
# Percentage of rows per Target class for each cooking-energy flag.
energy_cols = ["energcocinar1", "energcocinar2", "energcocinar3", "energcocinar4"]
energy = train.groupby("Target")[energy_cols].sum().div(Target, axis="rows") * 100

class_labels = ['extreme poverty', 'moderate poverty',
                'vulnerable households ', 'non vulnerable households ']

# (column position in `energy`, legend name), listed in drawing order.
sources = [(2, "gas"), (1, "electricity"), (3, "charcol"), (0, "no kitchen")]
data = [go.Bar(x=class_labels, y=energy.iloc[:, pos], name=label) for pos, label in sources]

fig = go.Figure(
    data=data,
    layout=go.Layout(barmode="stack", yaxis=dict(title="percent of users"), title="cooking food"),
)
py.iplot(fig)

In [None]:
# One box per Target class showing the distribution of `tamviv`.
class_names = [
    (1, "extreme poverty"),
    (2, "moderate poverty"),
    (3, 'vulnerable poverty'),
    (4, "non_vulnerable poverty"),
]
data = [
    go.Box(y=train.loc[train['Target'] == code, 'tamviv'], name=label)
    for code, label in class_names
]

layout = go.Layout(
    title="Box Plot of number of persons living in the household",
    yaxis=dict(title="number of persons"),
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

# Which electricity provider do they prefer?

In [None]:
# Percentage of rows per Target class for each electricity-source flag.
electricity = train.groupby('Target')[['public', 'planpri', 'noelec', 'coopele']].sum()
electricity = electricity.div(Target, axis="rows") * 100

# Display labels for the four Target classes, in index order.
x1 = [
    'extreme poverty',
    'moderate poverty',
    'vulnerable households ',
    'non vulnerable households ',
]

# (column, legend name) pairs in drawing order.
sources = [
    ('public', "public sector"),
    ('planpri', "electricity from private plant"),
    ('noelec', "no electricity in the dwelling"),
    ('coopele', "electricity from cooperative"),
]
data = [go.Bar(x=x1, y=electricity[col], name=label) for col, label in sources]

fig = go.Figure(
    data=data,
    layout=go.Layout(barmode="stack", title="Electricity", yaxis=dict(title="percent of users")),
)
py.iplot(fig)

# What are the conditions of their houses?

In [None]:
trace=[]
def plot_bar(df, mylist=None, x=None):
    """Build three bar traces from the first three columns of ``df``.

    Parameters
    ----------
    df : DataFrame
        Must have at least three columns; columns 0, 1 and 2 supply the
        bar heights of the first, second and third trace.
    mylist : sequence of str, optional
        Legend names for the three traces. When omitted, the names of the
        first three columns of ``df`` are used. (The previous default was a
        shared mutable ``[]``, which made the default call fail with
        IndexError.)
    x : sequence, optional
        Category labels for the x axis. Defaults to the notebook-level
        ``x1`` labels, which is what the function always used before.

    Returns
    -------
    tuple
        Three ``go.Bar`` traces, in column order.

    Raises
    ------
    IndexError
        If fewer than three names or fewer than three columns are available.
    """
    names = list(df.columns[:3]) if mylist is None else list(mylist)
    categories = x1 if x is None else x
    return tuple(
        go.Bar(x=categories, y=df.iloc[:, pos], name=names[pos])
        for pos in range(3)
    )


# NOTE(review): these three traces are not referenced again in the visible
# part of the notebook; the call only exercises plot_bar.
trace1,trace2,trace3=plot_bar(electricity,['public','planpri','noelec'])

In [None]:
# Each table holds, per Target class, the percentage of rows with the given
# flag set; plot_bar turns the three columns of a table into three traces.
house = train.groupby('Target')[['tipovivi1', 'tipovivi2', 'tipovivi3']].sum()
house = house.div(Target, axis="rows") * 100

# walls
walls = train.groupby('Target')[['epared1', 'epared2', 'epared3']].sum().div(Target, axis="rows") * 100

# roofs
roofs = train.groupby("Target")[['etecho1', 'etecho2', "etecho3"]].sum().div(Target, axis="rows") * 100

# floor
floor = train.groupby("Target")[['eviv1', 'eviv2', 'eviv3']].sum().div(Target, axis="rows") * 100

# (traces, subplot row, subplot column), in the order they are added.
panels = [
    (plot_bar(house, ['fully owned', 'installement paying', 'rented']), 1, 1),
    (plot_bar(walls, ['bad', 'regular', 'good']), 1, 2),
    (plot_bar(roofs, ['bad', 'regular', 'good']), 2, 1),
    (plot_bar(floor, ['bad', 'regular', 'good']), 2, 2),
]

titles = ['house', 'walls', 'roofs', 'floor']
fig = tools.make_subplots(rows=2, cols=2, print_grid=False, subplot_titles=titles)

for panel_traces, row, col in panels:
    for panel_trace in panel_traces:
        fig.append_trace(panel_trace, row, col)

fig['layout'].update(height=1000, title="Condition of houses", barmode="stack", showlegend=True)
py.iplot(fig)

# Gender distribution


In [None]:
# Percentage of rows per Target class with the male / female flag set.
pie = train.groupby('Target')[['male', 'female']].sum().div(Target, axis="rows") * 100

labels = ["Male", "Female"]

# One pie per Target class: (trace name, x-domain, y-domain), listed in the
# row order of `pie`.
slots = [
    ('extereme poverty', [0, .48], [0, .49]),
    ('mderate poverty', [.52, 1], [0, .49]),
    ('vulnerable poverty', [0, .48], [.51, 1]),
    ('Non vulnerable poverty', [.52, 1], [.51, 1]),
]

fig = {
    'data': [
        {
            'labels': labels,
            'values': pie.iloc[pos, :],
            'type': 'pie',
            'name': name,
            'domain': {'x': x_dom, 'y': y_dom},
        }
        for pos, (name, x_dom, y_dom) in enumerate(slots)
    ],
    'layout': {'title': 'Male to Female ratio',
               'showlegend': False},
}

py.iplot(fig, filename='pie_chart_subplots')


# overcrowding in houses


In [None]:
# One box per Target class showing the distribution of `overcrowding`.
class_names = [
    (1, "extreme poverty"),
    (2, "moderate poverty"),
    (3, 'vulnerable poverty'),
    (4, "non_vulnerable poverty"),
]
data = [
    go.Box(y=train.loc[train['Target'] == code, 'overcrowding'], name=label)
    for code, label in class_names
]

# The axis title used to read "number of persons", carried over from the
# household-size plot. NOTE(review): per the competition data dictionary
# `overcrowding` is persons per room - confirm against the dataset docs.
layout = go.Layout(
    title="Box Plot of overcrowding",
    yaxis=dict(title="persons per room"),
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)


In [None]:
# One box per Target class showing the distribution of `bedrooms`.
class_names = [
    (1, "extreme poverty"),
    (2, "moderate poverty"),
    (3, 'vulnerable poverty'),
    (4, "non_vulnerable poverty"),
]
data = [
    go.Box(y=train.loc[train['Target'] == code, 'bedrooms'], name=label)
    for code, label in class_names
]

layout = go.Layout(
    title="Box Plot of number of bedrooms",
    yaxis=dict(title="number of bedrooms"),
)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig)


# Where is there more poverty: urban or rural?

In [None]:
# Percentage of rows per Target class with the area1 / area2 flag set.
pie = train.groupby('Target')[['area1', "area2"]].sum().div(Target, axis="rows") * 100

labels = ["urban", "rural"]

# One pie per Target class: (trace name, x-domain, y-domain), listed in the
# row order of `pie`.
slots = [
    ('extereme poverty', [0, .48], [0, .49]),
    ('mderate poverty', [.52, 1], [0, .49]),
    ('vulnerable poverty', [0, .48], [.51, 1]),
    ('Non vulnerable poverty', [.52, 1], [.51, 1]),
]

fig = {
    'data': [
        {
            'labels': labels,
            'values': pie.iloc[pos, :],
            'type': 'pie',
            'name': name,
            'domain': {'x': x_dom, 'y': y_dom},
        }
        for pos, (name, x_dom, y_dom) in enumerate(slots)
    ],
    'layout': {'title': 'Urban- rural poverty ratio',
               'showlegend': False},
}

py.iplot(fig, filename='pie_chart_subplots')

# education

In [None]:
# Percentage of rows per Target class with each instlevel1..instlevel9 flag set.
inst_cols = ['instlevel{}'.format(k) for k in range(1, 10)]
education = train.groupby('Target')[inst_cols].sum().div(Target, axis="rows") * 100

In [None]:
# One subplot per Target class (row order of `education`), filled row by row
# on a 2x2 grid titled with the class labels in x1.
fig = tools.make_subplots(rows=2, cols=2, subplot_titles=x1)

for pos in range(4):
    grid_row, grid_col = divmod(pos, 2)
    class_trace = go.Bar(x=education.columns, y=education.iloc[pos, :], name="education")
    fig.append_trace(class_trace, grid_row + 1, grid_col + 1)

fig['layout'].update(height=1000, title="Education levels", showlegend=False)
py.iplot(fig)

In [None]:
# One box per Target class showing the distribution of `escolari`.
class_names = [
    (1, 'extreme'),
    (2, 'moderate'),
    (3, 'vulnerable'),
    (4, 'non_vulnerable'),
]
data = [
    go.Box(y=train.loc[train['Target'] == code, 'escolari'], name=label)
    for code, label in class_names
]

layout = go.Layout(
    title="Box Plot of years of education",
    yaxis=dict(title="number of years in school"),
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig)

# predictions coming soon..