In [1]:
import pandas as pd
import numpy as np
import math

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import plotly.figure_factory as ff

from IPython.display import display, HTML

init_notebook_mode(connected=True)


# <center> Title </center>
### <center>*Subtitle*</center>

In [2]:
# Emit a jQuery snippet that hides every code-input area once the page has
# loaded, followed by a link that toggles them back on. code_toggle is
# registered on document-ready exactly once: registering it twice would hide
# and then immediately re-show the inputs, contradicting the message below.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this IPython notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')


In [3]:
# Some general functions

# Get occurrences of each value for a multiple_select field
def get_ocurrences(df, column_name):
    """Count how many rows mention each option of a multiple-select column.

    Every cell of ``df[column_name]`` is expected to be a string holding
    whitespace-separated option names (e.g. ``"gas wood"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the answers.
    column_name : str
        Name of the multiple-select column.

    Returns
    -------
    pandas.Series
        Integer counts indexed by option name, in order of first appearance
        among the (sorted) distinct answers. Empty when ``df`` has no rows.
    """
    # Collapse identical answers first so each distinct string is split once.
    answer_sizes = df.groupby(column_name).size()

    option_order = []
    option_totals = {}
    for answer, size in zip(answer_sizes.index, answer_sizes.values):
        seen_in_answer = set()
        for option in answer.split():
            # Match whole tokens only: a substring test would count "gas"
            # inside "biogas". An option repeated in one answer counts once.
            if option in seen_in_answer:
                continue
            seen_in_answer.add(option)
            if option not in option_totals:
                option_order.append(option)
                option_totals[option] = 0
            option_totals[option] += size

    return pd.Series([option_totals[name] for name in option_order],
                     index=option_order, dtype='int64')

def get_percentiles_values(sorted_serie, percentiles):
    """Return, for each requested percentile, the index label that reaches it.

    ``sorted_serie`` maps labels to weights and is walked in its own order.
    For a fraction ``p`` the result is the first label at which the running
    sum of weights reaches ``p * sorted_serie.sum()``.

    Parameters
    ----------
    sorted_serie : pandas.Series
        Weights indexed by label; the caller is responsible for the ordering.
    percentiles : sequence of float
        Fractions of the total weight (normally within ``[0, 1]``).

    Returns
    -------
    list
        One label per entry of ``percentiles``, in the same order.

    Raises
    ------
    ValueError
        If percentiles are requested from an empty series.
    """
    if len(percentiles) == 0:
        return []

    labels = list(sorted_serie.index)
    weights = list(sorted_serie.values)
    if not labels:
        raise ValueError("sorted_serie must contain at least one value")

    total = sorted_serie.sum()  # hoisted: identical for every percentile
    found = []
    for fraction in percentiles:
        remaining = total * fraction
        # Clamp to the last label when the target is never reached (fraction
        # above 1, or float round-off at exactly 1.0) instead of falling off
        # the loop and returning None.
        chosen = labels[-1]
        for label, weight in zip(labels, weights):
            if weight >= remaining:
                chosen = label
                break
            remaining -= weight
        found.append(chosen)
    return found

def get_percentiles(sorted_serie, values):
    """Return the cumulative weight fraction reached at each given label value.

    ``sorted_serie`` maps labels to weights and is walked in its own order.
    For each entry of ``values``:

    * if it equals a label, the fraction includes that label's weight;
    * if it falls before the first larger label, the fraction covers only the
      labels already passed;
    * if it is larger than every label, the fraction is 1.

    Parameters
    ----------
    sorted_serie : pandas.Series
        Weights indexed by comparable labels; the caller is responsible for
        the ordering.
    values : sequence
        Label values to locate.

    Returns
    -------
    list of float
        One fraction per entry of ``values``, in the same order.
    """
    if len(values) == 0:
        return []

    labels = list(sorted_serie.index)
    weights = list(sorted_serie.values)
    total = sorted_serie.sum()  # hoisted: identical for every value

    fractions = []
    for target in values:
        accumulated = 0
        # Default covers a target beyond every label; the remaining targets
        # are still processed instead of being dropped from the result.
        fraction = 1.0
        for label, weight in zip(labels, weights):
            if target == label:
                fraction = float(accumulated + weight) / total
                break
            elif target < label:
                fraction = float(accumulated) / total
                break
            accumulated += weight
        fractions.append(fraction)
    return fractions

## Identify business problem

Brief explanation about the business problem

## Data acquisition

In [1]:
def read_lines_file(name):
    """Read the text file at path ``name`` and return its lines as a list.

    Each returned string keeps its trailing newline, as produced by
    ``file.readlines()``.
    """
    # The context manager closes the handle even when reading raises.
    with open(name, 'r') as fo:
        return fo.readlines()

# lines = read_lines_file("file.txt")
# reg_survey_complete = pd.read_excel("file.xlsx", encoding = 'utf_8')

## Process / Clean data

In [4]:
## Formatting data

# Get column names
# col_names = reg_survey_complete.columns

# Delete part of the column names
# col_names = col_names.map(lambda x: x[11:] if x.startswith("demography:") else x)

# Replace ':' with '_' for compatibility reasons.
# col_names = col_names.map(lambda x: x[:14] + '_' + x[15:] if x.startswith("geocoordinates:") else x)

# Change names
# col_names = col_names.map(lambda x: 'livestock_lost_goat' if x == 'livestock_lost_goats' else x)

# print col_names.values
# reg_survey_complete.columns = col_names

In [6]:
## Missing data handling

# Processing NaN and void values
# values = {
#     'respondant_marital_status': 'no_answer',
#     'handicapped_member': 'no',
#     'cc_effects_sand_affected_farmlands': 0
# }
# reg_survey_complete.fillna(value=values, inplace=True)

In [3]:
## Data transformation

# Name of villages by dictionary
# villages = reg_survey_complete['village'].unique()
# villages_table = pd.read_excel("villages_northern.xlsx")
# villages_dict = dict()
# villages_dict['ar'] = villages_table.set_index('survey_code').to_dict()['Arabic name']
# villages_dict['en'] = villages_table.set_index('survey_code').to_dict()['English transcription']
# reg_survey_complete['village'] = reg_survey_complete.village.map(villages_dict['en'])

# Creating subsets
# reg_survey_male = reg_survey.loc[reg_survey['respondant_gender'].str.match('male')].copy()
# reg_survey_female = reg_survey.loc[reg_survey['respondant_gender'].str.match('female')].copy()

# Creating new columns
# reg_survey['num_lh_strats'] = reg_survey['src_incomes'].map(lambda x: len(x.split()))

## Exploratory analysis

To understand the data, we use graphical techniques (scatter plots, box plots, histograms) and quantitative techniques (mean, median, mode, standard deviation).

In [4]:
## Quantitative techniques

# print reg_survey['deviceid'].count()
# surveys_village = reg_survey.groupby('village').size()
# mean_hh_size = reg_survey.family_total.mean()
# kitchen_type = get_ocurrences(reg_survey, 'kitchen_type').sort_values(ascending = False)
# print src_incomes_sums.sort_values(ascending = False)

# Create and show table
# respondants_gender = pd.DataFrame(surveys_village, columns= ['Total'])
# respondants_gender = respondants_gender.reindex(['Total'], axis=1)
# respondants_gender.loc['Total', 'Total'] = respondants_gender['Total'].sum()
# respondants_gender['Total'] = respondants_gender['Total'].astype(int)
# display(respondants_gender)


In [13]:
## Graphical techniques

def create_graphs_HoHH(data_func, values_func, labels_func, sum_func, title):
    """Draw the three-pie comparison split by head of household.

    Reads the notebook-level frames ``reg_survey_femHoHH``, ``reg_survey`` and
    ``reg_survey_maleHoHH`` and forwards them, together with the callbacks and
    the figure title, to ``create_three_pie_graphs``.
    """
    frames = (reg_survey_femHoHH, reg_survey, reg_survey_maleHoHH)
    # Flat (caption, x offset) sequence, in the order the callee expects.
    captions = ("Female Head of Household", .04,
                "Global", .5,
                "Male Head of Household", .94)
    create_three_pie_graphs(frames, data_func, values_func,
                            labels_func, sum_func, title, *captions)

def create_graphs_respondent(data_func, values_func, labels_func, sum_func, title):
    """Draw the three-pie comparison split by respondent gender.

    Reads the notebook-level frames ``reg_survey_female``, ``reg_survey`` and
    ``reg_survey_male`` and forwards them, together with the callbacks and the
    figure title, to ``create_three_pie_graphs``.
    """
    frames = (reg_survey_female, reg_survey, reg_survey_male)
    # Flat (caption, x offset) sequence, in the order the callee expects.
    captions = ("Female Respondent", .07,
                "Global", .5,
                "Male Respondent", .91)
    create_three_pie_graphs(frames, data_func, values_func,
                            labels_func, sum_func, title, *captions)

def create_pie_graph(serie, title):
    """Display ``serie`` as a single pie chart titled ``title``.

    Slice sizes come from ``serie.values`` and slice labels from
    ``serie.index.values``; slices keep the order of ``serie`` because
    sorting is switched off. The figure is handed to ``iplot``.
    """
    pie_trace = dict(
        values=serie.values,
        labels=serie.index.values,
        type="pie",
        sort=False,
    )
    figure = dict(layout=dict(title=title), data=[pie_trace])
    iplot(figure)
    
    
def create_three_pie_graphs(dataframes, data_func, values_func,
                            labels_func, sum_func, title,
                            title1, title1_x_offset,
                            title2, title2_x_offset,
                            title3, title3_x_offset):
    """Display three pie charts side by side in one plotly figure.

    Parameters
    ----------
    dataframes : sequence of length 3
        The three inputs, drawn left to right. Unpacked inside the body
        because tuple parameters in the signature are a syntax error on
        Python 3; callers still pass one 3-tuple as the first argument.
    data_func : callable
        Applied to each input to obtain the data to plot.
    values_func, labels_func : callable
        Applied to each ``data_func`` result to obtain slice sizes and labels.
    sum_func : callable
        Accepted for interface compatibility; currently unused.
    title : str
        Figure title.
    title1, title2, title3 : str
        Caption placed above each pie.
    title1_x_offset, title2_x_offset, title3_x_offset : float
        Horizontal position of each caption.

    Raises
    ------
    ValueError
        If ``dataframes`` does not hold exactly three items.
    """
    df1, df2, df3 = dataframes
    prepared = [data_func(df1), data_func(df2), data_func(df3)]

    captions = [
        (title1, title1_x_offset),
        (title2, title2_x_offset),
        (title3, title3_x_offset),
    ]
    # Horizontal slice of the figure given to each pie, left to right.
    domains = [[0, .32], [.34, .64], [.66, .98]]

    annotations = [
        {
            "font": {"size": 14},
            "showarrow": False,
            "text": caption,
            "x": x_offset,
            "y": 1,
        }
        for caption, x_offset in captions
    ]

    traces = []
    for data, domain in zip(prepared, domains):
        traces.append({
            "values": values_func(data),
            "labels": labels_func(data),
            "domain": {"x": domain},
            "type": "pie",
            "sort": False,  # keep the slice order produced by the callbacks
        })

    fig = {
        "layout": {
            "title": title,
            "annotations": annotations,
        },
        "data": traces,
    }

    iplot(fig)

def create_stacked_bars_graph(dataframes, data_func, x_func, y_func, title,
                              title_df1, title_df2):
    """Display two data sets as one stacked bar chart.

    Parameters
    ----------
    dataframes : sequence of length 2
        The two inputs. Unpacked inside the body because tuple parameters in
        the signature are a syntax error on Python 3; callers still pass one
        2-tuple as the first argument.
    data_func : callable
        Applied to each input to obtain the data to plot.
    x_func, y_func : callable
        Applied to each ``data_func`` result to obtain bar positions and
        bar heights.
    title : str
        Figure title.
    title_df1, title_df2 : str
        Legend names of the first and second series.

    Raises
    ------
    ValueError
        If ``dataframes`` does not hold exactly two items.
    """
    df1, df2 = dataframes

    # (prepared data, legend name, fill colour, outline colour) per series.
    # Fill colours use 255 for the saturated channel; the 0-255 range does not
    # admit larger values.
    series_specs = [
        (data_func(df1), title_df1, 'rgb(225,225,255)', 'rgb(0,0,192)'),
        (data_func(df2), title_df2, 'rgb(255,225,225)', 'rgb(192,0,0)'),
    ]

    traces = []
    for data, legend_name, fill_color, outline_color in series_specs:
        traces.append({
            "x": x_func(data),
            "y": y_func(data),
            "type": "bar",
            "name": legend_name,
            "marker": {
                "color": fill_color,
                "line": {
                    "color": outline_color,
                    "width": 1.5
                }
            },
        })

    fig = {
        "layout": {
            "title": title,
            "xaxis": {
                "tickangle": -45
            },
            "barmode": "stack",
            "width": 700,
            # TODO: per-bar percentage labels are not implemented yet; they
            # would be appended here as annotations.
            "annotations": []
        },
        "data": traces
    }

    iplot(fig)
    

def plot_wealth_criteria(groupby_data, values, title):
    """Display a bar chart of a distribution split into three coloured bands.

    ``groupby_data`` is first rescaled to percentages of its own total and
    then partitioned by index label:

    * ``index <= values[0]``              -> legend "Very poor"
    * ``values[0] < index <= values[1]``  -> legend "Poor"
    * ``values[1] < index <= values[2]``  -> legend "Medium"

    Entries whose label exceeds ``values[2]`` are not plotted. Each legend
    entry also shows the band's share of the total, rounded to one decimal.

    Parameters
    ----------
    groupby_data : pandas.Series
        Counts indexed by the (orderable) criterion value.
    values : sequence of length >= 3
        Upper bounds of the three bands, in increasing order.
    title : str
        Figure title.
    """
    shares = groupby_data * 100 / groupby_data.sum()
    first_bound, second_bound, third_bound = values[0], values[1], values[2]
    labels = shares.index

    bands = [
        (shares[labels <= first_bound],
         "Very poor", 'rgb(214,39,40)'),
        (shares[(labels > first_bound) & (labels <= second_bound)],
         "Poor", 'rgb(255,127,14)'),
        (shares[(labels > second_bound) & (labels <= third_bound)],
         "Medium", 'rgb(44,160,44)'),
    ]

    traces = []
    for band, legend_name, color in bands:
        traces.append({
            "x": band.index.values,
            "y": band.values,
            "type": "bar",
            "name": legend_name + " (" + str(round(band.values.sum(), 1)) + "%)",
            "marker": {
                "color": color,
            }
        })

    fig_distribution = {
        "layout": {
            "title": title,
            "yaxis": {
                "ticksuffix": "%"
            },
            "width": 700,
            "annotations": []
        },
        "data": traces
    }

    iplot(fig_distribution)

def plot_scatter(value_x, value_y, title, marker_sizes=None,
                 x_title="Number of goats", y_title="Number of sheeps"):
    """Display a marker-only scatter plot of ``value_y`` against ``value_x``.

    Parameters
    ----------
    value_x, value_y : sequence
        Point coordinates.
    title : str
        Figure title.
    marker_sizes : sequence or scalar, optional
        Marker size(s), used as given. When omitted, the legacy behaviour is
        kept: sizes are read from the notebook-level object ``prueba`` as
        ``prueba.number * 5``.
    x_title, y_title : str, optional
        Axis titles; the defaults are the previously hard-coded labels.

    Raises
    ------
    NameError
        If ``marker_sizes`` is omitted and no global ``prueba`` exists.
    """
    if marker_sizes is None:
        # NOTE(review): `prueba` is not defined anywhere in the visible
        # notebook; prefer passing marker_sizes explicitly.
        marker_sizes = prueba.number * 5

    graph = {
        "layout": {
            "title": title,
            "xaxis": {
                "title": x_title
            },
            "yaxis": {
                "title": y_title
            }
        },
        "data": [
            {
                "x": value_x,
                "y": value_y,
                "mode": "markers",
                "marker": {
                    "size": marker_sizes
                },
                "type": "scatter"
            }
        ]
    }
    iplot(graph)

## Model Generation and validation

### Model selection

### Model training

### Model evaluation

## Visualize results