In [4]:
import xlrd
import pandas as pd
import numpy as np
import json
import unidecode
from functools import reduce

In [5]:
def rawDf():
    """Load the raw youth-survey workbook as an all-string DataFrame.

    Row 0 of the sheet is copied into the column header and then removed
    from the data, so the returned columns are named after the values found
    in that row. Every remaining cell is converted with ``str``.

    Returns:
        pandas.DataFrame: the survey rows with every value converted to a
        string (missing cells presumably become ``str(NaN)`` -- TODO confirm
        against the workbook).
    """
    df = pd.read_excel('input/youth_survey_raw.xlsx')

    # copy row 0 to column names
    df.columns = df.iloc[0]

    # delete row 0, which is now redundant as data
    df = df.drop(0, axis=0)

    # later cells call str methods (.split / .replace / .lower) on the values
    df = df.applymap(str)
    return df

# Load the workbook once; the trailing expression previews the first rows.
raw_df = rawDf()
raw_df.head()

Unnamed: 0,M1: Questionnaire No,Project No,Project Name,M2: Governorate,M3: District,M7: Urbanity,M8: Date of interview,M8a: Day of interview,M9: Interview Validation,M11b: Date of Auditing,...,Q61_3_X: Other,S1: How old are you,S2: What is your marital status?,S3: Do you have any children?,"S4: (If ""yes""), How many?","S5: Including you, how many people, children, above and under 15 live in this household?",S6: What is the highest education level you completed?,"S7: Which one of the folowing income is close to your household monthly income, including all household members incomes live in or out this house?",S8: What is your job status today?,S11: Gender of respondent
1,79,132,Youth Survey,Ibb,Al-Sabrah,Rural,2017-05-11 00:00:00,Thu,Not validated,2017-05-11 00:00:00,...,,17,"Single, didn't not get married before",,,6,Didn't complete secondary school,"20,000 YER- 39,000 YER",Student,Male
2,101,132,Youth Survey,Ibb,Al-Makhader,Rural,2017-05-12 00:00:00,Fri,Accompanied interview,2017-05-12 00:00:00,...,,16,"Single, didn't not get married before",,,6,Completed intermediate school,"20,000 YER- 39,000 YER",Student,Male
3,172,132,Youth Survey,Abyan,Khanfar,Rural,2017-05-10 00:00:00,Wed,Re-contacted by person,2017-05-10 00:00:00,...,,25,Married,Yes,4.0,6,Illiterate (Can't read and write),"Less than 20,000 YER",Housewife,Female
4,1107,132,Youth Survey,Shabwah,Jardaan,Rural,2017-05-23 00:00:00,Tue,Not validated,2017-05-23 00:00:00,...,,19,Married,No,,9,Completed secondary school,"60,000 YER - 79,000 YER",Housewife,Female
5,1439,132,Youth Survey,ad-Dali,Qatabah,Rural,2017-05-10 00:00:00,Wed,Not validated,2017-05-10 00:00:00,...,,18,"Single, didn't not get married before",,,13,Didn't complete secondary school,"60,000 YER - 79,000 YER",Student,Male


In [6]:
# globals
# Headers starting with "Q" are survey questions, those starting with "S" are
# respondent demographics. The stored names are whitespace-stripped copies.
question_columns = [name.strip() for name in raw_df.columns if name.startswith('Q')]
demographic_columns = [name.strip() for name in raw_df.columns if name.startswith('S')]

# Lower-cased governorate name -> governorate id (ids are kept as strings).
idMap = {
    "abyan": "2",
    "ad-dali": "19",
    "aden": "13",
    "al-baidha": "4",
    "al-mahra": "17",
    "al-mahweet": "16",
    "alhudaida": "8",
    "aljawf": "6",
    "amran": "18",
    "dhamar": "10",
    "hadhramawt": "9",
    "hajja": "7",
    "ibb": "1",
    "lahj": "14",
    "marib": "15",
    "rayma": "20",
    "sana'a city": "3",
    "sana'a": "12",
    "shabwah": "11",
    "taiz": "5",
}

In [7]:
def filterDf(df):
    """Keep the question columns and append short-named demographic columns.

    Args:
        df: the raw survey frame; its values must be strings because two of
            the derived columns are produced with ``str`` methods.

    Returns:
        pandas.DataFrame: the columns listed in ``question_columns`` followed
        by governorate, district, urbanity, household_monthly_income, age,
        marital_status, have_children, job_status and gender.
    """
    def before_first_comma(text):
        # Keeps only the part of the label before the first ', '.
        return text.split(', ')[0]

    def shorten_job_label(text):
        text = text.replace('Unemployed, looking for job', 'Unemployed')
        return text.replace('Works in his/her own business', 'Self-employed')

    # (new column name, source column in the raw frame, optional per-value transform)
    derived_columns = [
        ('governorate', 'M2: Governorate', None),
        ('district', 'M3: District', None),
        ('urbanity', 'M7: Urbanity', None),
        ('household_monthly_income', 'S7: Which one of the folowing income is close to your household monthly income, including all household members incomes live in or out this house?', None),
        ('age', 'S1: How old are you', None),
        ('marital_status', 'S2: What is your marital status?', before_first_comma),
        ('have_children', 'S3: Do you have any children?', None),
        ('job_status', 'S8: What is your job status today?', shorten_job_label),
        ('gender', 'S11: Gender of respondent', None),
    ]

    filtered = df.filter(items=question_columns)
    for target, source, transform in derived_columns:
        values = df[source]
        filtered[target] = values if transform is None else values.apply(transform)

    return filtered

# Reduce the raw frame to question columns plus renamed demographics and preview it.
filtered_df = filterDf(raw_df)
filtered_df.head()

Unnamed: 0,Q1: First I would like to ask you about your current situation / family situation. How would you rate it on a scale from 1 to 5?,"Q2: In general, what do you think about the current situation in Yemen, is it going in the right direction or in the wrong direction?",Q3: What is your status:,Q3x: Other,"Q4_1: What are the reasons you had to stop going to school, university, vocational training or lost your job? First answer",Q4_1_X :Other,"Q4_2: What are the reasons you had to stop going to school, university, vocational training or lost your job? Second answer",Q4_2_X :Other,Q5: How much time do you spend at school/university/work/vocational training?,Q6: How close is the school/university/work/vocational training you attend to your house?,...,Q61_3_X: Other,governorate,district,urbanity,household_monthly_income,age,marital_status,have_children,job_status,gender
1,Neither bad nor good,In a very bad direction,School student,,,,,,5-8 hours,0-1 km,...,,Ibb,Al-Sabrah,Rural,"20,000 YER- 39,000 YER",17,Single,,Student,Male
2,It’s very bad,"3.Not in the right direction, not in the wrong...",School student,,,,,,1-4 hours,0-1 km,...,,Ibb,Al-Makhader,Rural,"20,000 YER- 39,000 YER",16,Single,,Student,Male
3,It’s very bad,In a very bad direction,I am a housewife,,,,,,,,...,,Abyan,Khanfar,Rural,"Less than 20,000 YER",25,Married,Yes,Housewife,Female
4,Neither bad nor good,"3.Not in the right direction, not in the wrong...",I am a housewife,,,,,,,,...,,Shabwah,Jardaan,Rural,"60,000 YER - 79,000 YER",19,Married,No,Housewife,Female
5,It’s somewhat good,In a very bad direction,School student,,,,,,1-4 hours,0-1 km,...,,ad-Dali,Qatabah,Rural,"60,000 YER - 79,000 YER",18,Single,,Student,Male


In [9]:
def mapDf(df):
    """Reshape the wide survey frame into long form: one row per respondent per question.

    For every name in ``question_columns`` the header is transliterated with
    ``unidecode`` and split on ':' into a question id (text before the first
    colon) and the question text (text between the first and second colon,
    stripped). Columns whose text is exactly 'Other' are skipped.

    Args:
        df: frame produced by ``filterDf`` (question columns plus the
            governorate/district/demographic columns).

    Returns:
        pandas.DataFrame: the per-question frames stacked with ``pd.concat``
        (the respondent index therefore repeats once per question), or an
        empty DataFrame when no question column qualifies.

    Raises:
        KeyError: if a governorate name is missing from ``idMap``.
    """
    # The governorate -> id lookup is the same for every question, so do it once.
    gov_ids = df['governorate'].apply(lambda name: idMap[str(name).lower()])

    frames = []
    for q in question_columns:
        label = unidecode.unidecode(q)
        pieces = label.split(':')
        qID = pieces[0]
        q_ = pieces[1].strip()

        # Skip instead of appending an empty frame: empty entries add nothing
        # to the concat and trigger a FutureWarning on recent pandas.
        if q_ == 'Other':
            continue

        frames.append(pd.DataFrame({
            'id': gov_ids,
            'governorate': df['governorate'],
            'district': df['district'],
            'qID': qID,
            'question_raw': q_,
            'answer_raw': df[q],
            'urbanity': df['urbanity'],
            'household_monthly_income': df['household_monthly_income'],
            'age': df['age'],
            'marital_status': df['marital_status'],
            'have_children': df['have_children'],
            'job_status': df['job_status'],
            'gender': df['gender'],
        }))

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames)

# Long-form view of the survey: one row per respondent per question.
mapped_df = mapDf(filtered_df)
mapped_df.head()

Unnamed: 0,id,governorate,district,qID,question_raw,answer_raw,urbanity,household_monthly_income,age,marital_status,have_children,job_status,gender
1,1,Ibb,Al-Sabrah,Q1,First I would like to ask you about your curre...,Neither bad nor good,Rural,"20,000 YER- 39,000 YER",17,Single,,Student,Male
2,1,Ibb,Al-Makhader,Q1,First I would like to ask you about your curre...,It’s very bad,Rural,"20,000 YER- 39,000 YER",16,Single,,Student,Male
3,2,Abyan,Khanfar,Q1,First I would like to ask you about your curre...,It’s very bad,Rural,"Less than 20,000 YER",25,Married,Yes,Housewife,Female
4,11,Shabwah,Jardaan,Q1,First I would like to ask you about your curre...,Neither bad nor good,Rural,"60,000 YER - 79,000 YER",19,Married,No,Housewife,Female
5,19,ad-Dali,Qatabah,Q1,First I would like to ask you about your curre...,It’s somewhat good,Rural,"60,000 YER - 79,000 YER",18,Single,,Student,Male


In [10]:
def calculateDf(df):
    """Add per-governorate answer counts, totals and percentages.

    Three columns are written onto ``df``:

    * ``answer_count`` -- rows sharing the same governorate, question and answer
    * ``answer_total`` -- rows sharing the same governorate and question
    * ``answer_pct``   -- ``answer_count / answer_total * 100``

    Args:
        df: long-form frame with at least the columns ``governorate``,
            ``question_raw`` and ``answer_raw``.

    Returns:
        pandas.DataFrame: the same object that was passed in (it is modified
        in place, not copied).
    """
    answer_keys = ['governorate', 'question_raw', 'answer_raw']
    question_keys = ['governorate', 'question_raw']

    # Count one explicitly selected column so transform() returns a Series.
    # Without the selection it returns a frame with one count per non-key
    # column, and current pandas refuses to assign that to a single column.
    # 'governorate' is a grouping key, so it is never null inside a group and
    # its count equals the group size.
    df['answer_count'] = df.groupby(answer_keys)['governorate'].transform('count')
    df['answer_total'] = df.groupby(question_keys)['governorate'].transform('count')

    df['answer_pct'] = (df['answer_count'] / df['answer_total'])*100

    return df

# calculateDf writes its columns onto the frame it is given, so calculated_df
# and mapped_df refer to the same (now extended) DataFrame.
calculated_df = calculateDf(mapped_df)
calculated_df.head()

Unnamed: 0,id,governorate,district,qID,question_raw,answer_raw,urbanity,household_monthly_income,age,marital_status,have_children,job_status,gender,answer_count,answer_total,answer_pct
1,1,Ibb,Al-Sabrah,Q1,First I would like to ask you about your curre...,Neither bad nor good,Rural,"20,000 YER- 39,000 YER",17,Single,,Student,Male,61,170,35.882353
2,1,Ibb,Al-Makhader,Q1,First I would like to ask you about your curre...,It’s very bad,Rural,"20,000 YER- 39,000 YER",16,Single,,Student,Male,53,170,31.176471
3,2,Abyan,Khanfar,Q1,First I would like to ask you about your curre...,It’s very bad,Rural,"Less than 20,000 YER",25,Married,Yes,Housewife,Female,13,30,43.333333
4,11,Shabwah,Jardaan,Q1,First I would like to ask you about your curre...,Neither bad nor good,Rural,"60,000 YER - 79,000 YER",19,Married,No,Housewife,Female,16,40,40.0
5,19,ad-Dali,Qatabah,Q1,First I would like to ask you about your curre...,It’s somewhat good,Rural,"60,000 YER - 79,000 YER",18,Single,,Student,Male,9,40,22.5


In [11]:
# external D4C data to be joined later
def getTags():
    """Load the question and answer tag sheets and join them.

    Reads the 'questions' and 'answers' sheets of ``input/tagged.xlsx`` and
    outer-merges them on ``qID`` and ``question_raw``.

    Returns:
        pandas.DataFrame: one row per question/answer pairing; rows present
        in only one of the two sheets are kept (outer join).
    """
    # The context manager closes the workbook handle once both sheets are read.
    with pd.ExcelFile('input/tagged.xlsx') as xls:
        tagged_questions = pd.read_excel(xls, 'questions')
        tagged_answers = pd.read_excel(xls, 'answers')

    df = tagged_questions.merge(tagged_answers, on = ['qID', 'question_raw'], how = 'outer')
    return df

# Load the question/answer tags; the bare expression displays the merged frame.
tags_df = getTags()
tags_df


Unnamed: 0,qID,question_raw,question_en,question_ar,question_tag,question_tag_order,question_map,question_keep,question_type,question_scale,aID,answer_raw,answer_en,answer_ar,answer_rank
0,Q1,First I would like to ask you about your curre...,How would you rate you and your family's curre...,,daily life,1.0,y,y,scale,5.0,Q1_a1,Don't know,,,dk
1,Q1,First I would like to ask you about your curre...,How would you rate you and your family's curre...,,daily life,1.0,y,y,scale,5.0,Q1_a2,It is very good,Very good,,5
2,Q1,First I would like to ask you about your curre...,How would you rate you and your family's curre...,,daily life,1.0,y,y,scale,5.0,Q1_a3,It's somewhat bad,Bad,,2
3,Q1,First I would like to ask you about your curre...,How would you rate you and your family's curre...,,daily life,1.0,y,y,scale,5.0,Q1_a4,It's somewhat good,Good,,4
4,Q1,First I would like to ask you about your curre...,How would you rate you and your family's curre...,,daily life,1.0,y,y,scale,5.0,Q1_a5,It's very bad,Very bad,,1
5,Q1,First I would like to ask you about your curre...,How would you rate you and your family's curre...,,daily life,1.0,y,y,scale,5.0,Q1_a6,Neither bad nor good,Average,,3
6,Q2,"In general, what do you think about the curren...",Is the situation in Yemen heading in the right...,,politics,1.0,y,y,scale,5.0,Q2_a1,Don't know,,,dk
7,Q2,"In general, what do you think about the curren...",Is the situation in Yemen heading in the right...,,politics,1.0,y,y,scale,5.0,Q2_a2,"Not in the right direction, not in the wrong d...",Neither the right nor the wrong direction,,3
8,Q2,"In general, what do you think about the curren...",Is the situation in Yemen heading in the right...,,politics,1.0,y,y,scale,5.0,Q2_a3,In a very bad direction,The wrong direction,,1
9,Q2,"In general, what do you think about the curren...",Is the situation in Yemen heading in the right...,,politics,1.0,y,y,scale,5.0,Q2_a4,In the right direction,The right direction,,5


In [13]:
def mergeDf(raw, tags, kind):
    """Join survey results with their tags and keep the flagged questions.

    Args:
        raw: long-form survey results (with counts and percentages).
        tags: tag table carrying ``question_keep`` / ``question_map`` flags.
        kind: ``'detail'`` returns every question flagged to keep; any other
            value additionally restricts to questions flagged for the map.

    Returns:
        pandas.DataFrame: the selected rows restricted to a fixed column order.
    """
    output_columns = [
        'id', 'governorate', 'district',
        'qID', 'question_raw', 'question_en', 'question_ar',
        'question_tag', 'question_tag_order', 'question_type',
        'question_map', 'question_scale',
        'answer_raw', 'answer_en', 'answer_ar',
        'answer_count', 'answer_total', 'answer_pct', 'answer_rank',
        'urbanity', 'household_monthly_income', 'age',
        'marital_status', 'job_status', 'gender', 'have_children',
    ]

    merged = raw.merge(tags, how='outer', on=['qID', 'question_raw', 'answer_raw'])

    # Questions not flagged "keep" are dropped for both variants.
    selected = merged.loc[merged['question_keep'] == 'y']

    if kind != 'detail':
        # The map variant is a further subset of the kept questions.
        selected = selected.loc[selected['question_map'] == 'y']

    return selected[output_columns]

# Build both variants from the same inputs: every kept question, and the map subset.
detail_df = mergeDf(calculated_df, tags_df, 'detail')
# NOTE(review): map_df is reassigned in a later cell from getMapData(detail_plus_q62).
map_df = mergeDf(calculated_df, tags_df, 'map')

detail_df.head()

Unnamed: 0,id,governorate,district,qID,question_raw,question_en,question_ar,question_tag,question_tag_order,question_type,...,answer_total,answer_pct,answer_rank,urbanity,household_monthly_income,age,marital_status,job_status,gender,have_children
0,1,Ibb,Al-Sabrah,Q1,First I would like to ask you about your curre...,How would you rate you and your family's curre...,,daily life,1.0,scale,...,170.0,35.882353,3,Rural,"20,000 YER- 39,000 YER",17,Single,Student,Male,
1,11,Shabwah,Jardaan,Q1,First I would like to ask you about your curre...,How would you rate you and your family's curre...,,daily life,1.0,scale,...,40.0,40.0,3,Rural,"60,000 YER - 79,000 YER",19,Married,Housewife,Female,No
2,9,Hadhramawt,Al-Mukala City,Q1,First I would like to ask you about your curre...,How would you rate you and your family's curre...,,daily life,1.0,scale,...,80.0,21.25,3,Urban,(DK),16,Single,Student,Female,
3,1,Ibb,Al-Sabrah,Q1,First I would like to ask you about your curre...,How would you rate you and your family's curre...,,daily life,1.0,scale,...,170.0,35.882353,3,Rural,"40,000 YER - 59,000 YER",25,Widow,Housewife,Female,Yes
4,1,Ibb,Thee Al-Sufal,Q1,First I would like to ask you about your curre...,How would you rate you and your family's curre...,,daily life,1.0,scale,...,170.0,35.882353,3,Rural,"60,000 YER - 79,000 YER",17,Single,Student,Female,


In [14]:
def cleanDf(df):
    """Collapse the merged frame to one row per governorate/question/answer.

    Keeps only the governorate-level columns, sorts them, removes exact
    duplicate rows, drops answers ranked 'removed / repeated' and rows
    without an id, then replaces the remaining missing values with ''.

    Args:
        df: frame returned by ``mergeDf``.

    Returns:
        pandas.DataFrame: the cleaned, sorted frame (original index labels kept).
    """
    keep_columns = [
        'id', 'governorate',
        'question_tag', 'question_tag_order', 'question_map',
        'question_raw', 'question_en', 'question_ar',
        'answer_raw', 'answer_en', 'answer_ar',
        'answer_rank', 'answer_count', 'answer_pct',
    ]
    sort_keys = ['id', 'question_tag', 'question_tag_order', 'answer_rank']

    return (
        df[keep_columns]
        .sort_values(sort_keys)
        .drop_duplicates()
        .loc[lambda frame: frame['answer_rank'] != 'removed / repeated']
        .loc[lambda frame: frame['id'].notnull()]
        .fillna('')
    )

# De-duplicated, sorted governorate-level results used by the exports below.
detail_clean_df = cleanDf(detail_df)
detail_clean_df.head()

Unnamed: 0,id,governorate,question_tag,question_tag_order,question_map,question_raw,question_en,question_ar,answer_raw,answer_en,answer_ar,answer_rank,answer_count,answer_pct
117714,1,Ibb,basic needs,1.0,y,Please tell me if Adequate shelter/housing is...,Does your family have access to adequate shelt...,,Not available at all,"No, never",,1,1.0,0.588235
118434,1,Ibb,basic needs,1.0,y,Please tell me if Adequate shelter/housing is...,Does your family have access to adequate shelt...,,Somewhat not available,Only sometimes,,2,7.0,4.117647
117786,1,Ibb,basic needs,1.0,y,Please tell me if Adequate shelter/housing is...,Does your family have access to adequate shelt...,,Somewhat available,Sometimes yes,,3,42.0,24.705882
116976,1,Ibb,basic needs,1.0,y,Please tell me if Adequate shelter/housing is...,Does your family have access to adequate shelt...,,Available,Yes,,4,39.0,22.941176
115804,1,Ibb,basic needs,1.0,y,Please tell me if Adequate shelter/housing is...,Does your family have access to adequate shelt...,,Very available,"Yes, always",,5,81.0,47.647059


In [15]:
# Q62 results come from a separate CSV. Its ids are converted to strings
# (presumably to match the string ids in detail_clean_df) and blanks become ''.
q62_df = (
    pd.read_csv('input/q62.csv')
    .assign(id=lambda frame: frame['id'].map(str))
    .fillna('')
)

# Q62 rows go first, followed by the cleaned survey results.
detail_plus_q62 = pd.concat([q62_df, detail_clean_df])

detail_plus_q62.head()

Unnamed: 0,id,governorate,question_tag,question_tag_order,question_map,question_raw,question_en,question_ar,answer_raw,answer_en,answer_ar,answer_rank,answer_count,answer_pct
0,1,Ibb,international relations,10.0,,How do you evaluate the role that Oman plays i...,"In your opinion, is Oman playing a positive or...",,Very negative,Very negative,,1,16.0,9.411765
1,2,Abyan,international relations,10.0,,How do you evaluate the role that Oman plays i...,"In your opinion, is Oman playing a positive or...",,Very negative,Very negative,,1,5.0,16.666667
2,3,Sana'a City,international relations,10.0,,How do you evaluate the role that Oman plays i...,"In your opinion, is Oman playing a positive or...",,Very negative,Very negative,,1,61.0,43.571429
3,4,al-Baydha',international relations,10.0,,How do you evaluate the role that Oman plays i...,"In your opinion, is Oman playing a positive or...",,Very negative,Very negative,,1,0.0,0.0
4,5,Taiz,international relations,10.0,,How do you evaluate the role that Oman plays i...,"In your opinion, is Oman playing a positive or...",,Very negative,Very negative,,1,53.0,27.894737


In [19]:
# Export the combined results; the DataFrame index is not written to the file.
detail_plus_q62.to_csv('output/ypc_edit.csv', index=False)

In [15]:
# into json format for charting
# Columns describing the question itself (constant within one question group).
question_cols = [
    'id',
    'governorate',
    'question_tag',
    'question_tag_order',
    'question_raw',
    'question_en',
    'question_ar',
]
# Columns describing a single answer option of that question.
answer_cols = [
    'answer_raw',
    'answer_en',
    'answer_ar',
    'answer_rank',
    'answer_count',
    'answer_pct',
]

def build_question_dict(df_):
    """Turn the rows of one question into a nested dict for charting.

    Args:
        df_: rows belonging to a single question (one governorate).

    Returns:
        dict: the ``question_cols`` values taken from the first row, plus a
        ``'values'`` list holding one dict of ``answer_cols`` per distinct
        ``answer_raw`` (in the order ``groupby`` yields the answers).
    """
    def first_row_values(frame, columns):
        # Every column is read from the first row of the given frame.
        return {name: frame[name].values[0] for name in columns}

    question = first_row_values(df_, question_cols)
    question['values'] = [
        first_row_values(answer_rows, answer_cols)
        for _, answer_rows in df_.groupby('answer_raw')
    ]
    return question


# Write one JSON file per governorate id, each holding a list of question dicts.
# Group on the bare column name: grouping on the list ['id'] makes newer pandas
# yield 1-tuples as keys, which cannot be concatenated into the file name.
for key, df_ in detail_plus_q62.groupby('id'):

    results = []
    for id_, df__ in df_.groupby('question_raw'):

        results.append( build_question_dict(df__) )

    with open('output/gov/gov_'+key+'.json', 'w') as outfile:
        print ('output/gov/gov_'+key+'.json')
        json.dump(results, outfile)


output/gov/gov_1.json
output/gov/gov_10.json
output/gov/gov_11.json
output/gov/gov_12.json
output/gov/gov_13.json
output/gov/gov_14.json
output/gov/gov_15.json
output/gov/gov_16.json
output/gov/gov_17.json
output/gov/gov_18.json
output/gov/gov_19.json
output/gov/gov_2.json
output/gov/gov_20.json
output/gov/gov_3.json
output/gov/gov_4.json
output/gov/gov_5.json
output/gov/gov_6.json
output/gov/gov_7.json
output/gov/gov_8.json
output/gov/gov_9.json


In [16]:
def getMapData(df):
    """Return only the rows whose ``question_map`` flag equals 'y'."""
    flagged_for_map = df['question_map'].eq('y')
    return df.loc[flagged_for_map]
    
# Rebinds map_df to the map-flagged subset of the combined results and exports it.
map_df = getMapData(detail_plus_q62)
map_df.to_csv('output/map.csv', index=False)
map_df.head()

Unnamed: 0,id,governorate,question_tag,question_tag_order,question_map,question_raw,question_en,question_ar,answer_raw,answer_en,answer_ar,answer_rank,answer_count,answer_pct
120,1,Ibb,international relations,2.0,y,How do you evaluate the role that US plays in ...,"In your opinion, is the US playing a positive ...",,Very negative,Very negative,,1,108.0,63.529412
121,2,Abyan,international relations,2.0,y,How do you evaluate the role that US plays in ...,"In your opinion, is the US playing a positive ...",,Very negative,Very negative,,1,4.0,13.333333
122,3,Sana'a City,international relations,2.0,y,How do you evaluate the role that US plays in ...,"In your opinion, is the US playing a positive ...",,Very negative,Very negative,,1,129.0,92.142857
123,4,al-Baydha',international relations,2.0,y,How do you evaluate the role that US plays in ...,"In your opinion, is the US playing a positive ...",,Very negative,Very negative,,1,13.0,32.5
124,5,Taiz,international relations,2.0,y,How do you evaluate the role that US plays in ...,"In your opinion, is the US playing a positive ...",,Very negative,Very negative,,1,78.0,41.052632


In [128]:
question_cols = ['id', 'governorate', 'question_tag', 'question_tag_order', 'question_raw', 'question_en', 'question_ar']
answer_cols = ['answer_raw', 'answer_en', 'answer_ar', 'answer_rank', 'answer_count', 'answer_pct']

def build_question_dict(df_):
    """Score one question as a rank-weighted share of its answers.

    Note: this redefines the ``build_question_dict`` used by the
    per-governorate JSON export earlier in the notebook; despite the name it
    returns a single number, not a dict.

    For every distinct ``answer_raw`` whose rank is not 'dk', the answer's
    percentage is weighted by ``rank / max_rank`` and the weighted
    percentages are summed and scaled by 1/100. The result lies between 0
    and 1 as long as the ranks are positive and the percentages add up to
    at most 100.

    Args:
        df_: rows of a single question for a single governorate, with the
            columns ``answer_raw``, ``answer_rank`` and ``answer_pct``.

    Returns:
        float | None: the score, or ``None`` when the question has no ranked
        answer at all (for example every answer is 'dk').

    Raises:
        ValueError: if a non-'dk' rank cannot be converted with ``int``.
    """
    answer_ranks = []
    answer_pcts = []

    for _, row in df_.groupby('answer_raw'):
        rank_value = row['answer_rank'].values[0]

        # 'dk' answers carry no rank and are left out of the score.
        if rank_value != 'dk':
            answer_ranks.append(int(rank_value))
            answer_pcts.append(row['answer_pct'].values[0])

    # Without this guard max() raises ValueError on an empty list.
    if not answer_ranks:
        return None

    max_rank = max(answer_ranks)

    # Same arithmetic order as before so the resulting floats are unchanged.
    multiplier = (100/max_rank) / 100
    weights = [(r * multiplier) / 100 for r in answer_ranks]

    rank = sum(pct * weight for pct, weight in zip(answer_pcts, weights))

    return rank
    
      
# Build one record per governorate and map question, then write them all to map.json.
results = []
# Group on the bare column name so `key` is the id value itself. Grouping on the
# list ['id'] yields 1-tuples on newer pandas, which json would write as lists.
for key, df_ in map_df.groupby('id'):

    for id_, df__ in df_.groupby('question_raw'):

        redict = {
            'id': key,
            'gov': df__['governorate'].values[0],
            'question_en': df__['question_en'].values[0],
            'question_ar': df__['question_ar'].values[0],
            'tag': df__['question_tag'].values[0],
            'rank': build_question_dict(df__),
            'scale_upper_en': 'more',
            'scale_lower_en': 'less',
            'scale_upper_ar': 'أكثر',
            'scale_lower_ar': 'أقل'
        }

        results.append(redict)



with open('output/map.json', 'w') as outfile:
    print ()
    print (json.dumps(results, indent=2))
    json.dump(results, outfile)



[
  {
    "id": "1",
    "gov": "Ibb",
    "question_en": "Are political parties in your area active in a positive, negative, or neutral way, or simply not active at all?",
    "question_ar": "",
    "tag": "governance",
    "rank": 0.3011764705882353,
    "scale_upper_en": "more",
    "scale_lower_en": "less",
    "scale_upper_ar": "\u0623\u0643\u062b\u0631",
    "scale_lower_ar": "\u0623\u0642\u0644"
  },
  {
    "id": "1",
    "gov": "Ibb",
    "question_en": "Are the religious groups in your area active in a positive, negative, or neutral way, or simply not active at all?",
    "question_ar": "",
    "tag": "governance",
    "rank": 0.23411764705882354,
    "scale_upper_en": "more",
    "scale_lower_en": "less",
    "scale_upper_ar": "\u0623\u0643\u062b\u0631",
    "scale_lower_ar": "\u0623\u0642\u0644"
  },
  {
    "id": "1",
    "gov": "Ibb",
    "question_en": "Are the tribal leaders in your area active in a positive, negative, or neutral way, or simply not active at all?",
   