In [1]:
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 10 17:34:18 2020

@author: Guillermo Sánchez Gutiérrez-Cabello
"""
#%%
import numpy as np
import pandas as pd
import os
import sklearn

from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()

import nltk
nltk.download("popular") # required to download the stopwords lists
from nltk.corpus import stopwords

spanish_stopwords = stopwords.words('spanish')
english_stopwords = stopwords.words('english')
#%%
mainpath = "C:/Users/guill/Documents/Universidad/PlataformaRefugiados/NAUTIA/DesarrolloPy/DataSetOriginales"

def dfFix(df,col1 = False,col2 = False):
    result = df.copy()
    if(col1):
        x = result.columns.get_loc(col1)
        result.drop(result.columns[0:x],axis = 1, inplace = True)
    if(col2):
        y = result.columns.get_loc(col2)
        result.drop(result.columns[y:],axis = 1, inplace = True)
    return result

def concatDF(df1,df2):
    return  pd.concat([df1,df2],axis = 1, ignore_index = True, sort = True)

def dropRow(df,i):
    return df.drop(index = i)

def mkCSV(df,fileName):
    df = df.dropna(how = 'all')
    df *= 1  #Cambia columnas Booleanas por [0,1] y el resto de datos los mantiene igual.
    #df = df.fillna(-1)   
    fileName = fileName.lower()
    df.to_csv('DataSetFinales/'+fileName,sep=',',header = False, index=False, encoding='utf-8') #Header e index a false para no mostrarlo en el csv
    
def getPath(mainpath,filename):
    return os.path.join(mainpath, filename)

def fixBibliography(df):
    df = dfFix(df,"GENERAL INFORMATION - COUNTRY LEVEL")
    df.columns = ['GeneralInfo', 'CommunityCountry', 'RefugeeCountry']
    df.set_index('GeneralInfo', inplace = True)
    df = df.transpose()
    df.reset_index(inplace = True)
    return df

def getSubColumnNames(df,x):
    columns = df.columns
    array = []
    for column in columns:
        column = column[x:]
        array.append(column)
    return pd.DataFrame(array) 

def addInstitutionAndType(df,array1,array2,instType):
    df = df.dropna(axis = 1)
    df = np.array(df)
    for row in df:
        for elem in row:
            array1 = np.append(array1,elem)
            array2 = np.append(array2,instType)
    return array1,array2

def politicalActor(df1,df2,df3,df4,df5):
    institution = []
    instType = []

    institution, instType  = addInstitutionAndType(df1,institution,instType,'Public Institution')
    institution, instType  = addInstitutionAndType(df2,institution,instType,'Private Institution')
    institution, instType  = addInstitutionAndType(df3,institution,instType,'NGO')
    institution, instType  = addInstitutionAndType(df4,institution,instType,'International Agency')
    institution, instType  = addInstitutionAndType(df5,institution,instType,'Local')

    institution = pd.DataFrame(institution)
    institution = institution.reset_index(drop = True)
    instType = pd.DataFrame(instType)
    instType = instType.reset_index(drop = True)
    
    return concatDF(institution,instType)

def get_claveValor(df1,df2):
    array1 = np.array(df2)
    array2 = np.array(df1)
    result1 = []
    result2 = []
    i = 0
    for row in array1:
        var = array2[i]
        for elem in row:
            result1 = np.append(result1,elem)
            result2 = np.append(result2,var)
        i+=1
    result2 = pd.DataFrame(result2)
    result2 = result2.reset_index(drop = True)
    result1 = pd.DataFrame(result1)
    result1 = result1.reset_index(drop = True)
    return concatDF(result2,result1)

def get_FSClaveValor(df1,df2):
    df2 = df2.transpose()
    array = np.array(df2)
    array2 =[]
    i = 0
    for row in array:
        for elem in row:
            array2 = np.append(array2,elem)
        i+=1  
    df2 = pd.DataFrame(array2)
    return concatDF(df1,df2)

def get_valueBySector(df1,df2):
    df2 = df2.reset_index()
    array1 = np.array(df1)
    i = 0
    for row in array1:
        for elem in row:
            if(elem == False):
                df2 = dropRow(df2,i)
        i += 1
    df2 = df2.set_index('index')
    return df2

def separateValues(df):
    array = np.array(df)
    corpus = []
    for row in array:
        for elem in row:
            corpus = np.append(corpus,[elem])
    X = count_vectorizer.fit_transform(corpus)
    array = count_vectorizer.get_feature_names()
    return pd.DataFrame(array)  

def vectorizeValue(df):
    df = separateValues(df)
    year = np.array(['january','february','march','april','may','june','july','august','september','october','november','december'])
    result = np.array([],dtype = bool)
    df = np.array(df)
    for elem in year:
        flag = False
        for column in df:
            for month in column:
                if(column == elem):
                    flag = True
        if(flag):
            result = np.append(result,True)
        else:
            result = np.append(result,False)
    return pd.DataFrame(result)

def set_sector(df,sect, concat = True):
    sector = np.array([])
    df = df.dropna(how = 'all')
    df = np.array(df)
    for column in df:
        sector = np.append(sector,sect)
    sector = pd.DataFrame(sector)
    df = pd.DataFrame(df)
    if(concat):
        result = concatDF(sector,df)
    else:
        result = sector
    return result 


[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\guill\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\guill\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\guill\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\guill\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\guill\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]   

In [2]:
def validColumn(cad):
    result = False
    if(cad == "index"):
        result = True
    else:
        if(cad == "Type_of_settlement"):
            result = True
        else:
            if(cad == "General:settlement"):
                result = True
            else:
                if(cad == "general_info:_1_1_Choose_the_settlement"):
                    result = True
                else:
                    if(cad == "General_Information:Type_of_setlement"):
                        result == True
                    else:
                        if(cad == "General:Settlement"):
                            result = True
                        else:
                            if(cad == "Type_of_setlement"):
                                result = True
    return result

In [17]:
def setDataByIndex(df,communityType):
    array = df.columns
    i = 0
    while(validColumn(array[i]) == False):
        #print(i)
        i += 1
    if(array[i] == "index"):
        df = df.set_index([array[i]])
        if(communityType == 1):
            result = df.loc['RefugeeCountry'].copy()
        else:
            result = df.loc['CommunityCountry'].copy()
    else:
        df[array[i]].loc[(df[array[i]] == "refugee")] = "refugee_camp"
        df[array[i]].loc[(df[array[i]] == "host_comunity")] = "host_community"
        df = df.set_index([array[i]])
        if(communityType == 1):
            result = df.loc['refugee_camp'].copy()
        else:
            result = df.loc['host_community'].copy()
    return result

In [18]:
#%% CSV to DataFrame
communityType = 0
Bibliography = pd.read_excel(getPath(mainpath,"Bibliography_120220.xlsx"))
Bibliography = fixBibliography(Bibliography)
Bibliography = pd.DataFrame(setDataByIndex(Bibliography,1)).T
Entities = setDataByIndex(pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Entities_Interview_results.csv")),communityType)
LocalLeaders = setDataByIndex(pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Local_leaders_v3_results.csv")),communityType)
HouseHold = setDataByIndex(pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Survey_household_v6_results.csv")),communityType)
WomenGroup = setDataByIndex(pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Women_Focus_Group2_results.csv")),communityType)
SanitationInfra = setDataByIndex(pd.read_csv(getPath(mainpath,"NAUTIA_V1_0_Sanitation_Infrastructre_results.csv")),communityType)
Priorities = setDataByIndex(pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Priorities_v3_results.csv")),communityType)
GeneralForm = setDataByIndex(pd.read_csv(getPath(mainpath,"NAUTIA_1_0_General_form_v3_results.csv")),communityType)
PublicSpace = setDataByIndex(pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Public_Space_results.csv")),communityType)
WaterInf = setDataByIndex(pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Water_Infrastructure_results.csv")),communityType)
SanitationInf = setDataByIndex(pd.read_csv(getPath(mainpath,"NAUTIA_V1_0_Sanitation_Infrastructre_results.csv")),communityType)
WasteManagementInf = setDataByIndex(pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Waste_Management_Infrastructure_results.csv")),communityType)
EnergyINF = setDataByIndex(pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Energy_Infrastructure_results.csv")),communityType)
Business = setDataByIndex(pd.read_csv(getPath(mainpath,"NAUTIA1_0_Business_surveys_v3_results.csv")),communityType)
MobilityINF = setDataByIndex(pd.read_csv(getPath(mainpath,"NAUTIA_1_0__Transport_servicesaccess_points_results.csv")),communityType) 
#ComunalServices = setDataByIndex(pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Communal_Services_results.csv")),communityType) 
GeneralCitizen = setDataByIndex(pd.read_csv(getPath(mainpath,"NAUTIA_1_0_General_Citizen_Focus_Group_results.csv")),communityType)
Shelter = setDataByIndex(pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Shelter_results.csv")),communityType)
FarmyardCrop = setDataByIndex(pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Farmyard_and_Crops_results.csv")),communityType)

KeyError: 'the label [host_community] is not in the [index]'

In [10]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [16]:
Bibliography

GeneralInfo,Population,Country's name,Number of inhabitants (#),Mujeres menores de 5 años (%),Hombres menores de 5 años (%),Mujeres entre 5-17 años (%),Hombres Entre 5-17 años (%),Mujeres entre 18-59 años (%),Hombres entre 18-59 años (%),Mujeres mayores de 60 años (%),Hombres mayores de 60 años (%),Total population,Total children younger than 5 years (%),Total population between 5-17 years (%),Total population between 18 to 59 years (%),Total population 60 años (%),Growth rate of populatoin (%),Refugee population (%),Human development index (points),Life expentancy at birth (years),Culture,Ethnic groups,Ethnich group 1,Ethnich group 2,Ethnich group 3,Religion,Religion 1,Religion 2,Religion 3,Language,Language 1,Language 2,Language 3,Economy and well-being,Population with access to employment,Agriculture (%),Livestock (%),Industry (%),Services (%),Poor people rate (%),Index GINI (points),PIB per cap (USD),Poverty line (USD/day),Local currency,Exchange rate (local currency vs USD),Government,System of governance (choose one),Parliamentary republic,Presidential republic,Syngle-party republic,Parliamentary monarchy,Absolute monarchy,Dictatorship,Another,Territorial and Urbanistic,Location,Urban population (%),Rural population (%),Population density,Urban (inhabitants/hectares),Rural (inhabitants/hectares),Infrastructures,Access to drinking water supply,Total (%),Rural agua (%),Urbano agua (%),Access to improved sanitation,Total (%).1,Rural saneamiento(%),Urbano saneamiento (%),Access to electricity,Total (%).2,Rural electricidad (%),Urbano electricidad (%),Local electrificy tariff (local currency/kWh),Matrix of electricity generation,Hydropower (%),Diesel generation (%),Gas (%),Coal (%),Solar photovoltaic (%),Wind power (%),Biomass/biofuels (%),High voltage (kV),Low voltage (V),Access to Services,Illiteracy rate (%),Internet access rate (%),Shelter,Slum population rate (%),SPECIFIC INFORMATION - SETTLEMENTS LEVEL,CULTURE IDENTIFICATION,SETTLEMENTS Religion,SETTLEMENTS Religion 1,SETTLEMENTS Religion 2,SETTLEMENTS Religion 3,SETTLEMENTS Language,SETTLEMENTS Language 1,SETTLEMENTS Language 2,SETTLEMENTS Language 3,PHYSICAL AND ENVIRONMENT DATA,Location.1,Latitude,Longitud,Topography,Upper bound (m),Lower bound (m),FOOD SECURITY,Cause of food insecurity,Cause 1,Cause 2,Cause 3,Affected groups due to food insecurity,Children,Women,Men,Old people,Calories of the typical dish,Pork (200 kcal/100g),Beef 180 kcal/100 g,Chicken: 130 kcal/100 g,Lamp: 125 kcal/100 g,Cereales/grains: 350 kcal/100 g,Legumes: 350 kcal/100 g,Fruits: 50-200 kcal/100 g,Intake (g) - default value 70g-,GENERAL INFORMATION OF REFUGEES SETTLEMENT,Implementation date of the refugee camp (year),Migration reasons,Reason 1,Reason 2,Reason 3,Reason 4,Climate,"Tropical (Write one: Af, Aw or Am)","Dry (Write one: Bsh, Bsk, Bwh or Bwk)","Temperated (Write one: Cdb, Cfc, Csa, Csb, Cfa, Cwa, Cwb)","Continental (Write one: Dfa, Bwa, Dsa or Dfb, Dwb, Dsb)",Temperature,Max (ºC),Min (ºC),Average (ºC),Relative humidity (%),Annual precipitation,Max (mm),Min (mm),Solar irradiance (kW/h),Wind speed km/h,Additional information,Heights of close rivers (r=20km),r.1,r.2,r.3,r.4,ACTORS (PARTNERS) IDENTIFICATION,Public institutions,pu.1,pu.2,pu.3,pu.4,pu.5,pu.6,pu.7,Private institutions,pr.1,pr.2,pr.3,pr.4,pr.5,pr.6,Non-profit organizations/NGOs,np.1,np.2,np.3,np.4,np.5,np.6,International cooperation agencies,int.1,int.2,int.3,int.4,int.5,int.6,Local representatives/local committees/ local liders,ld.1,ld.2,ld.3,ld.4,ld.5,ld.6
RefugeeCountry,Refugees' country,Eritrea,,12,8,10,15,25,15,15,10,,10,25,40,25,1.9,8,0.402,64.2,,,Kunama,Tigrinya,Saho,,Cristianismo,iIslam,,,Triginya,Kunama,,,,20,10,30,40,53,,236,1.25,Nakfa,"1ERN = 0,065 USD",,,No,Si,No,No,No,No,No,,,35.77,64.23,,3000,1200,,,47,,,,16,30,,,48.4,30.1,75.5,,,,,,,,,,,,,73.8,1.1,,70,,Refugees,,Cristianismo,Islam,,,Tigrinya,Kunama,,Refugees,,15.1794,39.7823,,60,30018,Refugees,,droughts,poor quality of food,food scarcity,,yes,no,no,yes,,2,1.8,1.3,1.25,3.5,3.5,2,70,Refugees,,,war,poor,politic persecution,,,Aw,,,,,25,15,20,,,129,13,7.3,13.6,,,,,,,Refugees,,Administration for Refugee and Returnee Affair...,National Resource Development and Environmenta...,Ethiopian Electric Utility,Asociación Española de Cooperación Internacion...,,,,,IBERDROLA S.A.,Fundación ACCIONA Microenergía,itdUPM,Philips,,,,International Rescue Committee (IRC),Norwegian Refugee Council (NRC),Jesuit Refugee Service (JRS),Centre of Victims of Torture (CVT),Medecins Sans Frontieres (MSF-H),Innovative Humanitarian Solutions (IHS,,World Food Program (WFP),United Nations High Commissioner for Refugees ...,United Nations Childrens Fund (UNICEF),International Organization for Migration (IOM),International Committee of RED Cross (ICRC),,,"Ethiopian Orthodox Church, Development and Int...",Opportunities Industrialization Centre - Ethio...,,,,


In [None]:
Bibliography

In [None]:
Entities

In [None]:
Entities

In [None]:
LocalLeaders

In [None]:
HouseHold

In [None]:
WomenGroup

In [None]:
SanitationInfra

In [None]:
Priorities

In [None]:
GeneralForm

In [None]:
PublicSpace

In [None]:
WaterInf

In [None]:
SanitationInf

In [None]:
WasteManagementInf

In [None]:
EnergyINF

In [None]:
Business

In [None]:
MobilityINF

In [None]:
ComunalServices

In [None]:
GeneralCitizen

In [None]:
Shelter

In [None]:
FarmyardCrop

In [None]:
PublicSpace

In [None]:
    dfPublic = dfFix(Bibliography,"Public institutions","Private institutions")
    dfPrivate = dfFix(Bibliography,"Private institutions","Non-profit organizations/NGOs")
    dfNonProfit = dfFix(Bibliography,"Non-profit organizations/NGOs","International cooperation agencies")
    dfInternational = dfFix(Bibliography,"International cooperation agencies","Local representatives/local committees/ local liders")
    dfLocal = dfFix(Bibliography,"Local representatives/local committees/ local liders")
    G_PoliticalActor = politicalActor(dfPublic,dfPrivate,dfNonProfit,dfInternational,dfLocal)
    mkCSV(G_PoliticalActor,"G_PoliticalActor.csv")

In [None]:
G_PoliticalActor