In [1]:
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 10 17:34:18 2020

@author: Guillermo Sánchez Gutiérrez-Cabello
"""
#%%
import numpy as np
import pandas as pd
import os
import sklearn

from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()

import nltk
nltk.download("popular") # required to download the stopwords lists
from nltk.corpus import stopwords

spanish_stopwords = stopwords.words('spanish')
english_stopwords = stopwords.words('english')
#%%
# Directory that holds the original NAUTIA data sets.
# It can be overridden through the NAUTIA_DATA_DIR environment variable so the
# notebook also runs on other machines; when the variable is not set, the
# original local path is used.
mainpath = os.environ.get(
    "NAUTIA_DATA_DIR",
    "C:/Users/guill/Documents/Universidad/PlataformaRefugiados/NAUTIA/DesarrolloPy/DataSetOriginales",
)

def dfFix(df,col1 = False,col2 = False):
    """Return a copy of ``df`` trimmed to a contiguous range of columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is never modified.
    col1 : column label, optional
        When truthy, every column located before ``col1`` is dropped
        (``col1`` itself is kept).
    col2 : column label, optional
        When truthy, ``col2`` and every column after it are dropped. Its
        position is looked up after the ``col1`` trim has been applied.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns from ``col1`` (inclusive) up to
        ``col2`` (exclusive).
    """
    trimmed = df.copy()
    if col1:
        first = trimmed.columns.get_loc(col1)
        trimmed = trimmed.drop(columns = trimmed.columns[:first])
    if col2:
        stop = trimmed.columns.get_loc(col2)
        trimmed = trimmed.drop(columns = trimmed.columns[stop:])
    return trimmed

def concatDF(df1,df2):
    """Place ``df1`` and ``df2`` side by side (column-wise concatenation).

    The original column labels are discarded: the resulting columns are
    renumbered 0..n-1 because ``ignore_index`` is set.
    """
    frames = [df1, df2]
    combined = pd.concat(frames, axis = 1, ignore_index = True, sort = True)
    return combined

def dropRow(df,i):
    """Return a new frame without the row(s) whose index label is ``i``.

    ``i`` may be a single label or a list of labels; ``df`` is left untouched.
    """
    without_rows = df.drop(labels = i, axis = 0)
    return without_rows

def mkCSV(df,fileName):
    """Write ``df`` as a header-less, index-less CSV under ``DataSetFinales/``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to export. Rows in which every value is missing are dropped.
        The caller's frame is not modified.
    fileName : str
        Name of the output file; it is lower-cased before writing.

    Notes
    -----
    The output directory is created when it does not exist yet, so the export
    also works on a fresh checkout.
    """
    output_dir = 'DataSetFinales'
    os.makedirs(output_dir, exist_ok = True)

    cleaned = df.dropna(how = 'all')
    # Multiplying by 1 turns boolean columns into 0/1 and leaves the rest of
    # the data unchanged.
    cleaned = cleaned * 1
    target = output_dir + '/' + fileName.lower()
    # header and index are disabled so they do not show up in the CSV.
    cleaned.to_csv(target, sep = ',', header = False, index = False, encoding = 'utf-8')
    
def getPath(mainpath,filename):
    """Build the path of ``filename`` inside the ``mainpath`` directory."""
    full_path = os.path.join(mainpath, filename)
    return full_path

def fixBibliography(df):
    """Reshape the bibliography sheet so each country column becomes a row.

    Steps:
    1. Drop every column located before "GENERAL INFORMATION - COUNTRY LEVEL".
    2. Rename the remaining columns to GeneralInfo / CommunityCountry /
       RefugeeCountry (this fails unless exactly three columns remain).
    3. Use GeneralInfo as the index, transpose, and move the former column
       labels back into a regular column via ``reset_index``.
    """
    trimmed = dfFix(df,"GENERAL INFORMATION - COUNTRY LEVEL")
    trimmed.columns = ['GeneralInfo', 'CommunityCountry', 'RefugeeCountry']
    by_country = trimmed.set_index('GeneralInfo').transpose()
    return by_country.reset_index()

def getSubColumnNames(df,x):
    """Return the column labels of ``df`` with their first ``x`` characters removed.

    The shortened labels are returned as a single-column DataFrame, one label
    per row, in the original column order.
    """
    suffixes = [label[x:] for label in df.columns]
    return pd.DataFrame(suffixes)

def addInstitutionAndType(df,array1,array2,instType,index):
    """Append every remaining cell of ``df`` to ``array1`` and tag it in ``array2``.

    The row(s) labelled ``index`` are removed from ``df`` first, then every
    column that still contains a missing value is discarded. Each surviving
    cell (read row by row) is appended to ``array1`` while ``instType`` is
    appended to ``array2``, so both sequences grow in lockstep.

    Returns
    -------
    tuple
        The extended ``(array1, array2)``. When no cell survives, the inputs
        are returned as they were passed in.
    """
    kept = df.drop(index = index).dropna(axis = 1)
    cells = np.array(kept)
    # Values are appended one at a time on purpose: np.append re-derives the
    # array dtype at every step, and callers rely on the resulting coercion.
    for cell in (value for row in cells for value in row):
        array1 = np.append(array1, cell)
        array2 = np.append(array2, instType)
    return array1, array2

def politicalActor(df1,df2,df3,df4,df5,index):
    """Build a two-column table of institution names and institution types.

    Each input frame supplies the institutions of one category, in this order:
    public institutions, private institutions, NGOs, international agencies
    and local actors. The row(s) labelled ``index`` are skipped in every frame.

    Returns
    -------
    pandas.DataFrame
        Column 0 holds the institution names, column 1 the matching type.
    """
    sources = (
        (df1, 'Public Institution'),
        (df2, 'Private Institution'),
        (df3, 'NGO'),
        (df4, 'International Agency'),
        (df5, 'Local'),
    )

    names, kinds = [], []
    for frame, label in sources:
        names, kinds = addInstitutionAndType(frame, names, kinds, label, index)

    names_df = pd.DataFrame(names).reset_index(drop = True)
    kinds_df = pd.DataFrame(kinds).reset_index(drop = True)
    return concatDF(names_df, kinds_df)

def get_claveValor(df1,df2):
    """Pair every value of ``df2`` with the key found on the same row of ``df1``.

    For row ``i`` of ``df2``, each cell is emitted as a value and row ``i`` of
    ``df1`` is emitted as its key. The result has the keys in column 0 and the
    values in column 1.

    NOTE(review): np.append flattens the key row, so keys and values only stay
    aligned when ``df1`` has a single column -- confirm against the callers.
    """
    value_rows = np.array(df2)
    key_rows = np.array(df1)
    values = []
    keys = []
    for position, row in enumerate(value_rows):
        key = key_rows[position]
        for cell in row:
            values = np.append(values, cell)
            keys = np.append(keys, key)
    keys_df = pd.DataFrame(keys).reset_index(drop = True)
    values_df = pd.DataFrame(values).reset_index(drop = True)
    return concatDF(keys_df, values_df)

def get_FSClaveValor(df1,df2):
    """Attach the values of ``df2``, flattened column by column, next to ``df1``.

    ``df2`` is transposed and then read row by row, which walks the original
    frame one column at a time. The flattened values become a single column
    that is concatenated to the right of ``df1``.
    """
    transposed = np.array(df2.transpose())
    flattened = []
    for cell in (value for row in transposed for value in row):
        flattened = np.append(flattened, cell)
    return concatDF(df1, pd.DataFrame(flattened))

def get_valueBySector(df1,df2):
    """Keep only the rows of ``df2`` whose matching row in ``df1`` has no ``False``.

    Rows are matched by position: row ``i`` of ``df1`` decides the fate of the
    ``i``-th row of ``df2``.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Flag table. A row is rejected as soon as one of its cells compares
        equal to ``False``.
    df2 : pandas.DataFrame
        Data to filter; it is not modified.

    Returns
    -------
    pandas.DataFrame
        The surviving rows of ``df2`` with their original index labels
        (the index is named ``'index'``).

    Raises
    ------
    KeyError
        If ``df1`` flags a position that does not exist in ``df2``.
    """
    flags = np.array(df1)
    positional = df2.reset_index()
    # Collect each rejected position once. Dropping inside the loop would try
    # to remove the same label twice when a row holds several False values.
    # The comparison is `== False` (not `not cell`) on purpose, so NaN and
    # non-empty strings do not count as rejected.
    rejected = [
        position
        for position, row in enumerate(flags)
        if any(cell == False for cell in row)
    ]
    positional = positional.drop(index = rejected)
    return positional.set_index('index')

def separateValues(df):
    """Tokenise every cell of ``df`` and return the distinct tokens found.

    All cells are gathered into one corpus that is fed to the module-level
    ``count_vectorizer``; the vocabulary it learns is returned as a
    single-column DataFrame. The shared vectorizer is re-fitted on each call.
    """
    cells = np.array(df)
    corpus = []
    # Cells are appended one by one: np.append coerces non-string cells
    # (e.g. NaN) to text once the corpus holds strings, which the vectorizer
    # requires.
    for cell in (value for row in cells for value in row):
        corpus = np.append(corpus, [cell])
    count_vectorizer.fit(corpus)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back on older installations.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        tokens = count_vectorizer.get_feature_names_out()
    else:
        tokens = count_vectorizer.get_feature_names()
    return pd.DataFrame(tokens)

def vectorizeValue(df):
    """Flag which month names appear anywhere in the text cells of ``df``.

    The cells are tokenised with ``separateValues`` and each English month
    name (lower case) is looked up among the tokens.

    Returns
    -------
    pandas.DataFrame
        A single boolean column with 12 rows, January first; ``True`` means
        the month was mentioned.
    """
    months = ('january','february','march','april','may','june','july','august','september','october','november','december')
    tokens = np.array(separateValues(df))
    # Compare each individual token with the month name. The previous loop
    # compared the whole token row, which only worked for one-element rows.
    vocabulary = set(tokens.ravel().tolist())
    flags = np.array([month in vocabulary for month in months], dtype = bool)
    return pd.DataFrame(flags)

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\guill\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     C:\Users\guill\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     C:\Users\guill\AppData\Roaming\nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     C:\Users\guill\AppData\Roaming\nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     C:\Users\guill\AppData\Roaming\nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]   

In [2]:
#%% Load every source file (one Excel sheet, the rest CSV) into a DataFrame
# All files are read from the `mainpath` directory.
# The bibliography is the only Excel source; fixBibliography reshapes it right after loading.
Bibliography = pd.read_excel(getPath(mainpath,"Bibliography_120220.xlsx"))
Bibliography = fixBibliography(Bibliography)
# Survey / interview / focus-group exports, loaded without further processing.
Entities = pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Entities_Interview_results.csv"))
LocalLeaders = pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Local_leaders_v3_results.csv"))
HouseHold = pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Survey_household_v6_results.csv"))
WomenGroup = pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Women_Focus_Group2_results.csv"))
SanitationInfra = pd.read_csv(getPath(mainpath,"NAUTIA_V1_0_Sanitation_Infrastructre_results.csv"))
Priorities = pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Priorities_v3_results.csv"))
GeneralForm = pd.read_csv(getPath(mainpath,"NAUTIA_1_0_General_form_v3_results.csv"))
PublicSpace = pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Public_Space_results.csv"))
WaterInf = pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Water_Infrastructure_results.csv"))
# NOTE(review): SanitationInf reads the same file as SanitationInfra above -- confirm both names are needed.
SanitationInf = pd.read_csv(getPath(mainpath,"NAUTIA_V1_0_Sanitation_Infrastructre_results.csv"))
WasteManagementInf = pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Waste_Management_Infrastructure_results.csv"))
EnergyINF = pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Energy_Infrastructure_results.csv"))
Business = pd.read_csv(getPath(mainpath,"NAUTIA1_0_Business_surveys_v3_results.csv"))
MobilityINF = pd.read_csv(getPath(mainpath,"NAUTIA_1_0__Transport_servicesaccess_points_results.csv")) 
ComunalServices = pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Communal_Services_results.csv")) 
GeneralCitizen = pd.read_csv(getPath(mainpath,"NAUTIA_1_0_General_Citizen_Focus_Group_results.csv"))
Shelter = pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Shelter_results.csv"))
FarmyardCrop = pd.read_csv(getPath(mainpath,"NAUTIA_1_0_Farmyard_and_Crops_results.csv"))

In [3]:
# Show every column and every row when a DataFrame is displayed (no truncation).
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [4]:
Bibliography.shape

(2, 192)

In [5]:
Entities.shape

(5, 72)

In [6]:
LocalLeaders.shape

(8, 22)

In [31]:
HouseHold.shape

(29, 42)

In [8]:
WomenGroup.shape

(4, 27)

In [9]:
SanitationInfra.shape

(43, 14)

In [10]:
Priorities.shape

(5, 88)

In [11]:
GeneralForm.shape

(1, 19)

In [12]:
PublicSpace.shape

(9, 14)

In [13]:
WaterInf.shape

(38, 13)

In [14]:
SanitationInf.shape

(43, 14)

In [15]:
WasteManagementInf.shape

(3, 10)

In [16]:
EnergyINF.shape

(12, 38)

In [17]:
Business.shape

(21, 25)

In [18]:
MobilityINF.shape

(5, 10)

In [27]:
ComunalServices.shape

(83, 63)

In [81]:
GeneralCitizen.shape

(4, 43)

In [21]:
Shelter.shape

(28, 27)

In [84]:
FarmyardCrop

Unnamed: 0,start,end,Type_of_setlement,Type_of_Host_Community,Record_your_current_location:Latitude,Record_your_current_location:Longitude,Record_your_current_location:Altitude,Record_your_current_location:Accuracy,Item,Property,Drainage,Irrigation,Irrigation_details:Water_pump,Irrigation_details:Available,Irrigation_details:Type_of_water_pump,Irrigation_details:Power,Fertilizer,meta:instanceID
0,2019-11-16 12:32:36.147,2019-11-18 23:41:05.842,refugee_camp,,27.49574,-7.823822,521.69043,4.9,farmyard,private__insid,no,,,,,,,uuid:2d9aff6b-ba62-4d2d-926f-ac21e8e5d1e7
1,2019-11-21 12:52:59.843,2019-11-21 12:53:32.057,refugee_camp,,27.488802,-7.828352,457.556263,6.0,farmyard,public,no,,,,,,,uuid:ee881861-af08-45c8-a8da-b91633635a75
2,2019-11-21 18:24:42.63,2019-11-21 18:25:01.474,refugee_camp,,27.487468,-7.830206,452.842218,4.0,farmyard,public,no,,,,,,,uuid:0ed2e308-2fb2-4978-bc3a-b71d1dde8e1f
3,2019-11-26 10:24:55.365,2019-11-26 10:25:23.268,refugee_camp,,27.495528,-7.823089,462.861317,8.0,farmyard,private__insid,no,,,,,,,uuid:79301090-ed4b-4b91-9131-0880cb6741d8
4,2019-11-11 09:08:48.96,2019-11-11 09:09:49.417,refugee_camp,,27.497749,-7.828585,463.28656,3.216,crop_area,public,,yes,yes,yes,other,,,uuid:44c0dde7-1ffb-4f24-873a-a313c791a693
5,2019-11-11 10:21:54.813,2019-11-11 10:22:23.522,refugee_camp,,27.499611,-7.824611,468.409302,4.288,farmyard,public,no,,,,,,,uuid:86644a0c-5c75-4393-923a-5cfd6350e254
6,2019-12-02 18:24:43.885,2019-12-02 18:25:24.463,refugee_camp,,27.486246,-7.823924,457.145956,4.0,crop_area,private__insid,,no,,,,,no,uuid:fc3d784e-5195-4498-9967-dbd2658a774f
7,2019-11-27 11:16:46.185,2019-11-27 13:29:59.869,refugee_camp,,27.497384,-7.83321,466.942444,4.7,farmyard,private__insid,no,,,,,,,uuid:3c3be5b9-5395-46e0-8127-c1895e110e8c
8,2019-11-27 11:17:35.142,2019-11-27 11:17:53.068,refugee_camp,,27.497743,-7.833182,461.898041,4.9,farmyard,private__insid,no,,,,,,,uuid:f5533729-2e9f-44b3-8516-7c91bc3f8472
9,2019-11-27 12:39:20.222,2019-11-27 12:39:49.054,refugee_camp,,27.49472,-7.828418,455.605743,4.7,crop_area,public,,no,,,,,yes,uuid:edb53430-bd20-473d-be14-114b390e62de


In [23]:
PublicSpace.shape

(9, 14)

In [74]:
def get_number(df):
    """Flatten ``df`` row by row into a single-column DataFrame.

    Every cell is appended, in reading order, to an array that starts out as
    an empty float array; the array is then wrapped in a DataFrame.
    """
    cells = np.array(df)
    flattened = np.array([])
    for cell in (value for row in cells for value in row):
        flattened = np.append(flattened, cell)
    return pd.DataFrame(flattened)

In [85]:
# Build the FS_CropUbication table from the Farmyard & Crops survey and export it.
# dfFix keeps the columns from its first label up to (not including) the second one.
# df1: the "Item" column turned into a boolean flag that is True for "farmyard" rows.
df1 = dfFix(FarmyardCrop,"Item","Property")
df1 = df1.isin(["farmyard"])
# df2: location columns from Latitude up to (excluding) Accuracy.
df2 = dfFix(FarmyardCrop,"Record_your_current_location:Latitude","Record_your_current_location:Accuracy")
# df3: the "Property" column.
df3 = dfFix(FarmyardCrop,"Property","Drainage")
# df4: the "Irrigation" column as a boolean that is True only for "yes".
df4 = dfFix(FarmyardCrop,"Irrigation","Irrigation_details:Water_pump")
df4 = df4.isin(["yes"]) # NOTE: NaN is not the same as "no", yet both become False here. To be reviewed.
# Side-by-side concatenation; concatDF renumbers the columns 0..n-1.
FS_CropUbication = concatDF(df2,concatDF(df3,df4))
# Keep only the rows whose df1 flag is not False, i.e. the "farmyard" rows.
# NOTE(review): the table name suggests crop rows were intended -- confirm the filter.
FS_CropUbication =get_valueBySector(df1,FS_CropUbication)
# mkCSV lower-cases the file name and writes it under DataSetFinales/.
mkCSV(FS_CropUbication,"FS_CropUbication.csv")


In [86]:
FS_CropUbication

Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,27.49574,-7.823822,521.69043,private__insid,False
1,27.488802,-7.828352,457.556263,public,False
2,27.487468,-7.830206,452.842218,public,False
3,27.495528,-7.823089,462.861317,private__insid,False
5,27.499611,-7.824611,468.409302,public,False
7,27.497384,-7.83321,466.942444,private__insid,False
8,27.497743,-7.833182,461.898041,private__insid,False
10,27.491189,-7.825525,451.051514,private__insid,False
11,40.386818,-3.699125,0.0,public,False
16,,,,private__insid,False


In [62]:
FS_ImportantMeal_has_Community

Unnamed: 0,0
0,15.0
1,15.0
2,5.0
3,15.0
4,
5,50.0
6,
7,
8,
9,
