
The objective of this project was to build an integrated database of electoral demand for specific countries and election years. To that end, selected variables of interest were extracted from the Latinobarometer surveys for the relevant years.

Main skills developed include:

- Data cleaning and pre-processing: raw data processing including NA treatment and standardization;
- Advanced dataframe manipulation with *Pandas* and *Numpy*;
- Advanced Survey

# Political Demand Variables - Latinobarometer database - 'Latinobarometros.csv'

In [16]:

## Load every Latinobarometer wave, keeping only the variables of interest

anos = ['1995', '1998', '2000', '2002', '2004', '2005', '2006', '2008', '2009',
        '2010', '2011', '2013', '2015', '2016', '2017', '2020', '2023']

# Source column names per wave. Each list follows the same positional order,
# because the columns are renamed by position below to:
# pais, ano, peso, posi_pol, sit_econ, princ_prob, conf_cong, conf_part, conf_presi.
# NOTE: the name `vars` shadows the built-in `vars()`; it is kept because it is
# a module-level name other cells may rely on.
vars = {
    '1995': ['pais', 'numero', 'wt', 'p31', 'p1', 'p13', 'p27i', 'p27j', 'p27m'],
    '1998': ['idenpa', 'numinves', 'pondera', 'sp52', 'sp1', 'sp8', 'sp38f', 'sp38g', 'sp38d'],
    '2000': ['idenpa', 'numinves', 'wt', 'P52ST', 'P1ST', 'P12ST', 'P35ST_F', 'P35ST_G', 'P35ST_D'],
    '2002': ['idenpa', 'numinves', 'wt', 'p64st', 'p2sta', 'p4st', 'p36std', 'p34stf', 'p34std'],
    '2004': ['idenpa', 'numinves', 'wt', 'p87st', 'p2st', 'p10st', 'p34stf', 'p34std', 'p34stc'],
    '2005': ['idenpa', 'numinves', 'wt', 'p34st', 'p2st', 'p8st', 'p45sta', 'p47stb', 'p45stc'],
    '2006': ['idenpa', 'numinves', 'wt', 'p47st', 'p2st', 'p10st', 'p24st_f', 'p24st_c', 'p24st_b'],
    '2008': ['idenpa', 'numinves', 'wt', 'p56st', 'p4st', 'p9st', 'p28st_a', 'p28st_c', 'p31s_ta'],
    '2009': ['idenpa', 'numinves', 'wt', 'p69st', 'p3st_a', 'p2st', 'p26st_a', 'p26st_c', 'p24st_a'],
    '2010': ['idenpa', 'numinves', 'wt', 'P60ST', 'P3ST_A', 'P2ST', 'P20ST_A', 'P20ST_C', 'P18ST_A'],
    '2011': ['idenpa', 'numinves', 'wt', 'P76ST', 'P3ST_A', 'P2ST', 'P22ST_A', 'P22ST_C', 'P20ST_A'],
    '2013': ['idenpa', 'numinves', 'wt', 'P41ST', 'P3STGBS', 'P9STGBS', 'P26TGB_C', 'P26TGB_G', 'P26TGB_A'],
    '2015': ['idenpa', 'numinves', 'wt', 'P27ST', 'P3STGBS', 'P9STGBS', 'P16ST_F', 'P19ST_C', 'P16ST_G'],
    '2016': ['idenpa', 'numinves', 'wt', 'P17ST', 'P4STGBS', 'P3STGBS', 'P13STD', 'P13STG', 'P13STE'],
    '2017': ['idenpa', 'numinves', 'wt', 'P19STC', 'P4STGBSC', 'P3STGBS', 'P14ST_D', 'P14ST_G', 'P14ST_E'],
    '2020': ['idenpa', 'numinves', 'wt', 'p18st', 'p4stgbs', 'p3stgbs', 'p13st_d', 'p13st_g', 'p13st_e'],
    '2023': ['idenpa', 'numinves', 'wt', 'P16ST', 'P5STGBS', 'P4STGBS', 'P13ST_D', 'P13ST_G', 'P13ST_I'],
}

# Harmonized column names shared by every wave after loading.
standard_columns = ['pais', 'ano', 'peso', 'posi_pol', 'sit_econ',
                    'princ_prob', 'conf_cong', 'conf_part', 'conf_presi']

bases_latin = {}
# At module/cell level locals() is the global namespace, so writing to it
# creates one variable per wave (base_1995, base_1998, ...).
vars_dinamicas = locals()
for ano in anos:
    wave_key = f"base_{ano}"
    # read_stata is documented to return the requested columns in the given
    # order, which is what makes the positional rename below valid.
    wave_df = pd.read_stata(f"Bases finais/Bases latinobarometro/{ano}.dta", columns=vars[ano])
    wave_df.columns = standard_columns
    vars_dinamicas[wave_key] = wave_df
    bases_latin[wave_key] = wave_df

# Stack all waves into a single long table with a fresh 0..n-1 index.
lbcomp = pd.concat(bases_latin.values(), ignore_index=True)

In [17]:
# Cleaning variables

##Creating and applying function to standardize and reorder categories 

#Confidence variables

def remap_category(value):
    """Collapse a raw confidence answer into one of five standard labels.

    The raw answers mix English and Spanish wordings. Each known wording is
    mapped to "A lot", "Some", "A little" or "No confidence at all".

    Parameters
    ----------
    value : object
        Raw answer as found in a confidence column.

    Returns
    -------
    str
        The standardized label. Anything that is not a known wording
        (including missing values such as NaN or None) is returned as
        "Don't know/No answer".
    """
    if value in ('A lot', 'A lot of confidence', 'Mucha', 'Lot'):
        return "A lot"
    if value in ('Algo', 'Some confidence', 'Some'):
        return "Some"
    if value in ('A little', 'Little', 'Little confidence', 'Poca'):
        return "A little"
    # "Some confidence" used to be listed here as well; it could never match
    # (the "Some" group is tested first) and contradicted this label.
    if value in ('Ninguna', 'No confidence at all', 'No trust', 'Nothing'):
        return "No confidence at all"
    return "Don't know/No answer"
    
# Standardize the three confidence columns, turn them into categoricals and
# give all of them the same category order (most to least confidence, with
# the non-answer label last).
confidence_order = ["A lot", "Some", "A little", "No confidence at all",
                    "Don't know/No answer"]

for conf_col in ('conf_cong', 'conf_part', 'conf_presi'):
    lbcomp[conf_col] = lbcomp[conf_col].map(remap_category)

lbcomp = lbcomp.astype({conf_col: 'category'
                        for conf_col in ('conf_presi', 'conf_cong', 'conf_part')})

# reorder_categories requires the column to contain exactly these labels.
for conf_col in ('conf_presi', 'conf_part', 'conf_cong'):
    lbcomp[conf_col] = lbcomp[conf_col].cat.reorder_categories(confidence_order)

In [18]:
#Political and economic variables

#Current economic situation

def remap_category_econ(value):
    """Standardize a raw answer about the current economic situation.

    Known English and Spanish wordings are mapped to "Very Good", "Good",
    "About Average", "Bad" or "Very Bad". Any other value, including
    missing values, is returned as "Don't know/No answer".
    """
    # NOTE(review): the Spanish wording 'Buena' is not listed under "Good" and
    # 'Muy Mala' is matched only with a capital M -- confirm against the raw
    # labels present in the data.
    label_groups = (
        ("Very Good", ('Muy buena', 'Very good')),
        ("Good", ('Good',)),
        ("About Average", ('About average', 'Regular')),
        ("Bad", ('Bad', 'Mala')),
        ("Very Bad", ('Muy Mala', 'Very bad')),
    )
    for label, raw_answers in label_groups:
        if value in raw_answers:
            return label
    return "Don't know/No answer"
    
# Standardize the economic-situation answers and store them as a categorical
# ordered from best to worst, with the non-answer label last.
sit_econ_order = ["Very Good", "Good", "About Average", "Bad", "Very Bad",
                  "Don't know/No answer"]
lbcomp['sit_econ'] = (
    lbcomp['sit_econ']
    .map(remap_category_econ)
    .astype('category')
    .cat.reorder_categories(sit_econ_order)
)

In [19]:
#Political position

def remap_category_posi(value):
    """Standardize a raw left-right self-placement answer.

    The labelled scale endpoints become "10" (right) and "0" (left), the
    values '6' and -6 become "6", and the known non-answer wordings become
    "Don't know/No answer". Every other value is returned unchanged.
    """
    label_groups = (
        ("10", ('Right', 'Derecha', '10 RIGHT', '10 Right', '10. Right')),
        ("0", ('0. Left', '00 LEFT', '00 Left', 'Izquierda', 'Left')),
        ("6", ('6', -6)),
        ("Don't know/No answer",
         ('Does not answer', "Don't know", 'Don´t answer', 'Don´t know', 'DonÂ´t know',
          'Ninguno', 'No answer', 'No answer/Refused', 'No responde', 'No sabe', 'None')),
    )
    for label, raw_answers in label_groups:
        if value in raw_answers:
            return label
    # Intermediate scale points pass through as they are.
    return value
    
# Standardize the left-right placement and store it as a categorical running
# from "0" to "10", with the non-answer label last.
posi_pol_order = [str(ponto) for ponto in range(11)] + ["Don't know/No answer"]
lbcomp['posi_pol'] = (
    lbcomp['posi_pol']
    .map(remap_category_posi)
    .astype('category')
    .cat.reorder_categories(posi_pol_order)
)

In [20]:

#Main country problem

def remap_category_prob(value):
    """Group a raw "main problem of the country" answer into a broad theme.

    The groups are tested in the order listed below and the first group that
    contains ``value`` gives the returned theme label. A value found in no
    group (including missing values) is returned as "Outros temas".
    """
    theme_groups = (
        ("Economia e desenvolvimento", (
            'All expensive', 'Bajos salarios', 'Deficient basic services (water, electricity, ...)',
            'Desabastecimiento/Falta de alimentos/Acaparamiento', 'DesocupaciÃ³n / desempleo',
            'DistribuciÃ³n del ingreso, injusticia social', 'Economy/ Economic problems/Financial problems',
            'Economy/ Economic problems/financial problems', 'Economy/economical problems/financial',
            'Employment inestability', 'Employment instability', 'Everything is very expensive', 'Food, scarcity',
            'Gas, fuel, scarcity, (high prices)', 'Income distribution', 'Income inequality, social injustice',
            'Inestabilidad en el empleo', 'InflaciÃ³n / aumento de precios', 'Inflation/ price rises',
            'Inflation/ price rises / Economic Crisis', 'Inflation/ raise of prices', 'Inflation/rise in prices',
            'Instability in employment', 'La economÃ\xada/problemas econÃ³micos/financieros', 'Low Salaries',
            'Low salaries', 'Pobreza', 'Poverty', 'Poverty / Social Inequality', 'Shortages / Lack of food / Hoarding',
            'Unemployment', 'shortage of groceries /Hoarding/ food is missing')),
        ("Corrupção, governança e transparência", (
            'A lot of dirt', 'CorrupciÃ³n', 'Corruption', 'Dirtiness, lack of cleanliness')),
        ("Saúde e bem-estar", (
            'Coronavirus pandemic/ Covid-19', 'Drug consumption / addiction',
            'Drugs consumption /addiction', 'Health', 'Health Problems', 'Health issues', 'Health problems',
            'Pandemic/Coronavirus/Covid-19', 'Problemas de la salud')),
        ("Segurança e justiça", (
            'Crime', 'Crime / Public security', 'Crime /Public Security', 'Crime/public security',
            'Delincuencia / seguridad pÃºblica', 'Delinquency / public security', 'Drug trafficking',
            'Guerrilla warfare', 'Judicial System/ justice', 'Narcotrafic', 'NarcotrÃ¡fico', 'Terrorism',
            'Terrorism / Political violence / Guerrilla', 'Terrorism/ war', 'Terrorism/political violence/guerrilla',
            'Terrorismo/guerrilla', 'Verbal abuse', 'Verbal violence', 'Violence / gangs', 'Violence, bands',
            'Violence/gangs', 'Violencia/pandillas')),
        ("Política externa e relações internacionais", (
            'Border conflicts', 'Border issues', 'Border problems', 'Conflicts with neighboring countries',
            'Foreigners', 'Foreing people', 'Immigrants', 'Problemas con paÃ\xadses vecinos',
            'Problemas limÃ\xadtrofes', 'Problems with neighbor countries')),
        ("Igualdade e inclusão", (
            'DiscriminaciÃ³n racial', 'Discrimination by race', 'Domestic violence', 'Racial Discrimination',
            'Racial discrimination')),
        # No raw answer is assigned to this theme, so it is never returned.
        ("Valores e tradição", ()),
        ("Educação e conhecimento", (
            'Education', 'Education problems', 'Problemas de la educaciÃ³n')),
        ("Meio ambiente e sustentabilidade", (
            'Calentamiento global', 'Environment', 'Environment problems', 'Environmental problems', 'Global heating',
            'Global warming', 'Pollution', 'Problemas del medio ambiente/contaminaciÃ³n', 'Contamination',
            'environmental issues')),
        ("Múltiplos temas", (
            'Instability in employment / Immigration',)),
        ("Não sabe/ Não respondeu", (
            'Does not answer', "Don't know", 'DonÂ´t know', 'Ninguno', 'No answer', 'No answer/Refused', 'No problem',
            'No problems', 'No responde', 'No sabe', 'None', 'Not asked', 'Not being able to be in my country')),
    )
    for theme, raw_answers in theme_groups:
        if value in raw_answers:
            return theme
    return "Outros temas"
    
# Group the raw "main problem" answers into themes and store them as a categorical.
lbcomp['princ_prob'] = lbcomp['princ_prob'].map(remap_category_prob).astype('category')


In [21]:
# Fix the display order of the theme categories. reorder_categories requires
# the column to contain exactly these labels.
princ_prob_order = [
    "Economia e desenvolvimento",
    "Corrupção, governança e transparência",
    "Saúde e bem-estar",
    "Segurança e justiça",
    "Política externa e relações internacionais",
    "Igualdade e inclusão",
    "Educação e conhecimento",
    "Meio ambiente e sustentabilidade",
    "Múltiplos temas",
    "Outros temas",
    "Não sabe/ Não respondeu",
]
lbcomp['princ_prob'] = lbcomp['princ_prob'].cat.reorder_categories(princ_prob_order)

lbcomp.head()

Unnamed: 0,pais,ano,peso,posi_pol,sit_econ,princ_prob,conf_cong,conf_part,conf_presi
0,Chile,1995,0.90782,5,Good,Segurança e justiça,Some,A little,Some
1,Chile,1995,0.90782,9,About Average,Economia e desenvolvimento,Some,Some,A little
2,Chile,1995,0.90782,5,Bad,Economia e desenvolvimento,A little,A little,Some
3,Chile,1995,0.90782,0,About Average,Economia e desenvolvimento,A little,Some,Some
4,Chile,1995,0.90782,5,About Average,Economia e desenvolvimento,Some,A little,A little


In [22]:
# Filtering to the countries and years that match the political-offer database

In [23]:
# Load the political-offer table, reading its three appeal columns as categoricals.
appeal_dtypes = dict.fromkeys(['tipo_apelo', 'emocao_apelo', 'tom_tempo_apelo'], 'category')
met = pd.read_csv('Bases finais/metabaseeclassificacao.csv', dtype=appeal_dtypes)

# One row per distinct (country, election year) pair, sorted by election year.
a = (
    met[['pais', 'ano_eleicao']]
    .drop_duplicates()
    .sort_values('ano_eleicao')
    .reset_index(drop=True)
)

a.head()

Unnamed: 0,pais,ano_eleicao
0,Argentina,1983
1,Argentina,1989
2,Argentina,1995
3,Argentina,1999
4,México,2000


In [29]:
# merge_asof needs the left frame sorted by its key; the key is also cast to
# int64 so that it can be compared with the election year.
lbcomp.sort_values('ano', inplace = True)
lbcomp['ano'] = lbcomp['ano'].astype('int64')

# Removing blank spaces before and after country names and standardizing them
# (English and mis-encoded spellings -> the spellings used from here on).
lbcomp['pais'] = lbcomp['pais'].str.strip()
lbcomp["pais"] = lbcomp["pais"].replace({
    'Brazil': 'Brasil',
    'Ecuador': 'Equador',
    'Mexico': 'México',
    'Uruguay': 'Uruguai',
    'Nicaragua': 'Nicarágua',
    'Venezuela, RB': 'Venezuela',
    'Venezuela (Bolivarian Republic of)': 'Venezuela',
    'MÃ©xico': 'México',
    'PerÃº': 'Peru',
})

# Merging the political offer database with the demand database (survey taken
# one year before the election): direction='forward' with tolerance=1 and
# allow_exact_matches=False only accepts an election year strictly after the
# survey year and at most one year later, within the same country.
ten2 = pd.merge_asof(lbcomp, a, tolerance = 1, left_on='ano', right_on='ano_eleicao',
                    by='pais', allow_exact_matches=False, direction='forward')
# Survey rows with no election in the following year are discarded.
ten2 = ten2.dropna(subset=['ano_eleicao']).reset_index(drop=True)

# (country, survey year) pairs for which the survey of the election year
# itself is used. This is the single source for both the rows appended below
# and the rows whose election year is filled in afterwards.
same_year_pairs = [
    ('Argentina', 1995),
    ('Argentina', 2015),
    ('Chile', 2013),
    ('Equador', 2013),
    ('Honduras', 2013),
    ('México', 2000),
    ('Venezuela', 2013),
]

# Joining vertically the same-year survey rows to the merged table
same_year_rows = [lbcomp[(lbcomp['pais'] == country) & (lbcomp['ano'] == year)]
                  for country, year in same_year_pairs]
df = pd.concat([ten2] + same_year_rows, ignore_index=True)

# For those pairs the election year is the survey year; every other row keeps
# the election year found by the merge.
# NOTE(review): the conditions look only at (pais, ano), so rows of ten2 with
# one of these pairs, if any exist, would also have their election year
# overwritten -- confirm that none do.
conditions = [(df['pais'] == country) & (df['ano'] == year)
              for country, year in same_year_pairs]
choices = [df['ano']] * len(conditions)

df['ano_eleicao'] = np.select(conditions, choices, default=df['ano_eleicao'])

# Final layout: sort by survey year, move the election year to the third
# column, store it as a nullable integer and rename the survey-year column.
df = df.sort_values('ano').reset_index(drop=True)
df.insert(2, 'ano_eleicao', df.pop('ano_eleicao'))
df = df.astype({'ano_eleicao':'Int64'})
df.rename(columns={'ano':'ano_survey'}, inplace = True)
# These two names are accented only after the merge, which was keyed on the
# unaccented spellings.
df["pais"] = df["pais"].replace(['Panama', 'Colombia'],
                                ['Panamá', 'Colômbia'])
#df.to_csv('Bases finais/basedemandaeleitoral.csv', index=False)
print(df.shape)
df.pais.unique()

(43178, 10)


array(['Argentina', 'México', 'Chile', 'Equador', 'Brasil', 'Colômbia',
       'Uruguai', 'Panamá', 'Honduras', 'Costa Rica', 'Peru', 'Nicarágua',
       'Guatemala', 'Venezuela'], dtype=object)

In [5]:
# Reload the exported electoral-demand table with explicit column types.
dem_dtypes = {
    'pais': 'category',
    'sit_econ': 'category',
    'princ_prob': 'category',
    'conf_cong': 'category',
    'conf_part': 'category',
    'conf_presi': 'category',
    'peso': 'float64',
    'ano_eleicao': 'category',
}
dem = pd.read_csv("Bases finais/basedemandaeleitoral.csv", dtype=dem_dtypes)

pais           category
ano_survey        int64
ano_eleicao       int64
peso             object
posi_pol         object
sit_econ       category
princ_prob     category
conf_cong      category
conf_part      category
conf_presi     category
dtype: object