### Code for processing Freedom House data

### Sanittawan Nikki Tan

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
fh = pd.read_excel('./2018_FH_ratings.xlsx', sheet_name=1, skiprows=[0], header=[0,1])

In [3]:
fh.head()

Unnamed: 0_level_0,Year(s) Under Review,1972,1972,1972,1973,1973,1973,1974,1974,1974,...,2014,2015,2015,2015,2016,2016,2016,2017,2017,2017
Unnamed: 0_level_1,Unnamed: 0_level_1,PR,CL,Status,PR,CL,Status,PR,CL,Status,...,Status,PR,CL,Status,PR,CL,Status,PR,CL,Status
0,Afghanistan,4,5,PF,7,6,NF,7,6,NF,...,NF,6,6,NF,6,6,NF,5,6,NF
1,Albania,7,7,NF,7,7,NF,7,7,NF,...,PF,3,3,PF,3,3,PF,3,3,PF
2,Algeria,6,6,NF,6,6,NF,6,6,NF,...,NF,6,5,NF,6,5,NF,6,5,NF
3,Andorra,4,3,PF,4,4,PF,4,4,PF,...,F,1,1,F,1,1,F,1,1,F
4,Angola,-,-,-,-,-,-,-,-,-,...,NF,6,6,NF,6,6,NF,6,6,NF


In [4]:
# ref: https://stackoverflow.com/questions/14507794/pandas-how-to-flatten-a-hierarchical-index-in-columns
multi_index = fh.columns
mi_list = multi_index.tolist()
mi_list
ind = pd.Index([str(e[1]).strip() + str(e[0]).strip() for e in mi_list])
fh.columns = ind

In [5]:
fh.columns

Index(['Unnamed: 0_level_1Year(s) Under Review', 'PR1972', 'CL1972',
       'Status1972', 'PR1973', 'CL1973', 'Status1973', 'PR1974', 'CL1974',
       'Status1974',
       ...
       'Status2014', 'PR2015', 'CL2015', 'Status2015', 'PR2016', 'CL2016',
       'Status2016', 'PR2017', 'CL2017', 'Status2017'],
      dtype='object', length=136)

In [6]:
fh = fh.rename(columns={'Unnamed: 0_level_1Year(s) Under Review':'country'})

In [7]:
fh.head()

Unnamed: 0,country,PR1972,CL1972,Status1972,PR1973,CL1973,Status1973,PR1974,CL1974,Status1974,...,Status2014,PR2015,CL2015,Status2015,PR2016,CL2016,Status2016,PR2017,CL2017,Status2017
0,Afghanistan,4,5,PF,7,6,NF,7,6,NF,...,NF,6,6,NF,6,6,NF,5,6,NF
1,Albania,7,7,NF,7,7,NF,7,7,NF,...,PF,3,3,PF,3,3,PF,3,3,PF
2,Algeria,6,6,NF,6,6,NF,6,6,NF,...,NF,6,5,NF,6,5,NF,6,5,NF
3,Andorra,4,3,PF,4,4,PF,4,4,PF,...,F,1,1,F,1,1,F,1,1,F
4,Angola,-,-,-,-,-,-,-,-,-,...,NF,6,6,NF,6,6,NF,6,6,NF


In [8]:
fh.columns

Index(['country', 'PR1972', 'CL1972', 'Status1972', 'PR1973', 'CL1973',
       'Status1973', 'PR1974', 'CL1974', 'Status1974',
       ...
       'Status2014', 'PR2015', 'CL2015', 'Status2015', 'PR2016', 'CL2016',
       'Status2016', 'PR2017', 'CL2017', 'Status2017'],
      dtype='object', length=136)

In [9]:
fh["id"] = fh.index
fh = pd.wide_to_long(fh, ["PR", "CL", "Status"], i=["id", "country"], j="year", suffix='[\w\d\W.-]+')

In [10]:
fh.index.get_level_values(level=2)

Index(['1972', '1973', '1974', '1975', '1976', '1977', '1978', '1979', '1980',
       'Jan.1981-Aug. 1982',
       ...
       '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016',
       '2017'],
      dtype='object', name='year', length=9270)

In [11]:
# https://stackoverflow.com/questions/40855900/pandas-rename-index-values
fh = fh.rename({'Jan.1981-Aug. 1982': '1982',
            'Aug.1982-Nov.1983': '1983',
            'Nov.1983-Nov.1984': '1984',
            'Nov.1984-Nov.1985': '1985',
            'Nov.1985-Nov.1986': '1986',
            'Nov.1986-Nov.1987': '1987',
            'Nov.1987-Nov.1988': '1988',
            'Nov.1988-Dec.1989': '1989'}, axis='index', level=2)

In [12]:
final_fh = pd.DataFrame(fh.to_records())
final_fh = final_fh.astype(dtype={'year': 'int64'})

In [13]:
final_fh = final_fh.drop(final_fh[final_fh['year'] < 2008].index)

In [14]:
final_fh = final_fh[(final_fh['PR'] != '-') | (final_fh['CL'] != '-')]

In [15]:
final_fh = final_fh.drop('id', axis=1)

In [16]:
final_fh['Status'] = final_fh['Status'].apply(lambda x: x.strip())

In [17]:
final_fh = final_fh.astype(dtype={'PR': 'int64',
                                  'CL': 'int64'})

In [18]:
final_fh['country'] = final_fh['country'].apply(lambda x: x.strip())

In [19]:
final_fh['Status'].unique()

array(['NF', 'PF', 'F'], dtype=object)

In [20]:
final_fh['FH_category'] = 0

In [21]:
final_fh['FH_category'].loc[final_fh['Status'] == 'F'] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [22]:
final_fh['FH_category'].loc[final_fh['Status'] == 'PF'] = 1

In [27]:
dummies = pd.get_dummies(final_fh['Status'], prefix='FH')

In [28]:
final_fh = pd.concat([final_fh, dummies], axis=1)

In [30]:
final_fh.head()

Unnamed: 0,country,year,PR,CL,Status,FH_category,FH_F,FH_NF,FH_PF
35,Afghanistan,2008,5,6,NF,0,0,1,0
36,Afghanistan,2009,6,6,NF,0,0,1,0
37,Afghanistan,2010,6,6,NF,0,0,1,0
38,Afghanistan,2011,6,6,NF,0,0,1,0
39,Afghanistan,2012,6,6,NF,0,0,1,0


In [31]:
final_fh.to_csv('./cleaned_FH.csv', index=False)