In [1]:
from warnings import filterwarnings

filterwarnings('ignore')

In [2]:
from copy import deepcopy
import pandas as pd
import numpy as np
from more_itertools import flatten

In [3]:
pd.set_option('display.max_columns', None)

In [4]:
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [5]:
trade_df = pd.read_csv('data/dyadic_trade_3.0.csv', encoding = 'utf8')

trade_df.rename({'flow1': 'money_flow_1', 'flow2': 'money_flow_2',
         'ccode1': 'c_code_1', 'ccode2': 'c_code_2',
         'importer1': 'state_name_1', 'importer2': 'state_name_2'},
        axis = 1, inplace = True)

trade_df['c_code_1'] = trade_df['c_code_1'].astype(int)
trade_df['c_code_2'] = trade_df['c_code_2'].astype(int)

trade_df['money_flow_1'] = ([s * 1000000 for s in trade_df['money_flow_1']])
trade_df['money_flow_2'] = ([s * 1000000 for s in trade_df['money_flow_2']])

trade_df.loc[trade_df['money_flow_1'] == -9000000, 'money_flow_1'] = None
trade_df.loc[trade_df['money_flow_2'] == -9000000, 'money_flow_2'] = None

trade_df = trade_df.sort_values(by = 'year', ascending = True).reset_index()

trade_df.drop(['source1', 'source2', 'bel_lux_alt_flow1', 'bel_lux_alt_flow2', 'china_alt_flow1', 'china_alt_flow2', 'version', 'index'], axis = 1, inplace = True)

In [6]:
trade_df_copy = deepcopy(trade_df)

trade_df_copy.rename({'c_code_1': 'c_code_1_copy', 'c_code_2':'c_code_2_copy',
         'state_name_1': 'state_name_1_copy', 'state_name_2': 'state_name_2_copy',
         'money_flow_1': 'money_flow_1_copy', 'money_flow_2': 'money_flow_2_copy'},
        axis = 1, inplace = True)

trade_df_copy.rename({'c_code_1_copy': 'c_code_2', 'c_code_2_copy':'c_code_1',
         'state_name_1_copy': 'state_name_2', 'state_name_2_copy': 'state_name_1',
         'money_flow_1_copy': 'money_flow_2', 'money_flow_2_copy': 'money_flow_1'},
        axis = 1, inplace = True)

In [7]:
trade_df = pd.concat([trade_df, trade_df_copy], axis = 0)

trade_df = trade_df.sort_values(by = 'year', ascending = True).reset_index()

trade_df.drop(['index'], axis = 1, inplace = True)

In [8]:
trade_df.head()

Unnamed: 0,c_code_1,c_code_2,money_flow_1,money_flow_2,state_name_1,state_name_2,year
0,210,390,,,Netherlands,Denmark,1870
1,255,160,190000.0,,Germany,Argentina,1870
2,200,70,1450000.0,5110000.0,United Kingdom,Mexico,1870
3,600,255,,,Morocco,Germany,1870
4,350,135,,,Greece,Peru,1870


In [9]:
aggregations = {
  'c_code_2': 'count',
  'money_flow_1': 'sum',
  'money_flow_2': 'sum'
  }

trade_df_group = trade_df.groupby(['c_code_1', 'state_name_1', 'year']).agg(aggregations).reset_index()

trade_df_group.rename({'c_code_2': 'num_trade_states', 'money_flow_1': 'export_dollars', 'money_flow_2': 'import_dollars'}, axis = 1, inplace = True)

In [10]:
trade_df_group.to_pickle('pickle/trade_df_group.pkl')

In [11]:
trade_df_group.head()

Unnamed: 0,c_code_1,state_name_1,year,num_trade_states,export_dollars,import_dollars
0,2,United States of America,1870,36,256170000.0,359400000.0
1,2,United States of America,1871,35,319960000.0,430510000.0
2,2,United States of America,1872,32,313530000.0,353040000.0
3,2,United States of America,1873,32,392890000.0,542810000.0
4,2,United States of America,1874,32,317460000.0,553420000.0


In [12]:
mat_cap_df = pd.read_csv('data/NMC_5_0-wsupplementary.csv', encoding = 'latin-1')

mat_cap_df.rename({'milex': 'military_expenditure', 'milper': 'military_personnel',
          'irst': 'iron_steel_prod', 'pec': 'prim_energy_consumption',
          'tpop': 'total_pop', 'upop': 'urban_pop',
          'upopgrowth': 'urban_pop_growth_rate', 'ccode': 'c_code_1',
          'statenme': 'state_name_1', 'cinc': 'cinc_score'}, axis = 1, inplace = True)

mat_cap_df['military_expenditure'] = ([s * 1000 for s in mat_cap_df['military_expenditure']])
mat_cap_df['military_personnel'] = ([s * 1000 for s in mat_cap_df['military_personnel']])
mat_cap_df['total_pop'] = ([s * 1000 for s in mat_cap_df['total_pop']])
mat_cap_df['urban_pop'] = ([s * 1000 for s in mat_cap_df['urban_pop']])
mat_cap_df['iron_steel_prod'] = ([s * 2000000 for s in mat_cap_df['iron_steel_prod']])
mat_cap_df['prim_energy_consumption'] = ([s * 2000000 for s in mat_cap_df['prim_energy_consumption']])

mat_cap_df.loc[mat_cap_df['military_expenditure'] == -9000, 'military_expenditure'] = None
mat_cap_df.loc[mat_cap_df['military_personnel'] == -9000, 'military_personnel'] = None
mat_cap_df.loc[mat_cap_df['total_pop'] == -9000, 'total_pop'] = None
mat_cap_df.loc[mat_cap_df['urban_pop'] == -9000, 'urban_pop'] = None
mat_cap_df.loc[mat_cap_df['iron_steel_prod'] == -18000000 , 'iron_steel_prod'] = None
mat_cap_df.loc[mat_cap_df['prim_energy_consumption'] == -18000000 , 'prim_energy_consumption'] = None

mat_cap_df = mat_cap_df.sort_values(by = 'year', ascending = True).reset_index()

mat_cap_df.drop(['stateabb', 'milpersource', 'milpernote', 'milexsource', 'milexnote',
         'irstsource', 'irstnote', 'pecsource', 'pecnote', 'pecnote',
         'tpopsource', 'tpopnote', 'upopsource', 'upopnote', 'upopgrowthsource',
         'irstqualitycode', 'irstanomalycode', 'pecqualitycode', 'pecanomalycode',
         'tpopqualitycode', 'tpopanomalycode', 'upopqualitycode', 'upopanomalycode',
         'version', 'urban_pop_growth_rate', 'index'], axis = 1, inplace = True)

In [13]:
mat_cap_df.to_pickle('pickle/mat_cap_df.pkl')

In [14]:
mat_cap_df.head()

Unnamed: 0,state_name_1,c_code_1,year,military_expenditure,military_personnel,iron_steel_prod,prim_energy_consumption,total_pop,urban_pop,cinc_score
0,United States of America,2,1816,3823000.0,17000.0,160000000.0,508000000.0,8659000.0,101000.0,0.04
1,Spain,230,1816,6512000.0,125000.0,20000000.0,0.0,11073000.0,221000.0,0.05
2,Netherlands,210,1816,2375000.0,26000.0,100000000.0,2284000000.0,5610000.0,337000.0,0.04
3,Portugal,235,1816,,22000.0,0.0,0.0,2746000.0,179000.0,0.01
4,United Kingdom,200,1816,16942000.0,255000.0,540000000.0,45056000000.0,19520000.0,1957000.0,0.34


In [15]:
alliance_df = pd.read_csv('data/alliance_v4.1_by_directed_yearly.csv', encoding = 'utf8')

alliance_df.rename({'left_censor': 'pre_1816_alliance', 'right_censor': 'in_effect_1231_2012',
          'neutrality': 'neutrality_treaty', 'nonaggression': 'nonaggression_treaty',
          'defense': 'defense_treaty', 'entente': 'entente_treaty',
          'ccode1': 'c_code_1', 'ccode2': 'c_code_2', 'state_name1': 'state_name_1',
          'state_name2': 'state_name_2'}, axis = 1, inplace = True)

alliance_df['alliance_date_start'] = pd.to_datetime((alliance_df['dyad_st_year']*10000+alliance_df['dyad_st_month']*100+alliance_df['dyad_st_day']).apply(str),format='%Y%m%d')
alliance_df['alliance_date_end'] = pd.to_datetime((alliance_df['dyad_end_year']*10000+alliance_df['dyad_end_month']*100+alliance_df['dyad_end_day']).apply(str),format='%Y%m%d')

alliance_df = alliance_df.sort_values(by = 'year', ascending = True).reset_index()

alliance_df.drop(['dyad_st_day', 'dyad_st_month', 'dyad_st_year', 'dyad_end_day',
         'dyad_end_month', 'dyad_end_year', 'version4id', 'version', 'index'],
         axis = 1, inplace = True)

In [16]:
alliance_df.head()

Unnamed: 0,c_code_1,state_name_1,c_code_2,state_name_2,pre_1816_alliance,in_effect_1231_2012,defense_treaty,neutrality_treaty,nonaggression_treaty,entente_treaty,year,alliance_date_start,alliance_date_end
0,200,United Kingdom,235,Portugal,1,1,1,0,1.0,0.0,1816,1816-01-01,NaT
1,275,Hesse Grand Ducal,300,Austria-Hungary,1,0,1,0,1.0,1.0,1816,1816-01-01,1848-03-15
2,275,Hesse Grand Ducal,273,Hesse Electoral,1,0,1,0,1.0,1.0,1816,1816-01-01,1848-03-15
3,275,Hesse Grand Ducal,271,Wuerttemburg,1,0,1,0,1.0,1.0,1816,1816-01-01,1848-03-15
4,275,Hesse Grand Ducal,269,Saxony,1,0,1,0,1.0,1.0,1816,1816-01-01,1848-03-15


In [17]:
aggregations = {
  'c_code_2': 'count',
  'pre_1816_alliance': 'sum',
  'in_effect_1231_2012': 'sum',
  'defense_treaty': 'sum',
  'neutrality_treaty': 'sum',
  'nonaggression_treaty': 'sum',
  'entente_treaty': 'sum'
  }

alliance_df_group = alliance_df.groupby(['c_code_1', 'state_name_1', 'year']).agg(aggregations).reset_index()

alliance_df_group = alliance_df_group.sort_values(by = 'year', ascending = True).reset_index()

alliance_df_group.rename({'c_code_2': 'num_alliances', 'pre_1816_alliance': 'pre_1816_alliances',
             'in_effect_1231_2012': 'num_in_effect_1231_2012',
             'defense_treaty': 'defense_treaties',
             'neutrality_treaty': 'neutrality_treaties',
             'nonaggression_treaty': 'nonaggression_treaties',
             'entente_treaty': 'entente_treaties'}, axis = 1, inplace = True)

alliance_df_group.drop(['index'], axis = 1, inplace = True)

In [18]:
alliance_df_group.to_pickle('pickle/alliance_df_group.pkl')

In [19]:
alliance_df_group.head()

Unnamed: 0,c_code_1,state_name_1,year,num_alliances,pre_1816_alliances,num_in_effect_1231_2012,defense_treaties,neutrality_treaties,nonaggression_treaties,entente_treaties
0,230,Spain,1816,1,0,0,1,0,0.0,0.0
1,255,Germany,1816,10,10,0,10,0,7.0,10.0
2,267,Baden,1816,7,7,0,7,0,7.0,7.0
3,271,Wuerttemburg,1816,7,7,0,7,0,7.0,7.0
4,380,Sweden,1816,1,1,0,0,0,0.0,1.0


In [20]:
contiguity_df = pd.read_csv('data/contdird.csv', encoding = 'utf8')

contiguity_df.rename({'state1no': 'c_code_1', 'state2no': 'c_code_2',
           'state1ab': 'state_name_abb_1', 'state2ab': 'state_name_abb_2',
           'conttype': 'contiguity_type'}, axis = 1, inplace = True)
         
contiguity_df.drop(['dyad', 'version'], axis = 1, inplace = True)

In [21]:
contiguity_df.to_pickle('pickle/contiguity_df.pkl')

In [22]:
contiguity_df.head()

Unnamed: 0,c_code_1,state_name_abb_1,c_code_2,state_name_abb_2,year,contiguity_type
0,2,USA,20,CAN,1920,1
1,2,USA,20,CAN,1921,1
2,2,USA,20,CAN,1922,1
3,2,USA,20,CAN,1923,1
4,2,USA,20,CAN,1924,1


In [23]:
mid_df = pd.read_csv('data/dyadic MIDs 3.1.csv', encoding = 'utf8')
  
mid_df.rename({'revstata': 'revision_sought', 'revtypea': 'revision_type',
        'fatleva': 'state_fatality_bin', 'highmcaa': 'highest_mca',
        'hihosta': 'highest_hostility', 'durindx': 'year_num',
        'midc3hia': 'highest_action_for_state', 'orignata': 'origin_participant',
        'notarg': 'num_against', 'settlmnt': 'settlement_type',
        'fatlev': 'total_fatality_bin', 'noinit': 'num_with',
        'disno': 'dispute_id', 'statea': 'c_code_1', 'stateb': 'c_code_2',
        'namea': 'state_name_abb_a', 'nameb': 'state_name_abb_b',
        'cumdurat': 'cumulative_duration', 'ongo2010': 'ongoing_2010'}, axis = 1, inplace = True)

mid_df['date_start'] = pd.to_datetime((mid_df['strtyr']*10000+mid_df['strtmnth']*100+mid_df['strtday']).apply(str),format='%Y%m%d')
mid_df['date_end'] = pd.to_datetime((mid_df['endyear']*10000+mid_df['endmnth']*100+mid_df['endday']).apply(str),format='%Y%m%d')

mid_df.loc[mid_df['sideaa'] == 1, 'initiator'] = 1
mid_df.loc[mid_df['sideaa'] != 1, 'initiator'] = 0

mid_df = mid_df.sort_values(by = 'year', ascending = True).reset_index()

mid_df.drop(['strtmnth', 'strtday', 'strtyr', 'endmnth', 'endday',
       'endyear', 'dyindex', 'duration', 'disno4', 'sideaa',
       'sideab', 'revstatb', 'revtypeb', 'fatlevb', 'highmcab',
       'hihostb', 'hihost', 'orignatb', 'recip', 'rolea', 'roleb',
       'mid3hiact', 'mid3hib', 'change', 'changetype_1',
       'changetype_2', 'highact', 'new', 'mid3hia', 'index'],
      axis = 1, inplace = True)

In [24]:
mid_df.head()

Unnamed: 0,dispute_id,c_code_1,state_name_abb_a,c_code_2,state_name_abb_b,year,outcome,settlement_type,total_fatality_bin,num_with,num_against,revision_sought,revision_type,state_fatality_bin,highest_mca,highest_hostility,origin_participant,war,year_num,cumulative_duration,ongoing_2010,date_start,date_end,initiator
0,3239,230,SPN,2,USA,1816,2,3,0,1,1,0,0,0,1,1,1,0,1,1,0,1816-07-27,1816-07-27,0.0
1,3239,2,USA,230,SPN,1816,1,3,0,1,1,1,2,0,19,4,1,0,1,1,0,1816-07-27,1816-07-27,1.0
2,3321,365,RUS,640,TUR,1817,4,1,0,1,1,1,1,0,7,3,1,0,1,51,0,1817-09-12,1817-11-01,1.0
3,3321,640,TUR,365,RUS,1817,3,1,0,1,1,0,0,0,13,3,1,0,1,51,0,1817-09-12,1817-11-01,0.0
4,1567,230,SPN,2,USA,1818,2,1,-9,1,1,0,0,-9,19,4,1,0,1,77,0,1818-03-15,1818-05-30,0.0


In [25]:
aggregations = {
  'c_code_2': 'count',
  'num_with': 'sum',
  'num_against': 'sum',
  'cumulative_duration': 'sum',
  'war': 'sum',
  'origin_participant': 'mean',
  'ongoing_2010': 'mean',
  'revision_sought': 'mean',
  'revision_type': pd.Series.mode,
  'state_fatality_bin': pd.Series.mode,
  'outcome': pd.Series.mode,
  'settlement_type': pd.Series.mode,
  'highest_mca': pd.Series.mode,
  'highest_hostility': pd.Series.mode
  }

mid_df_group = mid_df.groupby(['c_code_1', 'state_name_abb_a', 'year']).agg(aggregations).reset_index()

for i, country in enumerate(mid_df_group['c_code_1']):
    mid_df_group.loc[i, 'num_with'] = mid_df_group['num_with'][i]/mid_df_group['c_code_2'][i]
    mid_df_group.loc[i, 'num_against'] = mid_df_group['num_against'][i]/mid_df_group['c_code_2'][i]
    mid_df_group.loc[i, 'cumulative_duration'] = mid_df_group['cumulative_duration'][i]/mid_df_group['c_code_2'][i]
    mid_df_group.loc[i, 'cumulative_duration'] = mid_df_group['cumulative_duration'][i]/365

mid_df_group = mid_df_group.sort_values(by = 'year', ascending = True).reset_index()

mid_df_group.rename({'c_code_2': 'num_conflicts', 'num_with': 'avg_with', 'num_against': 'avg_against',
           'cumulative_duration': 'avg_cum_duration', 'war': 'num_wars',
           'origin_participant': 'origin_participant_pct', 'revision_sought': 'revision_pct',
          'revision_type': 'revision_type_mode', 'state_fatality_bin': 'state_fatality_bin_mode',
           'outcome': 'outcome_mode', 'settlement_type': 'settlement_type_mode',
           'highest_mca': 'highest_mca_mode', 'highest_hostility': 'highest_hostility_mode'}, axis = 1, inplace = True)

mid_df_group.drop(['index'], axis = 1, inplace = True)

In [26]:
mid_df_group.to_pickle('pickle/mid_df_group.pkl')

In [27]:
mid_df_group.head()

Unnamed: 0,c_code_1,state_name_abb_a,year,num_conflicts,avg_with,avg_against,avg_cum_duration,num_wars,origin_participant_pct,ongoing_2010,revision_pct,revision_type_mode,state_fatality_bin_mode,outcome_mode,settlement_type_mode,highest_mca_mode,highest_hostility_mode
0,2,USA,1816,1,1.0,1.0,0.0,0,1.0,0.0,1.0,2,0,1,3,19,4
1,230,SPN,1816,1,1.0,1.0,0.0,0,1.0,0.0,0.0,0,0,2,3,1,1
2,365,RUS,1817,1,1.0,1.0,0.14,0,1.0,0.0,1.0,1,0,4,1,7,3
3,640,TUR,1817,1,1.0,1.0,0.14,0,1.0,0.0,0.0,0,0,3,1,13,3
4,2,USA,1818,1,1.0,1.0,0.21,0,1.0,0.0,1.0,1,-9,1,1,16,4


In [28]:
gov_df = pd.read_csv('data/ddrevisited_data_v1.csv', encoding = 'latin-1')

gov_df.rename({'chgterr': 'territory_change', 'ychgterr': 'territory_change_year',
        'entryy': 'first_recorded_year', 'exity': 'last_recorded_year',
        'bornyear': 'born_year', 'endyear': 'died_year',
        'exselec': 'election_type', 'legselec': 'legislation_type',
        'closed': 'legislature_status', 'dejure': 'party_legal_status',
        'defacto': 'party_existance', 'defacto2': 'party_existance_outside_regime', 
        'lparty': 'legislature_parties', 'incumb': 'incumbent_type',
        'collect': 'collective_leadership', 'eheads': 'num_leadership_changes',
        'ehead': 'leader_name', 'epost': 'post_name', 'edate': 'entrance_date',
        'ageeh': 'leader_tenure', 'emil': 'military_leader',
        'royal': 'royal_leader', 'comm': 'communist_leader',
        'edeath': 'leader_died', 'democracy': 'democratic_regime',
        'assconfid': 'cabinet_assembly', 'poppreselec': 'popular_election',
        'regime': 'regime_type', 'ttd': 'transition_democracy',
        'tta': 'transition_dictatorship', 'agedem': 'age_govt',
        'stra': 'num_transitions_ever', 'cowcode': 'c_code_1',
        'ctryname': 'state_name_1', 'headdiff': 'nominal_vs_eff_diff',
        'cowcode2': 'c_code_2'},
       axis = 1, inplace = True)

gov_df.drop(['order', 'aclpcode', 'c_code_2', 'qogctycode', 'qogctylett',
       'qogctyyear', 'ccdcodelet', 'ccdcodenum', 'aclpyear',
       'cowcode2year', 'cowcodeyear', 'flagc_cowcode2', 'flage_cowcode2',
       'imf_code', 'politycode', 'bankscode', 'dpicode', 'uncode', 'un_region',
       'un_region_name', 'un_continent', 'last_recorded_year',
       'dupcow', 'dupwdi', 'dupun', 'dupdpi', 'dupimf', 'dupbanks',
       'cid', 'wdicode', 'un_continent_name', 'aclp_region', 'type2',
       'nheads', 'nmil', 'nhead', 'npost', 'ndate', 'entrance_date',
       'tenure08', 'ecens08', 'flageh', 'tt', 'flagc', 'flagdem',
       'flagreg', 'agereg', 'first_recorded_year', 'born_year',
       'died_year', 'leader_name', 'post_name'], axis = 1, inplace = True)

In [29]:
gov_df.to_pickle('pickle/gov_df.pkl')

In [30]:
gov_df.head()

Unnamed: 0,state_name_1,year,c_code_1,territory_change,territory_change_year,election_type,legislation_type,legislature_status,party_legal_status,party_existance,party_existance_outside_regime,legislature_parties,incumbent_type,collective_leadership,num_leadership_changes,leader_tenure,military_leader,royal_leader,nominal_vs_eff_diff,communist_leader,leader_died,democratic_regime,cabinet_assembly,popular_election,regime_type,transition_democracy,transition_dictatorship,age_govt,num_transitions_ever
0,Afghanistan,1946.0,700.0,0.0,0.0,3.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,0.0,0.0,18.0,0.0
1,Afghanistan,1947.0,700.0,0.0,0.0,3.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,0.0,0.0,19.0,0.0
2,Afghanistan,1948.0,700.0,0.0,0.0,3.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,0.0,0.0,20.0,0.0
3,Afghanistan,1949.0,700.0,0.0,0.0,3.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,0.0,0.0,21.0,0.0
4,Afghanistan,1950.0,700.0,0.0,0.0,3.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,0.0,0.0,22.0,0.0


In [31]:
wrp_df = pd.read_csv('data/WRP_national.csv', encoding = 'utf8')

wrp_df.rename({'judgen': 'total_jewish', 'chrstprot': 'total_protestant',
        'chrstcat': 'total_catholic', 'chrstgen': 'total_christian',
        'pop': 'total_population', 'nonrelig': 'total_non_religious',
        'budgen': 'total_buddhist', 'hindgen': 'total_hindu',
        'islmgen': 'total_islam', 'islmsun': 'total_sunni',
        'islmshi': 'total_shia', 'name': 'state_name_1',
        'state': 'c_code_1', 'shntgen': 'total_shinto',
        'sikhgen': 'total_sikh'}, axis = 1, inplace = True)

wrp_df.drop(['jdcons', 'judconspct', 'judgenpct', 'judorth',
      'judorthpct', 'judothr', 'judothrpct', 'judref',
      'judrefpct', 'chrstang', 'chrstangpct',
       'chrstcatpct', 'chrstgenpct', 'chrstorth',
       'chrstorthpct', 'chrstothr', 'chrstothrpct',
       'chrstprotpct', 'confgen', 'confgenpct',
       'Version', 'zorogen', 'zorogenpct', 'anmgen',
       'anmgenpct', 'bahgen', 'bahgenpct', 'budgenpct',
       'budmah', 'budmahpct', 'budothr', 'budothrpct',
       'budthr', 'budthrpct', 'datatype', 'dualrelig',
       'hindgenpct', 'islmahm', 'islmahmpct', 'islmalw',
       'islmalwpct', 'islmgenpct', 'islmibd', 'islmibdpct',
       'islmnat', 'islmnatpct', 'islmothr', 'islmothrpct',
       'islmshipct', 'islmsunpct', 'jaingen', 'jaingenpct',
       'nonreligpct', 'othrgen', 'othrgenpct', 'recreliab',
       'reliabilevel', 'shntgenpct', 'sikhgenpct',
       'sourcecode', 'sourcereliab', 'sumrelig', 'sumreligpct',
       'syncgen', 'syncgenpct', 'taogen', 'taogenpct',
       'total', 'total_population'], axis = 1, inplace = True)

In [32]:
wrp_df.head()

Unnamed: 0,year,c_code_1,state_name_1,total_protestant,total_catholic,total_christian,total_jewish,total_sunni,total_shia,total_islam,total_buddhist,total_hindu,total_sikh,total_shinto,total_non_religious
0,1945,2,USA,66069671,38716742,110265118,4641182,0,0,0,1601218,0,0,0,22874544
1,1950,2,USA,73090083,42635882,122994019,6090837,0,0,0,0,0,0,0,22568130
2,1955,2,USA,79294628,46402368,134001770,5333332,0,0,0,90173,0,0,0,23303540
3,1960,2,USA,90692928,50587880,150234347,5500000,0,0,0,2012131,0,0,0,21548225
4,1965,2,USA,94165803,64761783,167515758,5600000,0,0,0,1080892,0,0,0,19852362


In [33]:
c_code_df = pd.read_csv('data/COW country codes.csv', encoding = 'utf8')

c_code_df.rename({'CCode': 'c_code', 'StateNme': 'state_name'}, axis = 1, inplace = True)

c_code_df.drop(['StateAbb'], axis = 1, inplace = True)

In [34]:
c_code_dic = {}

for i, code in enumerate(c_code_df['c_code']):
    c_code_dic[code] = c_code_df['state_name'][i]

In [35]:
row_list = []
years = list(np.linspace(1945, 2010, 14))
for c_code in wrp_df['c_code_1'].unique():
    for year in years:
        try:
            assert len(list(flatten(wrp_df.loc[(wrp_df['year'] == year) & (wrp_df['c_code_1'] == c_code)].values))) > 0
            row_list.append(list(flatten(wrp_df.loc[(wrp_df['year'] == year) & (wrp_df['c_code_1'] == c_code)].values)))
        except:
            row_list.append([year, c_code, c_code_dic[c_code], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [36]:
wrp_df = pd.DataFrame(row_list)

wrp_df.columns = ['year', 'c_code_1', 'state_name_1', 'total_protestant',
         'total_catholic', 'total_christian', 'total_jewish',
         'total_sunni', 'total_shia', 'total_islam',
         'total_buddhist', 'total_hindu', 'total_sikh',
         'total_shinto', 'total_non_religious']

In [37]:
years[-6]

1985.0

In [38]:
years = list(np.linspace(1945, 2010, 14))
column_list = ['total_protestant', 'total_catholic', 'total_christian',
        'total_jewish', 'total_sunni', 'total_shia',
        'total_islam', 'total_buddhist', 'total_hindu',
        'total_sikh','total_shinto', 'total_non_religious']
fix_count_1 = 0
fix_count_2 = 0

for c_code in wrp_df['c_code_1'].unique():
    for column in column_list:
        data_list = []
        for year in years:
            data_list.append(wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == year)][column].values[0])
        for i, item in enumerate(data_list[1:]):
            i+=1
            if item == 0 and data_list[i-1] != 0 and i <= 4:
                if data_list[i+1] != 0:
                    rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+1])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])/2)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                elif data_list[i+2] != 0:
                    rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+2])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])/3)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+1]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                elif data_list[i+3] != 0:
                    rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+3])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])/4)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+1]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+2]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                elif data_list[i+4] != 0:
                    rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+4])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])/5)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+1]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+2]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+3]), column] = (4 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                elif data_list[i+5] != 0:
                    rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+5])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])/6)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+1]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+2]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+3]), column] = (4 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+4]), column] = (5 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                elif data_list[i+6] != 0:
                    rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+6])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])/7)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+1]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+2]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+3]), column] = (4 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+4]), column] = (5 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+5]), column] = (6 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                elif data_list[i+7] != 0:
                    rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+7])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])/8)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+1]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+2]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+3]), column] = (4 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+4]), column] = (5 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+5]), column] = (6 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+6]), column] = (7 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                elif data_list[i+8] != 0:
                    rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+8])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])/9)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+1]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+2]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+3]), column] = (4 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+4]), column] = (5 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+5]), column] = (6 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+6]), column] = (7 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+7]), column] = (8 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                elif data_list[i+9] != 0:
                    rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+9])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])/10)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+1]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+2]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+3]), column] = (4 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+4]), column] = (5 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+5]), column] = (6 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+6]), column] = (7 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+7]), column] = (8 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i+8]), column] = (9 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[i-1])][column].values[0])
                else:
                    print(c_code, years[i])
                    fix_count_1+=1
            elif item == 0 and data_list[i-1] != 0 and i > 4:
                if data_list[-2] != 0 and data_list[-3] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-2])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3])][column].values[0]))
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-1]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-2])][column].values[0])
                elif data_list[-3] != 0 and data_list[-4] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4])][column].values[0]))
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-2]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-1]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3])][column].values[0])
                elif data_list[-4] != 0 and data_list[-5] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5])][column].values[0]))
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-2]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-1]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4])][column].values[0])
                elif data_list[-5] != 0 and data_list[-6] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6])][column].values[0]))
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-2]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-1]), column] = (4 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5])][column].values[0])
                elif data_list[-6] != 0 and data_list[-7] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7])][column].values[0]))
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-2]), column] = (4 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-1]), column] = (5 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6])][column].values[0])
                elif data_list[-7] != 0 and data_list[-8] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8])][column].values[0]))
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3]), column] = (4 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-2]), column] = (5 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-1]), column] = (6 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7])][column].values[0])
                elif data_list[-8] != 0 and data_list[-9] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9])][column].values[0]))
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4]), column] = (4 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3]), column] = (5 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-2]), column] = (6 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-1]), column] = (7 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8])][column].values[0])
                elif data_list[-9] != 0 and data_list[-10] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10])][column].values[0]))
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5]), column] = (4 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4]), column] = (5 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3]), column] = (6 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-2]), column] = (7 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-1]), column] = (8 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9])][column].values[0])
                elif data_list[-10] != 0 and data_list[-11] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11])][column].values[0]))
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6]), column] = (4 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5]), column] = (5 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4]), column] = (6 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3]), column] = (7 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-2]), column] = (8 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-1]), column] = (9 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10])][column].values[0])
                elif data_list[-11] != 0 and data_list[-12] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-12])][column].values[0]))
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7]), column] = (4 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6]), column] = (5 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5]), column] = (6 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4]), column] = (7 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3]), column] = (8 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-2]), column] = (9 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-1]), column] = (10 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11])][column].values[0])
                elif data_list[-12] != 0 and data_list[-13] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-12])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-13])][column].values[0]))
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-12])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-12])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-12])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8]), column] = (4 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-12])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7]), column] = (5 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-12])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6]), column] = (6 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-12])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5]), column] = (7 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-12])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4]), column] = (8 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-12])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3]), column] = (9 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-12])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-2]), column] = (10 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-12])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-1]), column] = (11 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-12])][column].values[0])
                elif data_list[-1] != 0 and data_list[-3] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-1])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3])][column].values[0])/2)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-2]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3])][column].values[0])
                elif data_list[-2] != 0 and data_list[-4] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-2])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4])][column].values[0])/2)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4])][column].values[0])
                elif data_list[-3] != 0 and data_list[-5] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5])][column].values[0])/2)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5])][column].values[0])
                elif data_list[-4] != 0 and data_list[-6] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6])][column].values[0])/2)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6])][column].values[0])
                elif data_list[-5] != 0 and data_list[-7] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7])][column].values[0])/2)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7])][column].values[0])
                elif data_list[-6] != 0 and data_list[-8] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8])][column].values[0])/2)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8])][column].values[0])
                elif data_list[-7] != 0 and data_list[-9] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9])][column].values[0])/2)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9])][column].values[0])
                elif data_list[-8] != 0 and data_list[-10] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10])][column].values[0])/2)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10])][column].values[0])
                elif data_list[-9] != 0 and data_list[-11] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11])][column].values[0])/2)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11])][column].values[0])
                elif data_list[-1] != 0 and data_list[-4] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-1])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4])][column].values[0])/3)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-2]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4])][column].values[0])
                elif data_list[-2] != 0 and data_list[-5] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-2])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5])][column].values[0])/3)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5])][column].values[0])
                elif data_list[-3] != 0 and data_list[-6] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6])][column].values[0])/3)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6])][column].values[0])
                elif data_list[-4] != 0 and data_list[-7] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7])][column].values[0])/3)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7])][column].values[0])
                elif data_list[-5] != 0 and data_list[-8] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8])][column].values[0])/3)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8])][column].values[0])
                elif data_list[-6] != 0 and data_list[-9] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9])][column].values[0])/3)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9])][column].values[0])
                elif data_list[-7] != 0 and data_list[-10] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10])][column].values[0])/3)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10])][column].values[0])
                elif data_list[-8] != 0 and data_list[-11] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11])][column].values[0])/3)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11])][column].values[0])
                elif data_list[-1] != 0 and data_list[-5] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-1])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5])][column].values[0])/4)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-2]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5])][column].values[0])
                elif data_list[-2] != 0 and data_list[-6] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-2])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6])][column].values[0])/4)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6])][column].values[0])
                elif data_list[-3] != 0 and data_list[-7] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-3])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7])][column].values[0])/4)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7])][column].values[0])
                elif data_list[-4] != 0 and data_list[-8] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-4])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8])][column].values[0])/4)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8])][column].values[0])
                elif data_list[-5] != 0 and data_list[-9] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-5])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9])][column].values[0])/4)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9])][column].values[0])
                elif data_list[-6] != 0 and data_list[-10] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-6])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10])][column].values[0])/4)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10])][column].values[0])
                elif data_list[-7] != 0 and data_list[-11] != 0:
                    rate_of_change = rate_of_change = ((wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-7])][column].values[0] - wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11])][column].values[0])/4)
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-10]), column] = rate_of_change + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-9]), column] = (2 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11])][column].values[0])
                    wrp_df.loc[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-8]), column] = (3 * rate_of_change) + (wrp_df[(wrp_df['c_code_1'] == c_code) & (wrp_df['year'] == years[-11])][column].values[0])
                else:
                    print(c_code, years[i])
                    fix_count_2+=1

print(fix_count_1)
print(fix_count_2)

56 1985.0
70 2010.0
232 2000.0
260 1985.0
305 2000.0
305 2000.0
315 1975.0
343 2000.0
355 1950.0
369 2000.0
369 2000.0
371 2000.0
373 2000.0
380 2010.0
434 2010.0
620 1995.0
620 1995.0
625 1995.0
625 1995.0
651 2005.0
690 1995.0
692 1995.0
692 1995.0
696 1995.0
702 2000.0
702 2000.0
703 2000.0
703 2000.0
704 2005.0
705 2000.0
705 2000.0
705 2000.0
970 2010.0
1
32


In [39]:
wrp_df[wrp_df['c_code_1'] == 705]

Unnamed: 0,year,c_code_1,state_name_1,total_protestant,total_catholic,total_christian,total_jewish,total_sunni,total_shia,total_islam,total_buddhist,total_hindu,total_sikh,total_shinto,total_non_religious
2240,1945.0,705,Kazakhstan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
2241,1950.0,705,Kazakhstan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
2242,1955.0,705,Kazakhstan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
2243,1960.0,705,Kazakhstan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
2244,1965.0,705,Kazakhstan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
2245,1970.0,705,Kazakhstan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
2246,1975.0,705,Kazakhstan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
2247,1980.0,705,Kazakhstan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
2248,1985.0,705,Kazakhstan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
2249,1990.0,705,Kazakhstan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0


In [40]:
wrp_df[wrp_df['total_non_religious'] == 0].head()

Unnamed: 0,year,c_code_1,state_name_1,total_protestant,total_catholic,total_christian,total_jewish,total_sunni,total_shia,total_islam,total_buddhist,total_hindu,total_sikh,total_shinto,total_non_religious
14,1945.0,20,CAN,4002534.0,5038280.0,10531961.0,176690.0,0.0,0.0,0.0,115066.0,0.0,5000.0,0,0.0
28,1945.0,31,Bahamas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
29,1950.0,31,Bahamas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
30,1955.0,31,Bahamas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0
31,1960.0,31,Bahamas,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0


In [41]:
wrp_df.to_pickle('pickle/wrp_df.pkl')

In [42]:
len(wrp_df)

2800

In [43]:
one_df = pd.merge(trade_df_group, mat_cap_df, how = 'outer', on = ['c_code_1', 'year'])

one_df.rename({'state_name_1_x': 'state_name_1'}, axis = 1, inplace = True)

one_df.drop(['state_name_1_y'], axis = 1, inplace = True)

In [44]:
one_df.head()

Unnamed: 0,c_code_1,state_name_1,year,num_trade_states,export_dollars,import_dollars,military_expenditure,military_personnel,iron_steel_prod,prim_energy_consumption,total_pop,urban_pop,cinc_score
0,2,United States of America,1870,36.0,256170000.0,359400000.0,13128000.0,50000.0,3384000000.0,75558000000.0,39905000.0,4130000.0,0.1
1,2,United States of America,1871,35.0,319960000.0,430510000.0,11811000.0,42000.0,3470000000.0,79092000000.0,40938000.0,4302000.0,0.1
2,2,United States of America,1872,32.0,313530000.0,353040000.0,14246000.0,42000.0,5180000000.0,96958000000.0,41972000.0,4481000.0,0.12
3,2,United States of America,1873,32.0,392890000.0,542810000.0,15014000.0,43000.0,5204000000.0,107348000000.0,43006000.0,4667000.0,0.12
4,2,United States of America,1874,32.0,317460000.0,553420000.0,12899000.0,44000.0,4878000000.0,103472000000.0,44040000.0,4862000.0,0.11


In [45]:
two_df = pd.merge(one_df, alliance_df_group, how = 'outer', on = ['c_code_1', 'year'])

two_df.rename({'state_name_1_x': 'state_name_1'}, axis = 1, inplace = True)

two_df.drop(['state_name_1_y'], axis = 1, inplace = True)

In [46]:
two_df.head()

Unnamed: 0,c_code_1,state_name_1,year,num_trade_states,export_dollars,import_dollars,military_expenditure,military_personnel,iron_steel_prod,prim_energy_consumption,total_pop,urban_pop,cinc_score,num_alliances,pre_1816_alliances,num_in_effect_1231_2012,defense_treaties,neutrality_treaties,nonaggression_treaties,entente_treaties
0,2,United States of America,1870,36.0,256170000.0,359400000.0,13128000.0,50000.0,3384000000.0,75558000000.0,39905000.0,4130000.0,0.1,,,,,,,
1,2,United States of America,1871,35.0,319960000.0,430510000.0,11811000.0,42000.0,3470000000.0,79092000000.0,40938000.0,4302000.0,0.1,,,,,,,
2,2,United States of America,1872,32.0,313530000.0,353040000.0,14246000.0,42000.0,5180000000.0,96958000000.0,41972000.0,4481000.0,0.12,,,,,,,
3,2,United States of America,1873,32.0,392890000.0,542810000.0,15014000.0,43000.0,5204000000.0,107348000000.0,43006000.0,4667000.0,0.12,,,,,,,
4,2,United States of America,1874,32.0,317460000.0,553420000.0,12899000.0,44000.0,4878000000.0,103472000000.0,44040000.0,4862000.0,0.11,,,,,,,


In [47]:
three_df = pd.merge(two_df, mid_df_group, how = 'outer', on = ['c_code_1', 'year'])

three_df.drop(['state_name_abb_a'], axis = 1, inplace = True)

In [48]:
three_df.head()

Unnamed: 0,c_code_1,state_name_1,year,num_trade_states,export_dollars,import_dollars,military_expenditure,military_personnel,iron_steel_prod,prim_energy_consumption,total_pop,urban_pop,cinc_score,num_alliances,pre_1816_alliances,num_in_effect_1231_2012,defense_treaties,neutrality_treaties,nonaggression_treaties,entente_treaties,num_conflicts,avg_with,avg_against,avg_cum_duration,num_wars,origin_participant_pct,ongoing_2010,revision_pct,revision_type_mode,state_fatality_bin_mode,outcome_mode,settlement_type_mode,highest_mca_mode,highest_hostility_mode
0,2,United States of America,1870,36.0,256170000.0,359400000.0,13128000.0,50000.0,3384000000.0,75558000000.0,39905000.0,4130000.0,0.1,,,,,,,,2.0,2.0,1.0,0.06,0.0,0.5,0.0,0.0,0,0.0,4,3,"[8, 19]","[3, 4]"
1,2,United States of America,1871,35.0,319960000.0,430510000.0,11811000.0,42000.0,3470000000.0,79092000000.0,40938000.0,4302000.0,0.1,,,,,,,,,,,,,,,,,,,,,
2,2,United States of America,1872,32.0,313530000.0,353040000.0,14246000.0,42000.0,5180000000.0,96958000000.0,41972000.0,4481000.0,0.12,,,,,,,,,,,,,,,,,,,,,
3,2,United States of America,1873,32.0,392890000.0,542810000.0,15014000.0,43000.0,5204000000.0,107348000000.0,43006000.0,4667000.0,0.12,,,,,,,,2.0,1.0,1.5,0.07,0.0,1.0,0.0,0.5,"[0, 2]",0.0,"[3, 5]","[1, 3]","[2, 14]","[2, 3]"
4,2,United States of America,1874,32.0,317460000.0,553420000.0,12899000.0,44000.0,4878000000.0,103472000000.0,44040000.0,4862000.0,0.11,,,,,,,,,,,,,,,,,,,,,


In [49]:
four_df = pd.merge(three_df, wrp_df, how = 'outer', on = ['c_code_1', 'year'])

four_df.rename({'state_name_1_x': 'state_name_1'}, axis = 1, inplace = True)

four_df.drop(['state_name_1_y'], axis = 1, inplace = True)

In [50]:
four_df.head()

Unnamed: 0,c_code_1,state_name_1,year,num_trade_states,export_dollars,import_dollars,military_expenditure,military_personnel,iron_steel_prod,prim_energy_consumption,total_pop,urban_pop,cinc_score,num_alliances,pre_1816_alliances,num_in_effect_1231_2012,defense_treaties,neutrality_treaties,nonaggression_treaties,entente_treaties,num_conflicts,avg_with,avg_against,avg_cum_duration,num_wars,origin_participant_pct,ongoing_2010,revision_pct,revision_type_mode,state_fatality_bin_mode,outcome_mode,settlement_type_mode,highest_mca_mode,highest_hostility_mode,total_protestant,total_catholic,total_christian,total_jewish,total_sunni,total_shia,total_islam,total_buddhist,total_hindu,total_sikh,total_shinto,total_non_religious
0,2,United States of America,1870,36.0,256170000.0,359400000.0,13128000.0,50000.0,3384000000.0,75558000000.0,39905000.0,4130000.0,0.1,,,,,,,,2.0,2.0,1.0,0.06,0.0,0.5,0.0,0.0,0,0.0,4,3,"[8, 19]","[3, 4]",,,,,,,,,,,,
1,2,United States of America,1871,35.0,319960000.0,430510000.0,11811000.0,42000.0,3470000000.0,79092000000.0,40938000.0,4302000.0,0.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,2,United States of America,1872,32.0,313530000.0,353040000.0,14246000.0,42000.0,5180000000.0,96958000000.0,41972000.0,4481000.0,0.12,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,2,United States of America,1873,32.0,392890000.0,542810000.0,15014000.0,43000.0,5204000000.0,107348000000.0,43006000.0,4667000.0,0.12,,,,,,,,2.0,1.0,1.5,0.07,0.0,1.0,0.0,0.5,"[0, 2]",0.0,"[3, 5]","[1, 3]","[2, 14]","[2, 3]",,,,,,,,,,,,
4,2,United States of America,1874,32.0,317460000.0,553420000.0,12899000.0,44000.0,4878000000.0,103472000000.0,44040000.0,4862000.0,0.11,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [51]:
df = pd.merge(four_df, gov_df, how = 'inner', on = ['c_code_1', 'year'])

df = df.sort_values(by = 'year', ascending = True).reset_index()

df.rename({'state_name_1_x': 'state_name_1'}, axis = 1, inplace = True)

df.drop(['state_name_1_y', 'index'], axis = 1, inplace = True)

In [52]:
df.head()

Unnamed: 0,c_code_1,state_name_1,year,num_trade_states,export_dollars,import_dollars,military_expenditure,military_personnel,iron_steel_prod,prim_energy_consumption,total_pop,urban_pop,cinc_score,num_alliances,pre_1816_alliances,num_in_effect_1231_2012,defense_treaties,neutrality_treaties,nonaggression_treaties,entente_treaties,num_conflicts,avg_with,avg_against,avg_cum_duration,num_wars,origin_participant_pct,ongoing_2010,revision_pct,revision_type_mode,state_fatality_bin_mode,outcome_mode,settlement_type_mode,highest_mca_mode,highest_hostility_mode,total_protestant,total_catholic,total_christian,total_jewish,total_sunni,total_shia,total_islam,total_buddhist,total_hindu,total_sikh,total_shinto,total_non_religious,territory_change,territory_change_year,election_type,legislation_type,legislature_status,party_legal_status,party_existance,party_existance_outside_regime,legislature_parties,incumbent_type,collective_leadership,num_leadership_changes,leader_tenure,military_leader,royal_leader,nominal_vs_eff_diff,communist_leader,leader_died,democratic_regime,cabinet_assembly,popular_election,regime_type,transition_democracy,transition_dictatorship,age_govt,num_transitions_ever
0,2,United States of America,1946,65.0,160000000.0,14.4,45133984000.0,3030000.0,120842000000.0,2376288000000.0,141389000.0,39725000.0,0.36,19.0,0.0,0.0,19.0,0.0,0.0,19.0,2.0,1.0,1.5,0.27,0.0,0.5,0.0,0.0,0.0,"[0, 1]",5.0,3.0,"[7, 14]",3.0,,,,,,,,,,,,,0.0,0.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,2.0,0.0,0.0,77.0,0.0
1,678,Yemen Arab Republic,1946,65.0,0.0,0.0,,18000.0,0.0,0.0,3140000.0,0.0,0.0,9.0,0.0,0.0,0.0,1.0,7.0,9.0,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,3.0,0.0,,,,,,0.0,0.0,0.0,21.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,0.0,0.0,39.0,0.0
2,350,Greece,1946,65.0,0.0,1310000.0,95622000.0,120000.0,0.0,856000000.0,7400000.0,914000.0,0.0,,,,,,,,1.0,1.0,1.0,0.84,0.0,1.0,0.0,0.0,0.0,0,0.0,0.0,19,4.0,,,,,,,,,,,,,0.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0
3,700,Afghanistan,1946,65.0,0.0,0.0,,90000.0,0.0,0.0,8549000.0,225000.0,0.0,5.0,0.0,0.0,0.0,1.0,5.0,3.0,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,3.0,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,14.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,0.0,0.0,18.0,0.0
4,345,Yugoslavia,1946,65.0,0.0,0.0,301243000.0,159000.0,404000000.0,7904000000.0,15186000.0,876000.0,0.01,4.0,0.0,0.0,4.0,0.0,1.0,1.0,2.0,1.0,1.0,0.03,0.0,1.0,0.0,0.0,0.0,0,5.0,3.0,19,4.0,,,,,,,,,,,,,0.0,0.0,3.0,2.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,17.0,1.0


In [53]:
df.loc[df['export_dollars'] == 0, 'export_dollars'] = None
df.loc[df['import_dollars'] == 0, 'import_dollars'] = None
df.loc[df['cinc_score'] == 0, 'cinc_score'] = None

df.loc[df['num_alliances'].isnull(), 'num_alliances'] = 0
df.loc[df['pre_1816_alliances'].isnull(), 'pre_1816_alliances'] = 0
df.loc[df['num_in_effect_1231_2012'].isnull(), 'num_in_effect_1231_2012'] = 0
df.loc[df['defense_treaties'].isnull(), 'defense_treaties'] = 0
df.loc[df['neutrality_treaties'].isnull(), 'neutrality_treaties'] = 0
df.loc[df['nonaggression_treaties'].isnull(), 'nonaggression_treaties'] = 0
df.loc[df['entente_treaties'].isnull(), 'entente_treaties'] = 0
df.loc[df['num_conflicts'].isnull(), 'num_conflicts'] = 0
df.loc[df['avg_cum_duration'].isnull(), 'avg_cum_duration'] = 0
df.loc[df['num_wars'].isnull(), 'num_wars'] = 0
df.loc[df['state_fatality_bin_mode'].isnull(), 'state_fatality_bin_mode'] = 0
df.loc[df['revision_type_mode'].isnull(), 'revision_type_mode'] = 0

df.loc[df['highest_mca_mode'].isnull(), 'highest_mca_mode'] = 1
df.loc[df['highest_hostility_mode'].isnull(), 'highest_hostility_mode'] = 1

# df.loc[df['avg_with'].isnull(), 'avg_with'] = 0
# df.loc[df['avg_against'].isnull(), 'avg_against'] = 0
# df.loc[df['origin_participant_pct'].isnull(), 'origin_participant_pct'] = 0
# df.loc[df['revision_pct'].isnull(), 'revision_pct'] = 0

In [54]:
df.to_pickle('pickle/df.pkl')

In [55]:
df[df['state_name_1'].isnull()].head()

Unnamed: 0,c_code_1,state_name_1,year,num_trade_states,export_dollars,import_dollars,military_expenditure,military_personnel,iron_steel_prod,prim_energy_consumption,total_pop,urban_pop,cinc_score,num_alliances,pre_1816_alliances,num_in_effect_1231_2012,defense_treaties,neutrality_treaties,nonaggression_treaties,entente_treaties,num_conflicts,avg_with,avg_against,avg_cum_duration,num_wars,origin_participant_pct,ongoing_2010,revision_pct,revision_type_mode,state_fatality_bin_mode,outcome_mode,settlement_type_mode,highest_mca_mode,highest_hostility_mode,total_protestant,total_catholic,total_christian,total_jewish,total_sunni,total_shia,total_islam,total_buddhist,total_hindu,total_sikh,total_shinto,total_non_religious,territory_change,territory_change_year,election_type,legislation_type,legislature_status,party_legal_status,party_existance,party_existance_outside_regime,legislature_parties,incumbent_type,collective_leadership,num_leadership_changes,leader_tenure,military_leader,royal_leader,nominal_vs_eff_diff,communist_leader,leader_died,democratic_regime,cabinet_assembly,popular_election,regime_type,transition_democracy,transition_dictatorship,age_govt,num_transitions_ever
299,223,,1950,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,,,,0,0,,,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,81.0,0.0
332,740,,1950,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,,,,0,0,,,1,1,150000.0,100000.0,676000.0,0.0,0.0,0.0,0.0,59852100.0,4500.0,0.0,65452839.0,1572000.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,5.0,0.0
333,260,,1950,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,,,,0,0,,,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,1.0
340,760,,1950,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,,,,0,0,,,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,0.0,0.0,44.0,0.0
342,265,,1950,,,,,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,0.0,0.0,,,,0,0,,,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2.0,2.0,2.0,2.0,1.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,1.0


In [56]:
len(df['c_code_1'].unique())

198

In [57]:
len(df)

8990

In [58]:
df.columns

Index(['c_code_1', 'state_name_1', 'year', 'num_trade_states',
       'export_dollars', 'import_dollars', 'military_expenditure',
       'military_personnel', 'iron_steel_prod', 'prim_energy_consumption',
       'total_pop', 'urban_pop', 'cinc_score', 'num_alliances',
       'pre_1816_alliances', 'num_in_effect_1231_2012', 'defense_treaties',
       'neutrality_treaties', 'nonaggression_treaties', 'entente_treaties',
       'num_conflicts', 'avg_with', 'avg_against', 'avg_cum_duration',
       'num_wars', 'origin_participant_pct', 'ongoing_2010', 'revision_pct',
       'revision_type_mode', 'state_fatality_bin_mode', 'outcome_mode',
       'settlement_type_mode', 'highest_mca_mode', 'highest_hostility_mode',
       'total_protestant', 'total_catholic', 'total_christian', 'total_jewish',
       'total_sunni', 'total_shia', 'total_islam', 'total_buddhist',
       'total_hindu', 'total_sikh', 'total_shinto', 'total_non_religious',
       'territory_change', 'territory_change_year', 'elect

In [59]:
df[([s % 5 == 0 for s in df['year']])].head()

Unnamed: 0,c_code_1,state_name_1,year,num_trade_states,export_dollars,import_dollars,military_expenditure,military_personnel,iron_steel_prod,prim_energy_consumption,total_pop,urban_pop,cinc_score,num_alliances,pre_1816_alliances,num_in_effect_1231_2012,defense_treaties,neutrality_treaties,nonaggression_treaties,entente_treaties,num_conflicts,avg_with,avg_against,avg_cum_duration,num_wars,origin_participant_pct,ongoing_2010,revision_pct,revision_type_mode,state_fatality_bin_mode,outcome_mode,settlement_type_mode,highest_mca_mode,highest_hostility_mode,total_protestant,total_catholic,total_christian,total_jewish,total_sunni,total_shia,total_islam,total_buddhist,total_hindu,total_sikh,total_shinto,total_non_religious,territory_change,territory_change_year,election_type,legislation_type,legislature_status,party_legal_status,party_existance,party_existance_outside_regime,legislature_parties,incumbent_type,collective_leadership,num_leadership_changes,leader_tenure,military_leader,royal_leader,nominal_vs_eff_diff,communist_leader,leader_died,democratic_regime,cabinet_assembly,popular_election,regime_type,transition_democracy,transition_dictatorship,age_govt,num_transitions_ever
281,100,Colombia,1950,74.0,340400000.1,371600000.1,41327000.0,17000.0,0.0,5926000000.0,11334000.0,1954000.0,0.0,21.0,0.0,20.0,20.0,0.0,21.0,20.0,0.0,,,0.0,0.0,,,,0,0,,,1,1,60000.0,10427280.0,10487280.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,2.0,2.0,2.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,2.0,1.0
282,770,Pakistan,1950,74.0,308899999.5,322319999.2,199398000.0,246000.0,6000000.0,3408000000.0,74620000.0,3635000.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,0.3,0.0,1.0,0.0,0.5,"[0, 1]","[1, 2]",5,3,18,4,496969.0,278333.0,936227.0,198.0,0.0,0.0,67158000.0,198000.0,5969600.0,99245.0,0.0,3783.0,1.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,4.0,0.0
283,678,Yemen Arab Republic,1950,74.0,,,,18000.0,0.0,2000000.0,3324000.0,0.0,0.0,13.0,0.0,0.0,4.0,1.0,11.0,13.0,0.0,,,0.0,0.0,,,,0,0,,,1,1,0.0,0.0,0.0,300000.0,1350000.0,1650000.0,3000000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,,,,,,0.0,0.0,0.0,3.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,5.0,0.0,0.0,43.0,0.0
284,900,Australia,1950,74.0,1345599995.5,1457960002.0,169340000.0,38000.0,2550000000.0,34312000000.0,8179000.0,4343000.0,0.01,1.0,0.0,0.0,1.0,0.0,0.0,1.0,2.0,2.0,17.0,0.38,2.0,0.0,0.0,0.0,0,4,0,0,22,5,1480362.0,1925471.0,7804437.0,35006.0,0.0,0.0,17994.0,9978.0,0.0,998.0,0.0,280376.0,0.0,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,50.0,0.0
285,365,Russia,1950,74.0,252100002.0,248909998.9,15510433000.0,4300000.0,54658000000.0,547936000000.0,180075000.0,33388000.0,0.18,11.0,0.0,0.0,10.0,2.0,4.0,3.0,2.0,2.5,1.5,0.9,0.0,1.0,0.0,1.0,2,0,"[0, 5]","[0, 3]","[7, 17]","[3, 4]",3857122.0,1900000.0,61415885.0,3907628.0,0.0,0.0,26000000.0,754514.0,0.0,0.0,0.0,86668696.0,0.0,0.0,3.0,2.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,29.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,35.0,0.0


In [60]:
df[df['military_expenditure'].isnull()]

Unnamed: 0,c_code_1,state_name_1,year,num_trade_states,export_dollars,import_dollars,military_expenditure,military_personnel,iron_steel_prod,prim_energy_consumption,total_pop,urban_pop,cinc_score,num_alliances,pre_1816_alliances,num_in_effect_1231_2012,defense_treaties,neutrality_treaties,nonaggression_treaties,entente_treaties,num_conflicts,avg_with,avg_against,avg_cum_duration,num_wars,origin_participant_pct,ongoing_2010,revision_pct,revision_type_mode,state_fatality_bin_mode,outcome_mode,settlement_type_mode,highest_mca_mode,highest_hostility_mode,total_protestant,total_catholic,total_christian,total_jewish,total_sunni,total_shia,total_islam,total_buddhist,total_hindu,total_sikh,total_shinto,total_non_religious,territory_change,territory_change_year,election_type,legislation_type,legislature_status,party_legal_status,party_existance,party_existance_outside_regime,legislature_parties,incumbent_type,collective_leadership,num_leadership_changes,leader_tenure,military_leader,royal_leader,nominal_vs_eff_diff,communist_leader,leader_died,democratic_regime,cabinet_assembly,popular_election,regime_type,transition_democracy,transition_dictatorship,age_govt,num_transitions_ever
1,678,Yemen Arab Republic,1946,65.00,,,,18000.00,0.00,0.00,3140000.00,0.00,0.00,9.00,0.00,0.00,0.00,1.00,7.00,9.00,0.00,,,0.00,0.00,,,,0,0,,,1,1,,,,,,,,,,,,,0.00,0.00,3.00,0.00,,,,,,0.00,0.00,0.00,21.00,0.00,1.00,0.00,0.00,1.00,0.00,0.00,0.00,5.00,0.00,0.00,39.00,0.00
3,700,Afghanistan,1946,65.00,,,,90000.00,0.00,0.00,8549000.00,225000.00,0.00,5.00,0.00,0.00,0.00,1.00,5.00,3.00,0.00,,,0.00,0.00,,,,0,0,,,1,1,,,,,,,,,,,,,0.00,0.00,3.00,2.00,2.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,14.00,0.00,1.00,0.00,0.00,1.00,0.00,0.00,0.00,5.00,0.00,0.00,18.00,0.00
7,712,Mongolia,1946,65.00,,,,,0.00,36000000.00,757000.00,0.00,0.00,1.00,0.00,0.00,1.00,0.00,0.00,0.00,0.00,,,0.00,0.00,,,,0,0,,,1,1,,,,,,,,,,,,,0.00,0.00,3.00,1.00,2.00,1.00,1.00,1.00,1.00,0.00,0.00,0.00,7.00,0.00,0.00,1.00,1.00,1.00,0.00,0.00,0.00,3.00,0.00,0.00,26.00,0.00
14,790,Nepal,1946,65.00,,,,20000.00,0.00,0.00,7333000.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,,0.00,0.00,,,,0,0,,,1,1,,,,,,,,,,,,,0.00,0.00,3.00,0.00,0.00,2.00,1.00,1.00,0.00,0.00,0.00,0.00,2.00,0.00,0.00,1.00,0.00,1.00,0.00,0.00,0.00,3.00,0.00,0.00,24.00,0.00
22,40,Cuba,1946,65.00,,,,18000.00,0.00,162000000.00,5040000.00,1224000.00,0.00,19.00,0.00,0.00,19.00,0.00,0.00,19.00,0.00,,,0.00,0.00,,,,0,0,,,1,1,,,,,,,,,,,,,0.00,0.00,1.00,2.00,2.00,2.00,2.00,2.00,2.00,0.00,0.00,0.00,14.00,1.00,0.00,1.00,0.00,1.00,1.00,0.00,1.00,2.00,0.00,0.00,11.00,0.00
29,670,Saudi Arabia,1946,65.00,,,,4000.00,0.00,23436000000.00,2930000.00,198000.00,0.00,9.00,0.00,4.00,0.00,1.00,7.00,9.00,0.00,,,0.00,0.00,,,,0,0,,,1,1,,,,,,,,,,,,,0.00,0.00,3.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,15.00,0.00,1.00,0.00,0.00,1.00,0.00,0.00,0.00,5.00,0.00,0.00,31.00,0.00
34,530,Ethiopia,1946,65.00,,,,10000.00,0.00,6000000.00,17652000.00,257000.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,,0.00,0.00,,,,0,0,,,1,1,,,,,,,,,,,,,1.00,0.00,3.00,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,17.00,0.00,1.00,0.00,0.00,1.00,0.00,0.00,0.00,5.00,0.00,0.00,77.00,0.00
36,450,Liberia,1946,65.00,,,,5000.00,0.00,2000000.00,664000.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,,,0.00,0.00,,,,0,0,,,1,1,,,,,,,,,,,,,0.00,0.00,1.00,2.00,2.00,2.00,2.00,2.00,1.00,0.00,0.00,0.00,3.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,3.00,0.00,0.00,77.00,0.00
38,93,Nicaragua,1946,65.00,,,,3000.00,0.00,0.00,949000.00,0.00,0.00,19.00,0.00,0.00,19.00,0.00,0.00,19.00,0.00,,,0.00,0.00,,,,0,0,,,1,1,,,,,,,,,,,,,0.00,0.00,1.00,0.00,2.00,2.00,2.00,2.00,2.00,1.00,0.00,0.00,10.00,1.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,4.00,0.00,0.00,10.00,0.00
62,95,Panama,1946,65.00,,,,0.00,0.00,16000000.00,721000.00,0.00,0.00,19.00,0.00,0.00,19.00,0.00,0.00,19.00,0.00,,,0.00,0.00,,,,0,0,,,1,1,,,,,,,,,,,,,0.00,0.00,1.00,2.00,2.00,2.00,2.00,2.00,2.00,0.00,0.00,0.00,2.00,0.00,0.00,0.00,0.00,1.00,0.00,0.00,0.00,3.00,0.00,0.00,44.00,0.00
