In [23]:
import pandas as pd
from pycountry_convert import country_name_to_country_alpha3
import requests
import unidecode

In [4]:
# Labels applied, in file order, to the columns of the input CSV
# (passed as `names=` when the file is read).
column_names = [
    'EventCode',
    'Target_CountryCode_2', 'Target_Lat', 'Target_Long',
    'IsRootEvent', 'QuadClass', 'GoldsteinScale', 'AvgTone',
    'NumMentions', 'NumSources', 'NumArticles',
    'Target_GeoType',
    'Day',
    'Target_CountryCode', 'Source_CountryName', 'Target_CountryName',
]

In [5]:
# Read the event export, decoding it as latin-1 and labelling its columns
# with `column_names`.
# NOTE(review): EventCode shows up as an integer in the displayed rows
# (e.g. 20, 84), so any leading zeros in the original codes are not
# preserved by this read — confirm whether that matters downstream.
df_all = pd.read_csv(
    'translingual_2015-2017_cleaned_no_url.csv',
    names=column_names,
    encoding='latin-1',
)

# Convert country names to three-letter (alpha-3) country codes

In [8]:


# Source names that are not resolved to a country code; `country_to_cca3`
# returns 'DISCARDED' for any of them.
# NOTE(review): 'Central Africa Republic' looks like a country rather than a
# region or placeholder — confirm that discarding it is intended.
countries_to_discard = [
    'Near and Middle East Regional', 'Caribbean Regional',
    'Asia Regional', 'International',
    'Europe Regional', 'Central Africa Republic',
    'Africa Regional', 'Americas Regional',
    'Central America Regional', 'United Nations',
    'Latin America',
    # placeholder values
    'NOWEBSITE', 'NOENTRY',
]

# Download the mledoze/countries dataset and build a lookup from each entry's
# common name to its `cca3` code.
r = requests.get(
    'https://raw.githubusercontent.com/mledoze/countries/master/countries.json',
    timeout=30,  # do not let a stalled connection hang the notebook forever
)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
r.raise_for_status()
dict_countries_to_cca3 = {c['name']['common']: c['cca3'] for c in r.json()}

def country_to_cca3(c):
    """Translate a country name into a three-letter country code.

    Resolution order, after stripping surrounding whitespace:

    1. names listed in ``countries_to_discard`` yield ``'DISCARDED'``;
    2. names present in ``dict_countries_to_cca3`` yield the stored code;
    3. otherwise ``country_name_to_country_alpha3`` is asked;
    4. if that raises ``KeyError`` and the name contains characters that
       ``unidecode`` rewrites, the whole lookup is retried on the
       transliterated name.

    Args:
        c: Country name as a string.

    Returns:
        The resolved code, or the sentinel ``'DISCARDED'``.

    Raises:
        KeyError: The name cannot be resolved and transliteration leaves it
            unchanged.
    """
    name = c.strip()

    if name in countries_to_discard:
        return 'DISCARDED'

    if name in dict_countries_to_cca3:
        return dict_countries_to_cca3[name]

    try:
        return country_name_to_country_alpha3(name)
    except KeyError as lookup_error:
        ascii_name = unidecode.unidecode(name)
        if ascii_name != name:
            # The accents may be what prevented a match: retry without them.
            return country_to_cca3(ascii_name)
        # Transliteration changed nothing, so there is nothing left to try.
        raise lookup_error


# Aliases: (name to register, name that `country_to_cca3` can already resolve).
mismatches = [
    ('Holy See', 'Vatican City'),
    ('Wallis and Futuna Islands', 'Wallis and Futuna'),
    ('Reunion', 'Réunion'),
    # Kinshasa is the capital of the Democratic Republic of the Congo, so this
    # alias must not share a target with 'Congo Brazzaville' below; otherwise
    # both countries end up with the same code.
    ('Congo Kinshasa', 'DR Congo'),
    ('Timor Leste', 'Timor-Leste'),
    ('Congo Brazzaville', 'Congo'),
    ('Dutch Caribbean', 'Netherlands'),
    ('Cote d\'Ivoire', 'Côte d\'Ivoire'),
    ('Curacao', 'Curaçao'),
    ('Svalbard and Jan Mayen Islands', 'Norway'),
    ('Faeroe Islands', 'Faroe Islands'),
    ('Guinea Bissau', 'Guinea-Bissau')
]

# Register every alias under the code of its resolvable counterpart, so later
# lookups of the alias hit `dict_countries_to_cca3` directly.
for new_c, to_c in mismatches:
    dict_countries_to_cca3[new_c] = country_to_cca3(to_c)


        

In [9]:
# Derive a code column from the source country names; discarded names get the
# 'DISCARDED' sentinel returned by `country_to_cca3`.
source_names = df_all['Source_CountryName']
df_all['Source_CountryCode'] = source_names.apply(country_to_cca3)

In [10]:
# Despite its name, `discard_mask` is True for the rows that are KEPT
# (those whose source code is not the 'DISCARDED' sentinel).
discard_mask = df_all['Source_CountryCode'].ne('DISCARDED')
total = len(df_all)
n_not_discarded = discard_mask.sum()

# Percentage of rows that survive the filter.
(n_not_discarded / total) * 100

76.332893579487532

In [11]:
# Keep only the rows flagged True in `discard_mask`.
df_all = df_all.loc[discard_mask]

In [14]:
# Columns retained for the exported dataset, in output order.
new_column_names = [
    'Day', 'EventCode',
    'Source_CountryCode', 'Target_CountryCode',
    'Target_Lat', 'Target_Long', 'Target_GeoType',
    'IsRootEvent', 'QuadClass', 'GoldsteinScale', 'AvgTone',
    'NumMentions', 'NumSources', 'NumArticles',
]

In [15]:
# Restrict the frame to the retained columns, in the order given by
# `new_column_names`.
df_all = df_all.loc[:, new_column_names]

In [19]:
# Show the last ten rows as a sanity check.
df_all.tail(10)

Unnamed: 0,Day,EventCode,Source_CountryCode,Target_CountryCode,Target_Lat,Target_Long,Target_GeoType,IsRootEvent,QuadClass,GoldsteinScale,AvgTone,NumMentions,NumSources,NumArticles
711400770,20171130,20,USA,VEN,8.0,-66.0,1,0,1,3.0,-3.397213,1,1,1
711400771,20171130,84,USA,VEN,8.0,-66.0,1,0,2,7.0,-3.397213,6,1,6
711400772,20171130,20,USA,ZWE,-17.8178,31.0447,4,0,1,3.0,-3.397213,1,1,1
711400773,20171130,20,USA,VEN,8.0,-66.0,1,0,1,3.0,-3.397213,1,1,1
711400774,20171130,84,USA,ZWE,-17.8178,31.0447,4,0,2,7.0,-3.397213,1,1,1
711400775,20171130,84,USA,VEN,8.0,-66.0,1,0,2,7.0,-3.397213,1,1,1
711400776,20171130,173,ESP,VEN,10.5,-66.9167,4,1,4,-5.0,-5.124654,6,1,6
711400777,20171130,233,LBN,YEM,15.5,47.5,1,1,1,3.4,-1.851852,10,1,10
711400783,20171130,15,BIH,HRV,45.166667,15.5,1,1,1,0.0,-2.027027,20,1,10
711400785,20171130,43,BRA,BRA,-32.0333,-52.0833,4,1,1,2.8,-2.537313,10,1,10


In [20]:
# Write the cleaned frame (index included) as UTF-8 CSV. The path is handed to
# pandas directly so that pandas opens the file itself: a text handle opened
# without newline='' can produce doubled line endings on Windows, and the
# `encoding` argument is only meaningful when pandas does the opening.
df_all.to_csv('data_cleaned_powaaa.csv', encoding='utf-8')