# Data cleaning

## Libraries used 

In [1]:
import pandas as pd
import os

## Loading data

In [2]:
data_filepath = os.path.join('data','globalterrorismdb_0617dist.xlsx')

data = pd.read_excel(data_filepath)

In [3]:
columns_to_drop = [
    'approxdate',
    'extended',
    'resolution',
    'summary',
    'crit1',
    'crit2',
    'crit3',
    'doubtterr',
    'alternative',
    'alternative_txt',
    'multiple',
    'related',
    'provstate',
    #'city',
    'vicinity',
    'location',
    'specificity',
    'success',
    'weapsubtype1',
    'weapsubtype1_txt',
    'weapsubtype2',
    'weapsubtype2_txt',
    'weapsubtype3',
    'weapsubtype3_txt',
    'weapsubtype4',
    'weapsubtype4_txt',
    'corp1',
    'target1',
    'natlty1',
    'natlty1_txt',
    'corp2',
    'target2',
    'natlty2',
    'natlty2_txt',
    'corp3',
    'target3',
    'natlty3',
    'natlty3_txt',
    'gsubname',
    'gsubname2',
    'gsubname3',
    'individual',
    'nperps',
    'nperpcap',
    'claimmode',
    'claimmode_txt',
    'compclaim',
    'claimmode2',
    'claimmode2_txt',
    'claimmode3',
    'claimmode3_txt',
    'nkillus',
    'nkillter',
    #'nwouldus',
    'nwoundte',
    'property',
    'propextent',
    'propextent_txt',
    'propvalue',
    'propcomment',
    'ishostkid',
    'nhostkid',
    #'ishostkidus',
    'nhostkidus',
    'nhours',
    'ndays',
    'divert',
    'kidhijcountry',
    'ransom',
    'ransomamt',
    #'ransomus',
    'ransomamtus',
    'ransompaid',
    'ransomnote',
    'hostkidoutcome',
    'hostkidoutcome_txt',
    'nreleased',
    'addnotes',
    'INT_LOG',
    'INT_IDEO',
    'INT_MISC',
    'INT_ANY',
    'scite1',
    'scite2',
    'scite3',
    'dbsource'
]

In [4]:
data.drop(columns_to_drop, axis=1, inplace=True)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 170350 entries, 0 to 170349
Data columns (total 53 columns):
eventid             170350 non-null int64
iyear               170350 non-null int64
imonth              170350 non-null int64
iday                170350 non-null int64
country             170350 non-null int64
country_txt         170350 non-null object
region              170350 non-null int64
region_txt          170350 non-null object
city                169904 non-null object
latitude            165744 non-null float64
longitude           165744 non-null float64
suicide             170350 non-null int64
attacktype1         170350 non-null int64
attacktype1_txt     170350 non-null object
attacktype2         5630 non-null float64
attacktype2_txt     5630 non-null object
attacktype3         374 non-null float64
attacktype3_txt     374 non-null object
targtype1           170350 non-null int64
targtype1_txt       170350 non-null object
targsubtype1        161005 non-null float64


### Looking at the data

From the info above, we can observe that not all the columns in our data are populated for each data entry. One set of columns that are really important for our visualization, latitude and longitude, is found between in columns with missing values. Given that our project has a big focus on the geo-location dimmension of the database, a decision has to be made.

Possible solutions:
* observing that for these data entries, the country is specified, we can infer approximate values for the latitude and longitude
* looking at the distribution by year and by country of the data, we can see that the number of data entries with missing localization data is rather small comparing to the number of data entries containing geolocalization. Therefore dropping those entries would not have a big impact  


In [6]:
# we set this in order to not have truncated dataframes when printed
pd.options.display.max_rows = 999

In order to back-up the first possible solution, we look at the number of data entries with missing values. Then we check if every data entry that is missing the geolocation has the country specified. We do this by grouping by country and sum the counts. The resulted number should be equal to the number of data entries with missing geolocation.

In [7]:
len(data[data.latitude.isnull()])

4606

In [8]:
data[data.latitude.isnull()].groupby('country_txt').count()['eventid'].sum()

4606

We can confirm that every data entry with missing geolocation has the country specified.

**Second solution**: We now try to gather more insight into the other possible solution: dropping the problematic entries.

For this, we want to know what fraction of the data has missing geolocation, considering first by country and then by year.

In [9]:
full_country_counts = data['country_txt'].value_counts()
missing_country_counts = data[data.latitude.isnull()]['country_txt'].value_counts()

In [10]:
pd.set_option('display.max_columns', None)

In [11]:
missing_fraction_country = missing_country_counts.divide(full_country_counts, fill_value=0)
missing_fraction_country

Afghanistan                         0.007695
Albania                             0.012658
Algeria                             0.048736
Andorra                             1.000000
Angola                              0.068966
Antigua and Barbuda                 0.000000
Argentina                           0.017370
Armenia                             0.000000
Australia                           0.000000
Austria                             0.009174
Azerbaijan                          0.063830
Bahamas                             0.000000
Bahrain                             0.043011
Bangladesh                          0.043614
Barbados                            0.000000
Belarus                             0.000000
Belgium                             0.000000
Belize                              0.125000
Benin                               0.125000
Bhutan                              0.166667
Bolivia                             0.031847
Bosnia-Herzegovina                  0.006289
Botswana  

At a first glance, the number of data entries with missing geolocation is rather small comparing with the total number of entries. 
Which mean that dropping this entries would not affect out visualization.
But we will deep further do identify if there are cases where the entries with missing geolocation account for a big part of the data.

In [12]:
significant_missing = missing_fraction_country[missing_fraction_country >= 0.5]
significant_missing

Andorra                1.0
Equatorial Guinea      0.5
South Yemen            0.5
St. Kitts and Nevis    0.5
Name: country_txt, dtype: float64

We see that for three countries half of the data is missing the geolocation and all data entries for Andorra. Thinking about completing the data by hand, we will now see how many cases are for each country.

In [13]:
missing_country_counts.loc[significant_missing.index]

Andorra                1
Equatorial Guinea      1
South Yemen            1
St. Kitts and Nevis    1
Name: country_txt, dtype: int64

There is only one entry per country, therefore we can put this data in by hand.

We can see below the details about these attacks in order to Google search a more precise location.

In [14]:
data[data.country_txt.isin(significant_missing.index)].query('latitude != latitude')

Unnamed: 0,eventid,iyear,imonth,iday,country,country_txt,region,region_txt,city,latitude,longitude,suicide,attacktype1,attacktype1_txt,attacktype2,attacktype2_txt,attacktype3,attacktype3_txt,targtype1,targtype1_txt,targsubtype1,targsubtype1_txt,targtype2,targtype2_txt,targsubtype2,targsubtype2_txt,targtype3,targtype3_txt,targsubtype3,targsubtype3_txt,gname,gname2,gname3,motive,guncertain1,guncertain2,guncertain3,claimed,claim2,claim3,weaptype1,weaptype1_txt,weaptype2,weaptype2_txt,weaptype3,weaptype3_txt,weaptype4,weaptype4_txt,weapdetail,nkill,nwound,nwoundus,ransompaidus
2397,197407140002,1974,7,14,7,Andorra,8,Western Europe,Unknown,,,0,2,Armed Assault,,,,,1,Business,3.0,Bank/Commerce,,,,,,,,,International Revolutionary Action Group (GARI),,,,0.0,,,,,,5,Firearms,,,,,,,Pistols,,,,
17846,198301200014,1983,1,20,406,South Yemen,10,Middle East & North Africa,Unknown,,,0,4,Hijacking,,,,,6,Airports & Aircraft,42.0,Aircraft (not at an airport),,,,,,,,,Palestinians,,,,0.0,,,,,,5,Firearms,,,,,,,Firearm,0.0,2.0,,
51181,199205040002,1992,5,4,62,Equatorial Guinea,11,Sub-Saharan Africa,Unknown,,,0,1,Assassination,,,,,14,Private Citizens & Property,83.0,Protester,,,,,,,,,Unknown,,,,0.0,,,,,,13,Unknown,,,,,,,,1.0,0.0,,
59591,199506250005,1995,6,25,189,St. Kitts and Nevis,2,Central America & Caribbean,Unknown,,,0,2,Armed Assault,,,,,14,Private Citizens & Property,84.0,Political Party Member/Rally,,,,,,,,,Unknown,,,,0.0,,,,,,8,Incendiary,,,,,,,,0.0,0.0,,


After multiple tries, we did not manage to find more data about these attacks on the internet. Therefore, because we want our visualization to not introduce misinformation, we will discard them except Andorra.

Given that there is just this case of terrorist attack and that the country is rather small, in order to reflect in out visualization that there was this kind of attack there, we can use the center of the country as the location for this attack.

** Missing entries distribution per year**

In [15]:
full_yearly_counts = data['iyear'].value_counts()
missing_yearly_counts = data[data.latitude.isnull()]['iyear'].value_counts()

In [16]:
missing_fraction_yearly = missing_yearly_counts.divide(full_yearly_counts, fill_value=0)
missing_fraction_yearly

1970    0.012289
1971    0.019149
1972    0.008065
1973    0.016913
1974    0.006897
1975    0.017568
1976    0.024919
1977    0.020470
1978    0.047837
1979    0.049605
1980    0.040195
1981    0.049516
1982    0.055403
1983    0.072822
1984    0.087268
1985    0.083362
1986    0.064336
1987    0.064070
1988    0.068280
1989    0.042100
1990    0.062259
1991    0.060645
1992    0.048492
1994    0.048872
1995    0.052905
1996    0.050065
1997    0.065000
1998    0.046088
1999    0.038710
2000    0.023166
2001    0.012585
2002    0.016517
2003    0.011886
2004    0.024957
2005    0.015928
2006    0.015278
2007    0.015736
2008    0.010410
2009    0.003179
2010    0.001659
2011    0.005916
2012    0.006118
2013    0.009170
2014    0.006465
2015    0.006800
2016    0.004004
Name: iyear, dtype: float64

In [17]:
missing_fraction_yearly[missing_fraction_yearly >= 0.1]

Series([], Name: iyear, dtype: float64)

We can remark that the number of data entries with missing geolocation is smaller than 10% for each year. Therefore, taking into consideration the year also, dropping these entries will not have a great impact for our visualization

## Decision

We decided to go with the second solution because infering the approximate values could create artificial patterns in our vizualisation and that would mean giving wrong information to our user. We will make an exception for Andorra where there is only one recorded attack in the entire database and so is worth infering the location from the country for this specific case.

In [18]:
andorra_lat = 42.544033
andorra_long = 1.556309
data.loc[data['country_txt']=='Andorra', 'latitude'] = andorra_lat
data.loc[data['country_txt']=='Andorra', 'longitude'] = andorra_long

We can now keep only the data entries that have geolocation data.

In [19]:
data = data[~data.latitude.isnull()]

In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 165745 entries, 0 to 170349
Data columns (total 53 columns):
eventid             165745 non-null int64
iyear               165745 non-null int64
imonth              165745 non-null int64
iday                165745 non-null int64
country             165745 non-null int64
country_txt         165745 non-null object
region              165745 non-null int64
region_txt          165745 non-null object
city                165316 non-null object
latitude            165745 non-null float64
longitude           165745 non-null float64
suicide             165745 non-null int64
attacktype1         165745 non-null int64
attacktype1_txt     165745 non-null object
attacktype2         5557 non-null float64
attacktype2_txt     5557 non-null object
attacktype3         370 non-null float64
attacktype3_txt     370 non-null object
targtype1           165745 non-null int64
targtype1_txt       165745 non-null object
targsubtype1        156622 non-null float64


Inspecting the info above about the columns data type, we can remark that the type is object only for columns that contain text. This mean the data is consistent in terms of values for the other columns (i.e. if integers are used as a codification, then they are used throughout all data entries) 

In [21]:
data.head()

Unnamed: 0,eventid,iyear,imonth,iday,country,country_txt,region,region_txt,city,latitude,longitude,suicide,attacktype1,attacktype1_txt,attacktype2,attacktype2_txt,attacktype3,attacktype3_txt,targtype1,targtype1_txt,targsubtype1,targsubtype1_txt,targtype2,targtype2_txt,targsubtype2,targsubtype2_txt,targtype3,targtype3_txt,targsubtype3,targsubtype3_txt,gname,gname2,gname3,motive,guncertain1,guncertain2,guncertain3,claimed,claim2,claim3,weaptype1,weaptype1_txt,weaptype2,weaptype2_txt,weaptype3,weaptype3_txt,weaptype4,weaptype4_txt,weapdetail,nkill,nwound,nwoundus,ransompaidus
0,197000000001,1970,7,2,58,Dominican Republic,2,Central America & Caribbean,Santo Domingo,18.456792,-69.951164,0,1,Assassination,,,,,14,Private Citizens & Property,68.0,Named Civilian,,,,,,,,,MANO-D,,,,0.0,,,,,,13,Unknown,,,,,,,,1.0,0.0,,
1,197000000002,1970,0,0,130,Mexico,1,North America,Mexico city,19.432608,-99.133207,0,6,Hostage Taking (Kidnapping),,,,,7,Government (Diplomatic),45.0,"Diplomatic Personnel (outside of embassy, cons...",,,,,,,,,23rd of September Communist League,,,,0.0,,,,,,13,Unknown,,,,,,,,0.0,0.0,,
2,197001000001,1970,1,0,160,Philippines,5,Southeast Asia,Unknown,15.478598,120.599741,0,1,Assassination,,,,,10,Journalists & Media,54.0,Radio Journalist/Staff/Facility,,,,,,,,,Unknown,,,,0.0,,,,,,13,Unknown,,,,,,,,1.0,0.0,,
3,197001000002,1970,1,0,78,Greece,8,Western Europe,Athens,37.983773,23.728157,0,3,Bombing/Explosion,,,,,7,Government (Diplomatic),46.0,Embassy/Consulate,,,,,,,,,Unknown,,,,0.0,,,,,,6,Explosives/Bombs/Dynamite,,,,,,,Explosive,,,,
4,197001000003,1970,1,0,101,Japan,4,East Asia,Fukouka,33.580412,130.396361,0,7,Facility/Infrastructure Attack,,,,,7,Government (Diplomatic),46.0,Embassy/Consulate,,,,,,,,,Unknown,,,,0.0,,,,,,8,Incendiary,,,,,,,Incendiary,,,,


As discussed in the project book,  we will keep certain columns are reference. For this, we will create a dictionary containing the columns with their conventions which we will later save as json.

In [22]:
conventions_dict = {}

In [23]:
conventions_dict['attacktype'] = data[['attacktype1', 'attacktype1_txt']].set_index('attacktype1').to_dict()['attacktype1_txt']
conventions_dict['attacktype']

{1: 'Assassination',
 2: 'Armed Assault',
 3: 'Bombing/Explosion',
 4: 'Hijacking',
 5: 'Hostage Taking (Barricade Incident)',
 6: 'Hostage Taking (Kidnapping)',
 7: 'Facility/Infrastructure Attack',
 8: 'Unarmed Assault',
 9: 'Unknown'}

In [24]:
conventions_dict['targtype'] = data[['targtype1','targtype1_txt']].set_index('targtype1').to_dict()['targtype1_txt']
conventions_dict['targtype']

{1: 'Business',
 2: 'Government (General)',
 3: 'Police',
 4: 'Military',
 5: 'Abortion Related',
 6: 'Airports & Aircraft',
 7: 'Government (Diplomatic)',
 8: 'Educational Institution',
 9: 'Food or Water Supply',
 10: 'Journalists & Media',
 11: 'Maritime',
 12: 'NGO',
 13: 'Other',
 14: 'Private Citizens & Property',
 15: 'Religious Figures/Institutions',
 16: 'Telecommunication',
 17: 'Terrorists/Non-State Militia',
 18: 'Tourists',
 19: 'Transportation',
 20: 'Unknown',
 21: 'Utilities',
 22: 'Violent Political Party'}

In [25]:
targsubtype_notnull_index = data.targsubtype1.notnull()
conventions_dict['targsubtype'] = data[targsubtype_notnull_index][['targsubtype1','targsubtype1_txt']].set_index('targsubtype1').to_dict()['targsubtype1_txt']
conventions_dict['targsubtype']

{1.0: 'Gas/Oil',
 2.0: 'Restaurant/Bar/Café',
 3.0: 'Bank/Commerce',
 4.0: 'Multinational Corporation',
 5.0: 'Industrial/Textiles/Factory',
 6.0: 'Medical/Pharmaceutical',
 7.0: 'Retail/Grocery/Bakery',
 8.0: 'Hotel/Resort',
 9.0: 'Farm/Ranch',
 10.0: 'Mining',
 11.0: 'Entertainment/Cultural/Stadium/Casino',
 12.0: 'Construction',
 13.0: 'Private Security Company/Firm',
 14.0: 'Judge/Attorney/Court',
 15.0: 'Politician or Political Party Movement/Meeting/Rally',
 16.0: 'Royalty',
 17.0: 'Head of State',
 18.0: 'Government Personnel (excluding police, military)',
 19.0: 'Election-related',
 20.0: 'Intelligence',
 21.0: 'Government Building/Facility/Office',
 22.0: 'Police Building (headquarters, station, school)',
 23.0: 'Police Patrol (including vehicles and convoys)',
 24.0: 'Police Checkpoint',
 25.0: 'Police Security Forces/Officers',
 26.0: 'Prison/Jail',
 27.0: 'Military Barracks/Base/Headquarters/Checkpost',
 28.0: 'Military Recruiting Station/Academy',
 29.0: 'Military Unit/Pat

In [27]:
conventions_dict['weaptype'] = data[['weaptype1','weaptype1_txt']].set_index('weaptype1').to_dict()['weaptype1_txt']
if 4 not in conventions_dict['weaptype']:
    conventions_dict['weaptype'][4]='Nuclear'
conventions_dict['weaptype']

{1: 'Biological',
 2: 'Chemical',
 3: 'Radiological',
 4: 'Nuclear',
 5: 'Firearms',
 6: 'Explosives/Bombs/Dynamite',
 7: 'Fake Weapons',
 8: 'Incendiary',
 9: 'Melee',
 10: 'Vehicle (not to include vehicle-borne explosives, i.e., car or truck bombs)',
 11: 'Sabotage Equipment',
 12: 'Other',
 13: 'Unknown'}

In [28]:
conventions_dict['country'] = data[['country', 'country_txt']].set_index('country').to_dict()['country_txt']
conventions_dict['country']

{4: 'Afghanistan',
 5: 'Albania',
 6: 'Algeria',
 7: 'Andorra',
 8: 'Angola',
 10: 'Antigua and Barbuda',
 11: 'Argentina',
 12: 'Armenia',
 14: 'Australia',
 15: 'Austria',
 16: 'Azerbaijan',
 17: 'Bahamas',
 18: 'Bahrain',
 19: 'Bangladesh',
 20: 'Barbados',
 21: 'Belgium',
 22: 'Belize',
 23: 'Benin',
 25: 'Bhutan',
 26: 'Bolivia',
 28: 'Bosnia-Herzegovina',
 29: 'Botswana',
 30: 'Brazil',
 31: 'Brunei',
 32: 'Bulgaria',
 33: 'Burkina Faso',
 34: 'Burundi',
 35: 'Belarus',
 36: 'Cambodia',
 37: 'Cameroon',
 38: 'Canada',
 41: 'Central African Republic',
 42: 'Chad',
 43: 'Chile',
 44: 'China',
 45: 'Colombia',
 46: 'Comoros',
 47: 'Republic of the Congo',
 49: 'Costa Rica',
 50: 'Croatia',
 51: 'Cuba',
 53: 'Cyprus',
 54: 'Czech Republic',
 55: 'Denmark',
 56: 'Djibouti',
 57: 'Dominica',
 58: 'Dominican Republic',
 59: 'Ecuador',
 60: 'Egypt',
 61: 'El Salvador',
 62: 'Equatorial Guinea',
 63: 'Eritrea',
 64: 'Estonia',
 65: 'Ethiopia',
 66: 'Falkland Islands',
 67: 'Fiji',
 68: 'F

In [29]:
conventions_dict['region'] = data[['region','region_txt']].set_index('region').to_dict()['region_txt']
conventions_dict['region']

{1: 'North America',
 2: 'Central America & Caribbean',
 3: 'South America',
 4: 'East Asia',
 5: 'Southeast Asia',
 6: 'South Asia',
 7: 'Central Asia',
 8: 'Western Europe',
 9: 'Eastern Europe',
 10: 'Middle East & North Africa',
 11: 'Sub-Saharan Africa',
 12: 'Australasia & Oceania'}

We can now drop the textual columns because we created the dictionary with their value.

In [38]:
textual_columns_to_drop = ['country_txt','region_txt','attacktype1_txt','attacktype2_txt','attacktype3_txt',
                          'targtype1_txt','targsubtype1_txt','targtype2_txt','targsubtype2_txt','targtype3_txt',
                          'targsubtype3_txt','weaptype1_txt','weaptype2_txt','weaptype3_txt',
                          'weaptype4_txt']
data = data.drop(textual_columns_to_drop,axis=1)

ValueError: labels ['country_txt' 'region_txt' 'attacktype1_txt' 'attacktype2_txt'
 'attacktype3_txt' 'targtype1_txt' 'targsubtype1_txt' 'targtype2_txt'
 'targsubtype2_txt' 'targtype3_txt' 'targsubtype3_txt' 'weaptype1_txt'
 'weaptype2_txt' 'weaptype3_txt' 'weaptype4_txt'] not contained in axis

Now we can dump the dataframe to a json file. A thing to notice here is because we wanted the json file the have a size as small as possible, we chose the orient option to be values. This means that the json file will contain an array of arrays (a matrix), where each row is a data entry. We will also create a json file with the columns so that we know what the columns represent

In [39]:
data.to_json('cleaned_data.json', orient='values')
columns_dict = {}
for col, idx in zip(data.columns, range(len(data.columns))):
    columns_dict[col] = idx

We now dump to json the metadata.

In [41]:
import json
with open('conventions.json', 'w') as fp:
    json.dump(str(conventions_dict), fp)
    
with open('columns.json', 'w') as fp:
    json.dump(str(columns_dict), fp)