In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the raw UCDP conflict-termination (dyadic, 2015) data set.
wars = pd.read_csv('ucdp-term-dyadic-2015.csv')
# Preview only the first rows instead of rendering the whole frame.
wars.head()

In [None]:
# Remove the columns of the raw file that the rest of the notebook never reads.
unused_columns = [
    'DyadEp', 'SideBID', 'StartPrec', 'StartDate2', 'StartPrec2',
    'Dyadterm', 'EpEndPrec', 'Outcome_early',
    'GWNoA', 'GWNoA2nd', 'GWNoB', 'GWNoB2nd', 'GWNoLoc',
    'Version', 'Type2',
]
dropped = wars.drop(columns=unused_columns)
dropped.head()

In [None]:
# Decode the numeric Incompatibility codes into readable labels.
incompatibility = {
    1: 'Territory',
    2: 'Government',
    3: 'Government and Territory',
}
decoded_incompatibility = dropped['Incompatibility'].replace(incompatibility)
dropped['Incompatibility'] = decoded_incompatibility
dropped.head()

In [None]:
# Decode the numeric IntensityLevel codes into readable labels.
intensity = {
    1: 'Minor (between 25 and 999 battle-related deaths)',
    2: 'War (at least 1000 battle-related deaths)',
}
decoded_intensity = dropped['IntensityLevel'].replace(intensity)
dropped['IntensityLevel'] = decoded_intensity
dropped.head()

In [None]:
# Decode the numeric TypeOfConflict codes into readable labels.
conflict = {
    1: 'Extrasystemic armed conflict',
    2: 'Interstate armed conflict',
    3: 'Internal armed conflict',
    4: 'Internationalized internal armed',
}
decoded_conflict_type = dropped['TypeOfConflict'].replace(conflict)
dropped['TypeOfConflict'] = decoded_conflict_type
dropped.head()

In [None]:
# Decode the Outcome codes (keyed as strings here) into readable labels;
# a lone period is mapped to NaN.
outcome = {
    '1': 'Peace agreement',
    '2': 'Ceasefire',
    '3': 'Victory for Side A /Government Side',
    '4': 'Victory for Side B /Rebel Side',
    '5': 'Low activity (less than 25 battle-deaths)',
    '6': 'Actor ceases to exist',
    '.': np.nan,
}
decoded_outcome = dropped['Outcome'].replace(outcome)
dropped['Outcome'] = decoded_outcome
dropped.head()

In [None]:
# Substitute every region code that appears inside the Region strings with
# the region's name. Substring replacement (rather than whole-value
# replacement) is used, so each code occurrence within a value is rewritten.
region_names = {
    '1': 'Europe',
    '2': 'Middle East',
    '3': 'Asia',
    '4': 'Africa',
    '5': 'Americas',
}
for region_code, region_name in region_names.items():
    dropped['Region'] = dropped['Region'].str.replace(region_code, region_name)
dropped.head()

In [None]:
# Any cell that is exactly '.' is converted to NaN across the whole frame.
period = {'.': np.nan}
dropped = dropped.replace(to_replace=period)
dropped.head()

In [None]:
# Give a handful of columns more descriptive names.
renamed_columns = {
    'SideA2nd': 'SideA_Ally',
    'SideB2nd': 'SideB_Ally',
    'EpEndDate': 'EndDate',
    'CfireDate': 'CeasefireDate',
    'PeAgDate': 'PeaceAgreementDate',
    'Incompatibility': 'DisagreementOver',
}
dropped = dropped.rename(columns=renamed_columns)

In [None]:
# Preview the first rows of the frame after the column renames.
dropped.head()

In [None]:
# Show rows whose Location has no comma and whose TypeOfConflict contains
# 'Internal armed conflict'.
no_comma_in_location = ~dropped["Location"].str.contains(",")
is_internal_conflict = dropped["TypeOfConflict"].str.contains('Internal armed conflict')
dropped[no_comma_in_location & is_internal_conflict]

In [None]:
# Save the cleaned frame to cleanedWars2.csv, using Year as the index column.
dropped.set_index('Year').to_csv('cleanedWars2.csv')

### Edits to dataset after meeting with professor

In [None]:
# Columns that were judged unnecessary after the meeting with the professor.
no_longer_needed = [
    'Year',
    'SideA_Ally',
    'SideB_Ally',
    'TerritoryName',
    'CeasefireDate',
    'PeaceAgreementDate',
]
dropped = dropped.drop(columns=no_longer_needed)
dropped.head()

In [None]:
# Remove rows whose Location contains a comma (","), i.e. interstate wars.
location_has_comma = dropped["Location"].str.contains(",")
dropped = dropped[~location_has_comma]
# Check: no comma-containing locations should remain, so this displays no rows.
dropped[dropped["Location"].str.contains(",")]

In [None]:
# Keep only the rows whose TypeOfConflict contains 'Internal armed conflict';
# every other conflict type is discarded.
is_internal = dropped["TypeOfConflict"].str.contains('Internal armed conflict')
dropped = dropped[is_internal]
dropped.head()

In [None]:
# Every remaining NaN cell is replaced by the literal string "Missing".
dropped = dropped.fillna(value='Missing')
dropped.head()

In [None]:
# Parse StartDate into datetimes and keep the calendar year in StartYear.
start_dates = pd.to_datetime(dropped['StartDate'])
dropped['StartDate'] = start_dates
dropped['StartYear'] = start_dates.dt.year
dropped.head()

In [None]:
# Extracting end years of wars from end dates

# Missing end dates are turned into NaT so the column can be parsed as
# datetimes; EndDate itself is left untouched.
end_dates = pd.to_datetime(dropped['EndDate'].replace('Missing', 'NaT'))
end_years = end_dates.dt.year

# pd.to_datetime can place two-digit years in the 2000s (the fix_century step
# later in this notebook corrects EndDate for exactly that reason, using 18 as
# the cut-off). Apply the same cut-off here so that EndYear is not a century
# too late, which would otherwise distort the "latest end year" selection.
# NaN years compare False and are therefore left as NaN.
end_years = end_years.mask(end_years > 2018, end_years - 100)

dropped['EndYear'] = end_years
# Rows without an end date get the "Missing" placeholder in EndYear as well.
dropped = dropped.fillna('Missing')

dropped.head(10)

In [None]:
# Approach 1: keep only the rows that have an end date. Some conflicts still
# end up with more than one row, because more than one end date was listed.
has_end_date = dropped['EndDate'].ne('Missing')
conflict_rows = dropped[has_end_date]
conflict_rows.head()

### The code below produces the final dataframe that is currently used in the Civil Wars notebook

In [None]:
# Approach 2: for every (ConflictId, DyadId) pair keep the row(s) whose EndYear
# equals the latest EndYear of that pair.

# Turn the "Missing" placeholders back into NaN so they are ignored by max.
one = dropped.replace('Missing', np.nan)

# Latest end year of each (ConflictId, DyadId) group, aligned to the rows of
# `one`. The string 'max' selects pandas' own NaN-skipping groupby max; recent
# pandas versions warn when the Python built-in `max` is passed instead.
latest_end_year = one.groupby(['ConflictId', 'DyadId'])['EndYear'].transform('max')
# Rows without an EndYear compare False (NaN != NaN) and are dropped here.
idx = latest_end_year == one['EndYear']

# Take an explicit copy: later cells assign columns on one_row, which must not
# write into a slice of `one`.
one_row = one[idx].copy()
one_row.head()

In [None]:
# Reformatting values including start and end date formats

# fillna returns a new frame, so its result has to be assigned for the
# "Missing" placeholders to actually be restored.
one_row = one_row.fillna('Missing')

# Render both date columns as mm/dd/YYYY strings.
one_row['StartDate'] = one_row['StartDate'].dt.strftime('%m/%d/%Y')
one_row['EndDate'] = pd.to_datetime(one_row['EndDate']).dt.strftime('%m/%d/%Y')

one_row.head()

In [None]:
# Dropping StartYear and EndYear columns

# Reassign instead of using inplace=True, so the drop does not mutate a frame
# that was obtained by filtering another one.
one_row = one_row.drop(columns=['StartYear', 'EndYear'])
one_row.head()

In [None]:
# Rows whose Outcome is the "Missing" placeholder. na=False makes NaN outcomes
# count as "no match" instead of producing an invalid boolean mask.
one_row[one_row["Outcome"].str.contains("Missing", na=False)]

In [None]:
# Compare how many rows each of the two approaches produced.
approach_1, approach_2 = len(conflict_rows), len(one_row)

print('Approach 1 number of rows: ', approach_1)
print('Approach 2 number of rows:', approach_2)

In [None]:
# by default, pd.to_datetime assumes 2-digit years < 70 are in the 2000s
# reformat appropriate dates to be in the 1900s
def fix_century(date):
    """Rewrite the century of a 'mm/dd/yyyy' string to 19 when yy > 18.

    Parameters
    ----------
    date : object
        Normally a date string formatted as 'mm/dd/yyyy'.

    Returns
    -------
    object
        The date with its century digits replaced by '19' when the last two
        year digits compare greater than "18"; otherwise the input unchanged.
        Values that are not 'mm/dd/yyyy' strings (NaN, None, placeholders such
        as 'Missing') are returned untouched instead of being corrupted or
        raising.
    """
    is_formatted_date = (
        isinstance(date, str)
        and len(date) == 10
        and date[2] == "/"
        and date[5] == "/"
        and date[6:].isdigit()
    )
    if is_formatted_date and date[-2:] > "18":
        # Keep 'mm/dd/' and the two-digit year, swap the century for 19.
        date = date[:6] + "19" + date[-2:]
    return date
# Apply the century correction to every end date.
corrected_end_dates = one_row["EndDate"].apply(fix_century)
one_row.loc[:, "EndDate"] = corrected_end_dates

# sanity check: start date should always be before end date
# this currently fails because start date > end date in original data set for row 2556
#assert (pd.to_datetime(one_row["StartDate"]) < pd.to_datetime(one_row["EndDate"])).all()

# sanity check: the four-digit year text of every start and end date must
# compare greater than "1935" and less than "2018"
for date_column in ("StartDate", "EndDate"):
    year_text = one_row[date_column].str[6:10]
    assert (year_text > "1935").all()
    assert (year_text < "2018").all()

In [None]:
# Save the one-row-per-dyad frame to oneWar.csv, using ConflictId as the index column.
one_row.set_index('ConflictId').to_csv('oneWar.csv')