## Project 5 - Data Cleaning

### Import libraries

In [20]:
import pandas as pd
import math

# Limit DataFrame/Series previews to 10 rows so cell outputs stay short.
pd.options.display.max_rows = 10

### Load data

In [21]:
# Read the raw export; the file is decoded as cp1252 (Windows-1252).
RAW_DATA_PATH = 'data/raw_yemen_data.csv'
df = pd.read_csv(RAW_DATA_PATH, encoding='cp1252')

### Check for null values

In [22]:
# Count missing values per column.
df.isna().sum()

Incident ID        0
Date               0
Governorate        0
District         238
Area            3299
               ...  
Time of Day        0
Unnamed: 19    22360
Unnamed: 20    22360
Unnamed: 21    22360
Unnamed: 22    22436
Length: 23, dtype: int64

**Drop Columns:** 
- Unnamed: 19 
- Unnamed: 20 
- Unnamed: 21 
- Unnamed: 22 

These columns are mainly missing values and the information in the non-missing cells is duplicated in other columns.

In [23]:
# Remove the four trailing 'Unnamed' columns, which are almost entirely null.
unnamed_columns = ['Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22']
df = df.drop(columns=unnamed_columns)

In [24]:
# Number of distinct Area values; dropna=False counts NaN as one value,
# matching what len(unique()) reports.
df['Area'].nunique(dropna=False)

5805

In [25]:
#df = df.drop(columns=['Unnamed: 19', 'Unnamed: 20',
#       'Unnamed: 21', 'Unnamed: 22'])

In [None]:
# Frequency of each value in the raw 'Civilian Casualties' column.
civilian_casualty_counts = df['Civilian Casualties'].value_counts()
civilian_casualty_counts

In [None]:
# Frequency of each value in the raw 'Main category' column.
main_category_counts = df['Main category'].value_counts()
main_category_counts

### Clean up columns

#### Column names

In [None]:
# Normalise column names: trim surrounding whitespace, lower-case, and turn
# both spaces and hyphens into underscores.
df.columns = [
    name.strip().lower().replace(' ', '_').replace('-', '_')
    for name in df.columns
]

In [None]:
# Display the normalised column names.
cleaned_columns = df.columns
cleaned_columns

#### Column values


date: Make index and convert to datetime

In [None]:

# Move 'date' into the index and parse it into datetimes.
# NOTE(review): to_datetime infers the format here; confirm day/month order
# against the raw file.
df = df.set_index('date')
df.index = pd.to_datetime(df.index)
df.index


incident id: Check for duplicates

In [None]:
# True when any incident_id appears on more than one row.
incident_id_counts = df['incident_id'].value_counts()
bool((incident_id_counts > 1).any())

governorate: Check value counts.

In [None]:
# Number of rows per governorate.
governorate_counts = df['governorate'].value_counts()
governorate_counts

Check 'Capital' - There is 'Capital' which may be 'Sanaa' or 'Aden'
(The capital is currently disputed in Yemen, historically it is Sanaa but recently it's Aden)

Research on districts in relation to the capital:
* if district is bani al-harith, then capital = Sanaa
* if district is al-sab'ein, then capital = Sanaa
* if district is ma'ain, then capital = Sanaa
* if district is al-wahdah, then capital = Sanaa
* if district is azal, then capital = Sanaa
* shu'oub is near Sanaa

In [None]:
# Inspect the rows whose governorate is recorded as 'Capital'.
is_capital = df['governorate'].eq('Capital')
df.loc[is_capital]

In [None]:
# Relabel the 'Capital' governorate as 'Sanaa'.
df['governorate'] = df['governorate'].replace({'Capital': 'Sanaa'})

district: clean strings

In [None]:
# Normalise district names: lower-case and trim surrounding whitespace.
# na_action='ignore' keeps missing districts as NaN instead of converting them
# into the literal string 'nan', so they remain detectable with isna().
df['district'] = df['district'].map(
    lambda name: str(name).lower().strip(), na_action='ignore'
)

area: Change null values to string 'Unknown'.

In [None]:
# Keep known areas and label the missing ones 'Unknown'.
area_known = df['area'].notna()
df['area'] = df['area'].where(area_known, 'Unknown')

target: Show values of Target

In [None]:
# target: number of rows per target value.
target_counts = df['target'].value_counts()
target_counts


main_category: Clean values so they display better in graphs and in further EDA.

In [None]:
# main_category
# Normalise labels (lower-case, trimmed) with the vectorised string accessor.
# Unlike a per-value Python loop, this leaves missing values as NaN rather
# than raising AttributeError on them.
df['main_category'] = df['main_category'].str.lower().str.strip()


sub_category: Clean values so they display better in graphs and in further EDA.

In [None]:
# sub_category
# Normalise labels (lower-case, trimmed) with the vectorised string accessor.
# Unlike a per-value Python loop, this leaves missing values as NaN rather
# than raising AttributeError on them.
df['sub_category'] = df['sub_category'].str.lower().str.strip()

min air raids:  show null values

In [None]:
# min_air_raids: number of missing values.
df['min_air_raids'].isna().sum()

max_air_raids: Clean values. We understood the max air raids value to be a baseline estimate by the Yemen Data Project.
They noted that some air raids may have had more than the number listed and marked those values with a (+) sign. We
removed these signs and used the baseline known value as our max air raids value. Entries recorded as '-' are set to 0.
We also know that there are 2 null values in the data; we replace those two values with the median number of air raids
so as not to lose data.

In [None]:
# max_air_raids
def _parse_max_air_raids(raw):
    """Convert one raw max_air_raids cell into a number.

    '2,2' (an error in the CSV) becomes 2, float cells (including NaN) pass
    through unchanged, '-' becomes 0, and any other string has its '+'
    characters stripped from both ends before being converted to int.
    """
    if raw == '2,2':  # This was an error in the csv file
        return 2
    if type(raw) == float:
        return raw
    if raw != '-':
        return int(raw.strip('+'))
    return 0


df['max_air_raids'] = [_parse_max_air_raids(raw) for raw in df['max_air_raids']]

# Fill the values that are still missing with the column median.
median_max_air_raids = df['max_air_raids'].median()
df['max_air_raids'] = df['max_air_raids'].fillna(median_max_air_raids)



civilian_casualties:  show null values

In [None]:
# civilian_casualties: number of missing values.
df['civilian_casualties'].isna().sum()

fatalities:  show null values

In [None]:
# fatalities: number of missing values.
df['fatalities'].isna().sum()

woman_fatalities:  show null values

In [None]:
# woman_fatalities: number of missing values.
df['woman_fatalities'].isna().sum()


child_fatalities:  show null values

In [None]:
# child_fatalities: number of missing values.
df['child_fatalities'].isna().sum()

injured:  show null values

In [None]:
# injured: number of missing values.
df['injured'].isna().sum()

woman_injured:  show null values

In [None]:
# woman_injured: number of missing values.
df['woman_injured'].isna().sum()

child_injured:  There are 2 null values for child injured. We are replacing those values with the median.

In [None]:
# child_injured: fill missing values with the column median.
median_child_injured = df['child_injured'].median()
df['child_injured'] = df['child_injured'].fillna(median_child_injured)

confirmed_time: Convert the multiple notations for 'Unknown' into one uniform notation. Null values are filled with '18'.

In [None]:
# confirmed_time
# Collapse the different spellings of an unknown time (and the value '41')
# into 'Unknown', then fill the remaining nulls with '18'.
unknown_markers = ['Unkn', 'unkn', '41']
df['confirmed_time'] = (
    df['confirmed_time']
    .replace(unknown_markers, 'Unknown')
    .fillna('18')
)

In [None]:
# Inspect the rows whose confirmed_time is 'Unknown'.
time_unknown = df['confirmed_time'].eq('Unknown')
df.loc[time_unknown]

In [None]:
#confirmed_time_list = []
#for value in list(zip(df['confirmed_time'], df['time_of_day'])):
#    if value[0] != 'Unknown' or type(value[0]) == object or type(value[0]) == str:
#        val = value[0]
#    elif math.isnan(value[0]) == True:
#        val = 18
#    elif value[0] == 'Unknown':
#        if value[1].lower().strip() == 'early-morning':
#            val = 3
#        elif value[1].lower().strip() == 'morning':
#            val = 7
#        elif value[1].lower().strip() == 'midday':
#            val = 11 
#        elif value[1].lower().strip() == 'afternoon':
#            val = 14
#        elif value[1].lower().strip() == 'evening':
#            val = 18
#        elif value[1].lower().strip() == 'night':
#            val = 22
#    confirmed_time_list.append(val)
#
#df['confirmed_time'] = confirmed_time_list

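# NOTE: the commented-out block above is an unfinished attempt to impute 'Unknown'
# confirmed_time values from time_of_day. It would affect only about 146 of ~22,500
# rows, so finishing it is optional.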

time_of_day: We cleaned the data, then replaced the more specific timeframes with a simpler (morning, afternoon, night) format.

In [None]:
# time_of_day
# Normalise case and surrounding whitespace first so that every padded label
# (not only ' midday') is matched, then collapse the more specific labels into
# the coarser morning / afternoon / night labels in a single pass.
TIME_OF_DAY_BUCKETS = {
    'midday': 'afternoon',
    'evening': 'night',
    'early-morning': 'morning',
    'early morning': 'morning',
}
df['time_of_day'] = (
    df['time_of_day']
    .str.lower()
    .str.strip()
    .replace(TIME_OF_DAY_BUCKETS)
)

In [None]:
# Write the cleaned frame (index included) to the data directory for use in modeling.
CLEAN_DATA_PATH = 'data/clean_df.csv'
df.to_csv(CLEAN_DATA_PATH)