## Project 5 - Data Cleaning

### Import libraries

In [1]:
import pandas as pd

### Load data

In [2]:
# Read the raw CSV; the file is decoded as cp1252 (Windows-1252) instead of
# pandas' default UTF-8.
df = pd.read_csv('yemen.csv', encoding='cp1252')

### Check for null values

In [3]:
# Number of missing values in each column.
df.isna().sum()

Incident ID                0
Date                       0
Governorate                0
District                 238
Area                    3299
Target                     0
Main category              0
Sub-category               0
Min Air Raids              0
Max Air Raids              2
Civilian Casualties        0
Fatalities                 0
Woman fatalities           0
Child fatalities           0
Injured                    0
Woman injured              0
Child injured              2
 Confirmed Time          146
Time of Day                0
Unnamed: 19            22360
Unnamed: 20            22360
Unnamed: 21            22360
Unnamed: 22            22436
dtype: int64

**Drop Columns:** 
- Unnamed: 19 
- Unnamed: 20 
- Unnamed: 21 
- Unnamed: 22 

These columns consist almost entirely of missing values, and the information in their few non-missing cells is duplicated in other columns.

In [4]:
# Drop the trailing 'Unnamed: N' columns. Rebinding df (instead of dropping in
# place) together with errors='ignore' keeps this cell safe to re-run: once the
# columns are gone, a second run is a no-op rather than a KeyError.
df = df.drop(
    columns=['Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22'],
    errors='ignore',
)

In [5]:
# Number of distinct 'Area' values; dropna=False counts the missing value as
# one entry, as len(unique()) does.
df['Area'].nunique(dropna=False)

5805

In [6]:
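# Inactive duplicate of the 'Unnamed: N' column drop performed in an earlier cell; kept for reference only.
#df = df.drop(columns=['Unnamed: 19', 'Unnamed: 20',
#       'Unnamed: 21', 'Unnamed: 22'])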

In [7]:
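# Inactive: mean imputation is not applied; child_injured is filled with the median in the "Column values" section.
# df['Child injured'] = df['Child injured'].fillna(df['Child injured'].mean())
# df['Max Air Raids'] = df['Max Air Raids'].fillna(df['Max Air Raids'].mean())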

In [8]:
# How many rows carry each distinct 'Civilian Casualties' value.
df['Civilian Casualties'].value_counts()

0      20667
2        224
5        212
1        199
3        194
       ...  
77         1
101        1
85         1
100        1
95         1
Name: Civilian Casualties, Length: 89, dtype: int64

In [9]:
# Row counts per raw 'Main category' label. The recorded output shows labels
# that differ only in letter case (e.g. Economic_infrastructure vs
# Economic_Infrastructure), which is why the column is normalised later.
df['Main category'].value_counts()

Unknown                     8496
Military_Security_Target    7385
Civilian                    2879
Infrastructure              1447
Economic_infrastructure     1080
Economic_Infrastructure      298
Political_Tribal             294
Educational_facility         240
Educational_Facility         150
Cultural_heritage             73
Medical_Facility              60
Medical_facility              26
Media                         24
international_community       13
Political_Tribal              11
International_community        6
cultural_heritage              2
International_Community        1
Name: Main category, dtype: int64

### Clean up columns

#### Column names

In [10]:
# Clean up column names: trim surrounding whitespace, lower-case, and turn
# spaces and hyphens into underscores.
normalised_names = df.columns.str.strip().str.lower()
df.columns = normalised_names.str.replace('[ -]', '_', regex=True)

In [11]:
# Display the column labels to confirm the renaming.
df.columns

Index(['incident_id', 'date', 'governorate', 'district', 'area', 'target',
       'main_category', 'sub_category', 'min_air_raids', 'max_air_raids',
       'civilian_casualties', 'fatalities', 'woman_fatalities',
       'child_fatalities', 'injured', 'woman_injured', 'child_injured',
       'confirmed_time', 'time_of_day'],
      dtype='object')

#### Column values

In [12]:
# date: make it the index and convert it to datetime.
# The guard keeps the cell re-runnable: after the first run 'date' is already
# the index, so set_index is skipped instead of raising KeyError. If the column
# is missing on a fresh frame, set_index still fails loudly.
if df.index.name != 'date':
    df = df.set_index('date')
df.index = pd.to_datetime(df.index)
df.index

DatetimeIndex(['2015-03-26', '2015-03-26', '2015-03-26', '2015-03-26',
               '2015-03-26', '2015-03-26', '2015-03-26', '2015-03-26',
               '2015-03-26', '2015-03-26',
               ...
               '2020-12-29', '2020-12-30', '2020-12-30', '2020-12-30',
               '2020-12-31', '2020-12-31', '2020-12-31', '2020-12-31',
               '2020-12-31', '2020-12-31'],
              dtype='datetime64[ns]', name='date', length=22485, freq=None)

In [13]:
# incident id

In [15]:
# Rows for the Sanaa governorate. The governorate column is not normalised in
# the cells shown here, so compare against a trimmed, lower-cased copy rather
# than the raw text.
# NOTE(review): the exact-match filter returned no rows in the recorded output;
# confirm the spelling used in the data (e.g. with value_counts()).
df[df['governorate'].str.strip().str.lower() == 'sanaa']

Unnamed: 0_level_0,incident_id,governorate,district,area,target,main_category,sub_category,min_air_raids,max_air_raids,civilian_casualties,fatalities,woman_fatalities,child_fatalities,injured,woman_injured,child_injured,confirmed_time,time_of_day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1


In [None]:
# governorate: row counts per governorate label
df['governorate'].value_counts()

In [None]:
# district: lower-case and trim surrounding whitespace.
# The vectorised .str accessor leaves missing districts as NaN. Casting each
# value with str() would turn them into the literal text 'nan', hiding them
# from isnull()/fillna() and creating a bogus 'nan' district.
df['district'] = df['district'].str.lower().str.strip()

In [None]:
# area: row counts per area label (value_counts skips missing values by default)
df['area'].value_counts()

In [None]:
# target: row counts per target label
df['target'].value_counts()

In [None]:
# main_category: lower-case and trim the labels so that variants differing only
# in letter case (or surrounding whitespace) collapse into a single category.
# Vectorised .str calls replace the per-row loop and pass missing values
# through as NaN instead of raising AttributeError.
df['main_category'] = df['main_category'].str.lower().str.strip()

In [None]:
# sub_category: lower-case and trim the labels.
# Vectorised .str calls replace the per-row loop and pass missing values
# through as NaN instead of raising AttributeError.
df['sub_category'] = df['sub_category'].str.lower().str.strip()

In [None]:
# min_air_raids: number of missing values
df['min_air_raids'].isna().sum()

In [None]:
# max_air_raids: convert the raw entries to numbers
def _parse_max_air_raids(raw):
    """Convert one raw max_air_raids entry to a number.

    '2,2' becomes 2, float entries (including NaN) are returned unchanged,
    '-' becomes 0, and any other entry is parsed with int() after removing
    leading/trailing '+' characters.
    """
    if raw == '2,2':
        return 2
    if type(raw) == float:
        # Already numeric (or a missing value): keep as is.
        return raw
    if raw != '-':
        return int(raw.strip('+'))
    # A lone dash is treated as zero raids.
    return 0


max_air_raids = [_parse_max_air_raids(raw) for raw in df['max_air_raids']]
df['max_air_raids'] = max_air_raids

In [None]:
# civilian_casualties: number of missing values
df['civilian_casualties'].isna().sum()

In [None]:
# fatalities: number of missing values
df['fatalities'].isna().sum()

In [None]:
# woman_fatalities: number of missing values
df['woman_fatalities'].isna().sum()

In [None]:
# child_fatalities: number of missing values
df['child_fatalities'].isna().sum()

In [None]:
# injured: number of missing values
df['injured'].isna().sum()

In [None]:
# woman_injured: number of missing values
df['woman_injured'].isna().sum()

In [None]:
# child_injured: fill missing values with the column median
child_injured_median = df['child_injured'].median()
df['child_injured'] = df['child_injured'].fillna(child_injured_median)

In [None]:
# confirmed_time: map the abbreviated / stray entries onto 'Unknown', then
# show the resulting label counts
confirmed_time_fixes = dict.fromkeys(['Unkn', 'unkn', '41'], 'Unknown')
df['confirmed_time'] = df['confirmed_time'].replace(confirmed_time_fixes)
df['confirmed_time'].value_counts()

In [None]:
# time_of_day: lower-case the labels, then fold variant labels into their
# canonical name (exact matches only; keys and values do not overlap, so one
# mapping is equivalent to applying the replacements one after another)
time_of_day_aliases = {
    ' midday': 'afternoon',
    'midday': 'afternoon',
    'evening': 'night',
    'early-morning': 'morning',
    'early morning': 'morning',
}
df['time_of_day'] = df['time_of_day'].str.lower().replace(time_of_day_aliases)

In [None]:
# target: row counts per target label (repeats the summary from the earlier target cell)
df['target'].value_counts()

In [None]:
# Display the DataFrame after the cleaning steps above.
df

In [None]:
# Rows whose index value equals 2015-03-28.
on_2015_03_28 = df.index == '2015-03-28'
df.loc[on_2015_03_28]