## Project 5 - Data Cleaning

### Import libraries

In [1]:
import pandas as pd
pd.set_option('display.max_rows', 25)

### Load data

In [2]:
df = pd.read_csv('yemen.csv', encoding='cp1252')

### Checkout null values

In [3]:
df.isnull().sum()

Incident ID                0
Date                       0
Governorate                0
District                 238
Area                    3299
Target                     0
Main category              0
Sub-category               0
Min Air Raids              0
Max Air Raids              2
Civilian Casualties        0
Fatalities                 0
Woman fatalities           0
Child fatalities           0
Injured                    0
Woman injured              0
Child injured              2
 Confirmed Time          146
Time of Day                0
Unnamed: 19            22360
Unnamed: 20            22360
Unnamed: 21            22360
Unnamed: 22            22436
dtype: int64

**Drop Columns:** 
- Unnamed: 19 
- Unnamed: 20 
- Unnamed: 21 
- Unnamed: 22 

These columns are mainly missing values and the information in the non-missing cells is duplicated in other columns.

In [4]:
df.drop(columns = ['Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22'], inplace=True)

In [5]:
len(df['Area'].unique())

5805

In [6]:
#df = df.drop(columns=['Unnamed: 19', 'Unnamed: 20',
#       'Unnamed: 21', 'Unnamed: 22'])

In [7]:
# df['Child injured'] = df['Child injured'].fillna(df['Child injured'].mean())
# df['Max Air Raids'] = df['Max Air Raids'].fillna(df['Max Air Raids'].mean())

In [8]:
df['Civilian Casualties'].value_counts()

0      20667
2        224
5        212
1        199
3        194
       ...  
77         1
101        1
85         1
100        1
95         1
Name: Civilian Casualties, Length: 89, dtype: int64

In [9]:
df['Main category'].value_counts()

Unknown                     8496
Military_Security_Target    7385
Civilian                    2879
Infrastructure              1447
Economic_infrastructure     1080
Economic_Infrastructure      298
Political_Tribal             294
Educational_facility         240
Educational_Facility         150
Cultural_heritage             73
Medical_Facility              60
Medical_facility              26
Media                         24
international_community       13
Political_Tribal              11
International_community        6
cultural_heritage              2
International_Community        1
Name: Main category, dtype: int64

### Clean up columns

#### Column names

In [10]:
# Clean up column names: Remove spaces, make lowercase, replace space with underscore
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_').str.replace('-','_')

In [11]:
df.columns

Index(['incident_id', 'date', 'governorate', 'district', 'area', 'target',
       'main_category', 'sub_category', 'min_air_raids', 'max_air_raids',
       'civilian_casualties', 'fatalities', 'woman_fatalities',
       'child_fatalities', 'injured', 'woman_injured', 'child_injured',
       'confirmed_time', 'time_of_day'],
      dtype='object')

#### Column values

In [12]:
# date: make index and convert to datetime 
df.set_index('date', inplace=True)
df.index = pd.to_datetime(df.index)
df.index

DatetimeIndex(['2015-03-26', '2015-03-26', '2015-03-26', '2015-03-26',
               '2015-03-26', '2015-03-26', '2015-03-26', '2015-03-26',
               '2015-03-26', '2015-03-26',
               ...
               '2020-12-29', '2020-12-30', '2020-12-30', '2020-12-30',
               '2020-12-31', '2020-12-31', '2020-12-31', '2020-12-31',
               '2020-12-31', '2020-12-31'],
              dtype='datetime64[ns]', name='date', length=22485, freq=None)

In [13]:
# incident id

In [14]:
# governorate
df['governorate'].value_counts()

Saada        5199
Taiz         2661
Sanaa        2568
Hajja        2429
Marib        2248
Hudaydah     1811
Capital      1418
Jawf         1284
Bayda         576
Amran         451
Lahj          395
Shabwa        320
Aden          293
Ibb           266
Dhalie        200
Dhamar        183
Abyan          85
Mahwit         72
Raymah         12
Hadramawt       8
lahj            2
Marib           2
Maharah         2
Name: governorate, dtype: int64

In [15]:
# district
district_list = []
for value in df['district']:
    value = str(value).lower().strip()
    district_list.append(value)

df['district'] = district_list

In [16]:
# area
df['area'].value_counts()

-                4360
The City          250
Al-Nahdain        247
Al-Rahabah        177
Al-Omary          145
                 ... 
Al Shuqayra’a       1
Al-Marhah           1
Al Abous            1
Bani Ali            1
Bani Shaddar        1
Name: area, Length: 5804, dtype: int64

In [17]:
# target
df['target'].value_counts()

Unknown                                      8364
Pro-Houthi forces                            2123
Residential Area                              590
Pro-Houthi sites                              403
Pro-Houthi positions                          396
                                             ... 
Pro-Houthi forces’ military training camp       1
Reasidential Areas                              1
Wadi mor bridge                                 1
Al-Janad Military Camp                          1
Storaeg unit                                    1
Name: target, Length: 2884, dtype: int64

In [18]:
# main_category
main_category_list = []
for value in df['main_category']:
    value = value.lower().strip()
    main_category_list.append(value)
df['main_category'] = main_category_list

In [19]:
# sub-category
sub_category_list = []
for value in df['sub_category']:
    value = value.lower().strip()
    sub_category_list.append(value)
df['sub_category'] = sub_category_list

In [20]:
# min_air_raids
df['min_air_raids'].isnull().sum()

0

In [21]:
# max_air_raids
max_air_raids = []
for value in df['max_air_raids']:
    if value == '2,2':
        val = 2
    elif value != '-' and type(value) != float:
        val = int(value.strip('+'))
    elif type(value) == float:
        val = value
    else:
        val = 0
    max_air_raids.append(val)
df['max_air_raids'] = max_air_raids

In [22]:
# civilian_casualties
df['civilian_casualties'].isnull().sum()

0

In [23]:
# fatalities
df['fatalities'].isnull().sum()

0

In [24]:
# woman_fatalities
df['woman_fatalities'].isnull().sum()

0

In [25]:
# child_fatalities
df['child_fatalities'].isnull().sum()

0

In [26]:
# injured
df['injured'].isnull().sum()

0

In [27]:
# woman_injured
df['woman_injured'].isnull().sum()

0

In [28]:
# child_injured
df['child_injured'] = df['child_injured'].fillna(df['child_injured'].median())

In [29]:
# confirmed_time
df['confirmed_time'] = df['confirmed_time'].replace(
    {'Unkn': 'Unknown', 'unkn': 'Unknown', '41': 'Unknown'})
df['confirmed_time'].value_counts()

Unknown    9187
18          959
17          952
16          873
19          831
21          819
20          807
22          772
24          748
23          671
11          595
1           589
15          581
10          550
12          504
13          461
9           452
14          435
2           385
8           242
3           241
4           222
5           164
6           151
7           148
Name: confirmed_time, dtype: int64

In [30]:
# time_of_day 
df['time_of_day'] = df['time_of_day'].str.lower()
df['time_of_day'] = df['time_of_day'].replace([' midday'], 'afternoon')
df['time_of_day'] = df['time_of_day'].replace(['midday'], 'afternoon')
df['time_of_day'] = df['time_of_day'].replace(['evening'], 'night')
df['time_of_day'] = df['time_of_day'].replace(['early-morning'], 'morning')
df['time_of_day'] = df['time_of_day'].replace(['early morning'], 'morning')

In [31]:
df['target'].value_counts()

Unknown                                      8364
Pro-Houthi forces                            2123
Residential Area                              590
Pro-Houthi sites                              403
Pro-Houthi positions                          396
                                             ... 
Pro-Houthi forces’ military training camp       1
Reasidential Areas                              1
Wadi mor bridge                                 1
Al-Janad Military Camp                          1
Storaeg unit                                    1
Name: target, Length: 2884, dtype: int64

In [32]:
df

Unnamed: 0_level_0,incident_id,governorate,district,area,target,main_category,sub_category,min_air_raids,max_air_raids,civilian_casualties,fatalities,woman_fatalities,child_fatalities,injured,woman_injured,child_injured,confirmed_time,time_of_day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2015-03-26,1,Capital,bani al-harith,Al-Rahabah,Al-Daylami Airbase,military_security_target,military site,1,2.0,0,0,0,0,0,0,0.0,2,morning
2015-03-26,2,Capital,bani al-harith,Airport,Sana'a International Airport,infrastructure,transport,1,2.0,0,0,0,0,0,0,0.0,2,morning
2015-03-26,3,Capital,bani al-harith,Al-Sonblah Neighbourhood,Residential Area,civilian,residential area,1,2.0,29,21,3,14,8,0,7.0,2,morning
2015-03-26,4,Capital,al-sab'ein,Al-Nahdain,Presidential Palace,military_security_target,military site,1,2.0,0,0,0,0,0,0,0.0,3,morning
2015-03-26,5,Capital,al-thawrah,Al-Nahdhah,Former 1st Armoured Division,military_security_target,military site,1,2.0,0,0,0,0,0,0,0.0,3,morning
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-31,22482,Sanaa,sanhan,Raymat Humayd Military Camp,Raymat Humayd Military Camp,military_security_target,military site,1,5.0,0,0,0,0,0,0,0.0,1,morning
2020-12-31,22483,Sanaa,sanhan,Raymat Humayd Military Camp,Raymat Humayd Military Camp,military_security_target,military site,1,2.0,0,0,0,0,0,0,0.0,2,morning
2020-12-31,22484,Sanaa,bani hushaysh,Wadi Rijam,Unknown,unknown,unknown,1,2.0,0,0,0,0,0,0,0.0,3,morning
2020-12-31,22485,Capital,bani al-harith,Ar Rahabah,Al-Daylami Air base,military_security_target,military site,1,1.0,0,0,0,0,0,0,0.0,1,morning


In [33]:
df[df.index == '2015-03-28']

Unnamed: 0_level_0,incident_id,governorate,district,area,target,main_category,sub_category,min_air_raids,max_air_raids,civilian_casualties,fatalities,woman_fatalities,child_fatalities,injured,woman_injured,child_injured,confirmed_time,time_of_day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2015-03-28,68,Capital,azal,Noqom,Weapons Storage,military_security_target,weapon storage,1,2.0,0,0,0,0,0,0,0.0,3,morning
2015-03-28,69,Capital,al-wahdah,Hadda,Residence of former Presiden Ali Abdullah Saleh,political_tribal,figure,1,2.0,0,0,0,0,0,0,0.0,3,morning
2015-03-28,70,Capital,bani al-harith,Al-Rahabah,Al-Daylami Airbase,military_security_target,military site,1,2.0,0,0,0,0,0,0,0.0,4,morning
2015-03-28,71,Capital,ma'ain,Al-Sunainah,Aviation College,military_security_target,military site,1,2.0,0,0,0,0,0,0,0.0,Unknown,morning
2015-03-28,72,Capital,al-thawrah,Al-Nahdhah,Former 1st Armoured Division,military_security_target,military site,1,2.0,0,0,0,0,0,0,0.0,19,night
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2015-03-28,96,Saada,al-dhaher,Al-Malaheet,105th Infantry Brigade (Al-Kamb Military Camp),military_security_target,military site,1,2.0,0,0,0,0,0,0,0.0,Unknown,night
2015-03-28,97,Saada,sa'ada,Qahzah,Al-Saifi Military Camp,military_security_target,military site,1,2.0,0,0,0,0,0,0,0.0,Unknown,night
2015-03-28,98,Saada,saqain,Al-Sha'af,Unknown,unknown,unknown,1,3.0,0,0,0,0,0,0,0.0,Unknown,night
2015-03-28,99,Saada,sa'ada,Kahlan,Kahlan Military Camp,military_security_target,military site,1,8.0,0,0,0,0,0,0,0.0,23,night
