## Project 5 - Data Cleaning

### Import libraries

In [1]:
import pandas as pd
# Cap DataFrame/Series reprs at 10 rows so cell outputs stay short.
pd.set_option('display.max_rows', 10)

### Load data

In [2]:
# Read the raw CSV from the working directory; the file is decoded as Windows-1252 (cp1252).
df = pd.read_csv('yemen.csv', encoding='cp1252')

### Check for null values

In [3]:
# Number of missing values in each column.
df.isna().sum()

Incident ID        0
Date               0
Governorate        0
District         238
Area            3299
               ...  
Time of Day        0
Unnamed: 19    22360
Unnamed: 20    22360
Unnamed: 21    22360
Unnamed: 22    22436
Length: 23, dtype: int64

**Drop Columns:** 
- Unnamed: 19 
- Unnamed: 20 
- Unnamed: 21 
- Unnamed: 22 

These columns consist almost entirely of missing values, and the information in their few non-missing cells is duplicated in other columns.

In [4]:
# Drop the four mostly-empty 'Unnamed' columns.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: a second execution is a no-op rather than a KeyError.
df = df.drop(
    columns=['Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22'],
    errors='ignore',
)

In [5]:
# Number of distinct 'Area' values; a missing value counts as one distinct entry.
df['Area'].nunique(dropna=False)

5805

In [6]:
# NOTE(review): disabled leftover -- it repeats the drop of the four 'Unnamed'
# columns that the earlier drop cell already performs. Candidate for deletion.
#df = df.drop(columns=['Unnamed: 19', 'Unnamed: 20',
#       'Unnamed: 21', 'Unnamed: 22'])

In [7]:
# NOTE(review): disabled mean-imputation experiment; no visible cell depends on it.
# Confirm it can be removed. It uses the raw (pre-rename) column names.
# df['Child injured'] = df['Child injured'].fillna(df['Child injured'].mean())
# df['Max Air Raids'] = df['Max Air Raids'].fillna(df['Max Air Raids'].mean())

In [8]:
# Frequency of each distinct 'Civilian Casualties' value.
df['Civilian Casualties'].value_counts()

0      20667
2        224
5        212
1        199
3        194
       ...  
77         1
101        1
85         1
100        1
95         1
Name: Civilian Casualties, Length: 89, dtype: int64

In [9]:
# Frequency of each 'Main category' label; useful for spotting labels that
# differ only in letter case before they are normalised.
df['Main category'].value_counts()

Unknown                     8496
Military_Security_Target    7385
Civilian                    2879
Infrastructure              1447
Economic_infrastructure     1080
                            ... 
international_community       13
Political_Tribal              11
International_community        6
cultural_heritage              2
International_Community        1
Name: Main category, Length: 18, dtype: int64

### Clean up columns

#### Column names

In [10]:
# Clean up column names: trim surrounding whitespace, lower-case, and replace
# spaces and hyphens with underscores (e.g. 'Main category' -> 'main_category').
# regex=False makes both replacements literal regardless of the pandas version's
# default for `regex`.
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('-', '_', regex=False)
)

In [11]:
# Show the column names after the clean-up.
df.columns

Index(['incident_id', 'date', 'governorate', 'district', 'area', 'target',
       'main_category', 'sub_category', 'min_air_raids', 'max_air_raids',
       'civilian_casualties', 'fatalities', 'woman_fatalities',
       'child_fatalities', 'injured', 'woman_injured', 'child_injured',
       'confirmed_time', 'time_of_day'],
      dtype='object')

#### Column values

In [12]:
# date: move the column into the index, then parse the index as datetimes
df = df.set_index('date')
df.index = pd.to_datetime(df.index)
df.index

DatetimeIndex(['2015-03-26', '2015-03-26', '2015-03-26', '2015-03-26',
               '2015-03-26', '2015-03-26', '2015-03-26', '2015-03-26',
               '2015-03-26', '2015-03-26',
               ...
               '2020-12-29', '2020-12-30', '2020-12-30', '2020-12-30',
               '2020-12-31', '2020-12-31', '2020-12-31', '2020-12-31',
               '2020-12-31', '2020-12-31'],
              dtype='datetime64[ns]', name='date', length=22485, freq=None)

In [13]:
# incident id

In [14]:
# governorate


In [15]:
# district: lower-case and trim surrounding whitespace.
# Values are stringified first (as before), but rows whose district is missing
# are kept as NaN rather than becoming the literal string 'nan', so
# isnull()/dropna() still recognise them as missing.
df['district'] = (
    df['district']
    .astype(str)
    .str.lower()
    .str.strip()
    .where(df['district'].notna())
)

In [16]:
# area


In [17]:
# target


In [18]:
# main_category: lower-case and trim surrounding whitespace so that labels
# differing only in case collapse into one.
# The vectorised .str accessor replaces the Python loop; a missing value
# propagates as NaN instead of raising AttributeError.
df['main_category'] = df['main_category'].str.lower().str.strip()

In [19]:
# sub_category: lower-case and trim surrounding whitespace.
# The vectorised .str accessor replaces the Python loop; a missing value
# propagates as NaN instead of raising AttributeError.
df['sub_category'] = df['sub_category'].str.lower().str.strip()

In [20]:
# min_air_raids


In [21]:
# max_air_raids


In [22]:
# Clean up max_air_raids column: convert the raw text cells to numbers.
def _parse_max_air_raids(value):
    """Convert one raw ``max_air_raids`` cell to a number.

    Rules (same mapping as before for every value the old loop accepted):
      * non-string cells (NaN for missing, or already-numeric values) are
        returned unchanged;
      * ``'2,2'`` -> ``2``  (special-cased malformed entry);
      * ``'-'``   -> ``0``  (presumably "none recorded" -- confirm with the data source);
      * any other text has leading/trailing ``'+'`` removed and is parsed as an
        int, e.g. ``'3+'`` -> ``3``.

    Surrounding whitespace in text cells is ignored.

    Raises:
        ValueError: if the remaining text is not a valid integer.
    """
    if not isinstance(value, str):
        # NaN (float) and numeric cells pass through untouched; this also makes
        # the cell safe to re-run once the column is already numeric.
        return value
    text = value.strip()
    if text == '2,2':
        return 2
    if text == '-':
        return 0
    return int(text.strip('+'))


df['max_air_raids'] = [_parse_max_air_raids(value) for value in df['max_air_raids']]

In [23]:
# civilian_casualties


In [24]:
# fatalities


In [25]:
# woman_fatalities


In [26]:
# child_fatalities



In [27]:
# injured



In [28]:
# woman_injured



In [29]:
# child_injured



In [30]:
# confirmed_time: fold the stray labels 'Unkn', 'unkn' and '41' into 'Unknown'
df['confirmed_time'] = df['confirmed_time'].replace(['Unkn', 'unkn', '41'], 'Unknown')

In [31]:
# time_of_day: lower-case, trim surrounding whitespace, then merge equivalent
# labels into three buckets (morning / afternoon / night).
# Trimming covers the ' midday' variant and any other padded label, in line
# with how the other text columns are cleaned.
df['time_of_day'] = (
    df['time_of_day']
    .str.lower()
    .str.strip()
    .replace({
        'midday': 'afternoon',
        'evening': 'night',
        'early-morning': 'morning',
        'early morning': 'morning',
    })
)

In [32]:
# Frequency of each distinct 'target' value.
df['target'].value_counts()

Unknown                  8364
Pro-Houthi forces        2123
Residential Area          590
Pro-Houthi sites          403
Pro-Houthi positions      396
                         ... 
Civilian's Houses           1
Zatr market                 1
Al-Jawb military camp       1
Al-Majza'ah bridge          1
Civilian's bus              1
Name: target, Length: 2884, dtype: int64

In [47]:
# Display the frame (the repr is truncated by the display.max_rows option).
# NOTE(review): the saved output still shows 'Early-Morning' in time_of_day,
# although the time_of_day cell lower-cases that column -- the output looks
# stale; re-run the notebook top to bottom to refresh it.
df

Unnamed: 0_level_0,incident_id,governorate,district,area,target,main_category,sub_category,min_air_raids,max_air_raids,civilian_casualties,fatalities,woman_fatalities,child_fatalities,injured,woman_injured,child_injured,confirmed_time,time_of_day
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2015-03-26,1,Capital,bani al-harith,Al-Rahabah,Al-Daylami Airbase,military_security_target,military site,1,2.0,0,0,0,0,0,0,0.0,2,Early-Morning
2015-03-26,2,Capital,bani al-harith,Airport,Sana'a International Airport,infrastructure,transport,1,2.0,0,0,0,0,0,0,0.0,2,Early-Morning
2015-03-26,3,Capital,bani al-harith,Al-Sonblah Neighbourhood,Residential Area,civilian,residential area,1,2.0,29,21,3,14,8,0,7.0,2,Early-Morning
2015-03-26,4,Capital,al-sab'ein,Al-Nahdain,Presidential Palace,military_security_target,military site,1,2.0,0,0,0,0,0,0,0.0,3,Early-Morning
2015-03-26,5,Capital,al-thawrah,Al-Nahdhah,Former 1st Armoured Division,military_security_target,military site,1,2.0,0,0,0,0,0,0,0.0,3,Early-Morning
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-12-31,22482,Sanaa,sanhan,Raymat Humayd Military Camp,Raymat Humayd Military Camp,military_security_target,military site,1,5.0,0,0,0,0,0,0,0.0,1,Early-Morning
2020-12-31,22483,Sanaa,sanhan,Raymat Humayd Military Camp,Raymat Humayd Military Camp,military_security_target,military site,1,2.0,0,0,0,0,0,0,0.0,2,Early-Morning
2020-12-31,22484,Sanaa,bani hushaysh,Wadi Rijam,Unknown,unknown,unknown,1,2.0,0,0,0,0,0,0,0.0,3,Early-Morning
2020-12-31,22485,Capital,bani al-harith,Ar Rahabah,Al-Daylami Air base,military_security_target,military site,1,1.0,0,0,0,0,0,0,0.0,1,Early-Morning


In [48]:
# Rows whose target is 'Unknown', restricted to the location/category columns.
df.loc[
    df['target'] == 'Unknown',
    ['area', 'target', 'district', 'main_category', 'sub_category'],
]

Unnamed: 0_level_0,area,target,district,main_category,sub_category
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-03-26,"Samir, Shammakh mountains",Unknown,washhah,unknown,unknown
2015-03-26,Al-Malaheet,Unknown,al-dhaher,unknown,unknown
2015-03-26,Dhahyan,Unknown,majz,unknown,unknown
2015-03-26,Al-Naqa'ah,Unknown,al-safra'a,unknown,unknown
2015-03-27,-,Unknown,shada'a,unknown,unknown
...,...,...,...,...,...
2020-12-30,Qaniyah,Unknown,radman,unknown,unknown
2020-12-30,,Unknown,mahalih,unknown,unknown
2020-12-30,,Unknown,al-dhaher,unknown,unknown
2020-12-31,Wadi Rijam,Unknown,bani hushaysh,unknown,unknown


In [None]:
df.set_index('Incident ID')

In [None]:
df.Target.value_counts()

In [None]:
df['Governorate'].value_counts()

In [None]:
df.columns

In [None]:
pd.set_option('display.max_rows', 500)

In [None]:
df['District'].value_counts()

In [None]:
df['District'].isnull().sum()

In [None]:
df.head()

In [None]:
pd.set_option('display.max_rows', 500)
df['Area'].value_counts()

In [None]:
df['Area'].isnull().sum()

In [None]:
df['Target'].isnull().sum()

In [None]:
df['Target'].value_counts()

In [None]:
df['Main category'].isnull().sum()

In [None]:
df['Area'].value_counts()

In [None]:
df.head()

In [None]:
df['Date'].isnull().sum()

In [None]:
df['Time of Day'].value_counts()

In [None]:
df.columns

In [None]:
df['Target']