## Project 5 - Data Cleaning

### Import libraries

In [53]:
import pandas as pd
# Keep printed frames/series short: truncate the display after 10 rows.
pd.options.display.max_rows = 10

### Load data

In [54]:
# Read the raw incident table; the file is decoded as Windows-1252 (cp1252).
df = pd.read_csv(filepath_or_buffer='yemen.csv', encoding='cp1252')

### Check for null values

In [55]:
# Number of missing cells in each column.
df.isna().sum()

Incident ID        0
Date               0
Governorate        0
District         238
Area            3299
               ...  
Time of Day        0
Unnamed: 19    22360
Unnamed: 20    22360
Unnamed: 21    22360
Unnamed: 22    22436
Length: 23, dtype: int64

**Drop Columns:** 
- Unnamed: 19 
- Unnamed: 20 
- Unnamed: 21 
- Unnamed: 22 

These columns consist almost entirely of missing values, and the information in their few non-missing cells is duplicated in other columns.

In [56]:
# Drop the four trailing "Unnamed" columns.
# Rebinding `df` (rather than inplace=True) and ignoring labels that are
# already gone keeps this cell safe to re-run without a KeyError.
unnamed_columns = ['Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22']
df = df.drop(columns=unnamed_columns, errors='ignore')

In [57]:
# Count distinct Area values; a missing value (NaN) counts as one value.
df['Area'].nunique(dropna=False)

5805

In [58]:
# Superseded: the same four 'Unnamed' columns are already dropped in the previous drop cell.
#df = df.drop(columns=['Unnamed: 19', 'Unnamed: 20',
#       'Unnamed: 21', 'Unnamed: 22'])

In [59]:
# df['Child injured'] = df['Child injured'].fillna(df['Child injured'].mean())
# df['Max Air Raids'] = df['Max Air Raids'].fillna(df['Max Air Raids'].mean())

In [60]:
# Frequency of each casualty count, most common first.
df['Civilian Casualties'].value_counts(sort=True, ascending=False)

0      20667
2        224
5        212
1        199
3        194
       ...  
77         1
101        1
85         1
100        1
95         1
Name: Civilian Casualties, Length: 89, dtype: int64

In [61]:
# Frequency of each raw main-category label (before any case normalisation).
df.loc[:, 'Main category'].value_counts()

Unknown                     8496
Military_Security_Target    7385
Civilian                    2879
Infrastructure              1447
Economic_infrastructure     1080
                            ... 
international_community       13
Political_Tribal              11
International_community        6
cultural_heritage              2
International_Community        1
Name: Main category, Length: 18, dtype: int64

### Clean up columns

#### Column names

In [62]:
# Normalise column names to snake_case: trim surrounding whitespace,
# lower-case, then turn both spaces and hyphens into underscores.
df.columns = [
    label.strip().lower().replace(' ', '_').replace('-', '_')
    for label in df.columns
]

In [63]:
# Display the current column labels of df.
df.columns

Index(['incident_id', 'date', 'governorate', 'district', 'area', 'target',
       'main_category', 'sub_category', 'min_air_raids', 'max_air_raids',
       'civilian_casualties', 'fatalities', 'woman_fatalities',
       'child_fatalities', 'injured', 'woman_injured', 'child_injured',
       'confirmed_time', 'time_of_day'],
      dtype='object')

#### Column values

In [64]:
# date: move the column into the index, then parse the index labels as datetimes
df = df.set_index('date')
parsed_dates = pd.to_datetime(df.index)
df.index = parsed_dates
df.index

DatetimeIndex(['2015-03-26', '2015-03-26', '2015-03-26', '2015-03-26',
               '2015-03-26', '2015-03-26', '2015-03-26', '2015-03-26',
               '2015-03-26', '2015-03-26',
               ...
               '2020-12-29', '2020-12-30', '2020-12-30', '2020-12-30',
               '2020-12-31', '2020-12-31', '2020-12-31', '2020-12-31',
               '2020-12-31', '2020-12-31'],
              dtype='datetime64[ns]', name='date', length=22485, freq=None)

In [None]:
# incident id

In [None]:
# governorate


In [None]:
# district


In [None]:
# area


In [None]:
# target


In [None]:
# main_category


In [None]:
# sub_category / main_category: lower-case and trim the labels so that
# spellings differing only by case or surrounding whitespace collapse together.
# The vectorised .str methods pass missing values (NaN) through unchanged,
# whereas calling .lower() on each value in a Python loop raises on NaN.
for column in ('sub_category', 'main_category'):
    df[column] = df[column].str.lower().str.strip()

In [None]:
# min_air_raids


In [None]:
# max_air_raids


In [32]:
# Clean up max_air_raids column:
def _parse_air_raids(value):
    """Convert one raw max_air_raids cell to a number.

    Rules (string cells are whitespace-trimmed first):
      * '2,2'      -> 2
      * '-'        -> 0
      * other text -> int, after removing leading/trailing '+' (e.g. '10+' -> 10)
      * non-string -> returned unchanged (NaN stays NaN, numbers stay numbers)

    Passing non-strings through makes the cell safe to re-run on an
    already-cleaned column.

    Raises:
        ValueError: if a text cell is not one of the forms above.
    """
    if not isinstance(value, str):
        return value
    text = value.strip()
    if text == '2,2':
        return 2
    if text == '-':
        return 0
    return int(text.strip('+'))


# to_numeric settles the column on a numeric dtype (float when NaN is present).
df['max_air_raids'] = pd.to_numeric(df['max_air_raids'].map(_parse_air_raids))

In [None]:
# civilian_casualties


In [None]:
# fatalities


In [None]:
# woman_fatalities


In [None]:
# child_fatalities



In [None]:
# injured



In [None]:
# woman_injured



In [None]:
# child_injured



In [None]:
# confirmed_time



In [None]:
# time_of_day 



In [34]:
# Frequency of each target description, most common first.
df.loc[:, 'target'].value_counts()

Unknown                      8364
Pro-Houthi forces            2123
Residential Area              590
Pro-Houthi sites              403
Pro-Houthi positions          396
                             ... 
Hammam Qama'ah                  1
Al-Fawaz school                 1
Hawari bridge                   1
Truck carrying weapons          1
Al-Sonainah old cementery       1
Name: target, Length: 2884, dtype: int64

In [None]:
# Use the snake_case label: 'Sub-category' no longer exists after the column rename.
df['sub_category'].value_counts()

In [None]:
# Rows whose target is recorded as 'Unknown' (column is 'target' after the rename).
df[df['target'] == 'Unknown']

In [None]:
# Preview df indexed by incident id; the result is only displayed, df is not modified.
# The column is 'incident_id' after the rename.
df.set_index('incident_id')

In [None]:
# 'Target' is no longer an attribute/column after the rename; use 'target'.
df['target'].value_counts()

In [None]:
# Incidents per governorate (column is 'governorate' after the rename).
df['governorate'].value_counts()

In [None]:
# Display the current column labels of df.
df.columns

In [None]:
# Allow up to 500 rows to be printed before pandas truncates the display.
pd.options.display.max_rows = 500

In [None]:
# Incidents per district (column is 'district' after the rename).
df['district'].value_counts()

In [None]:
# Number of missing district values (column is 'district' after the rename).
df['district'].isnull().sum()

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Raise the row-display cap so more of the area counts are shown.
pd.set_option('display.max_rows', 500)
# Column is 'area' after the rename.
df['area'].value_counts()

In [None]:
# Number of missing area values (column is 'area' after the rename).
df['area'].isnull().sum()

In [None]:
# Number of missing target values (column is 'target' after the rename).
df['target'].isnull().sum()

In [None]:
# Frequency of each target description (column is 'target' after the rename).
df['target'].value_counts()

In [None]:
# Number of missing main-category values (column is 'main_category' after the rename).
df['main_category'].isnull().sum()

In [None]:
# Incidents per area (column is 'area' after the rename).
df['area'].value_counts()

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# The date column was moved into the index earlier in the notebook,
# so missing dates are counted on df.index rather than on a 'Date' column.
df.index.isna().sum()

In [None]:
# Normalise time_of_day labels in one step (column is 'time_of_day' after the rename):
# lower-case, trim stray whitespace (covers values such as ' midday'),
# then fold synonyms into the coarser morning / afternoon / night buckets.
time_of_day_synonyms = {
    'midday': 'afternoon',
    'evening': 'night',
    'early-morning': 'morning',
    'early morning': 'morning',
}
df['time_of_day'] = (
    df['time_of_day']
    .str.lower()
    .str.strip()
    .replace(time_of_day_synonyms)
)

In [None]:
# Frequency of each time-of-day label (column is 'time_of_day' after the rename).
df['time_of_day'].value_counts()

In [None]:
# Display the current column labels of df.
df.columns

In [None]:
# Display the target column (named 'target' after the rename).
df['target']