## Data Cleaning 


### Imported Libraries

> NumPy : Library for Numeric Computations in Python  
> Pandas : Library for Data Acquisition and Preparation  
> Matplotlib : Low-level library for Data Visualization  
> Seaborn : Higher-level library for Data Visualization  

In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
sb.set()

### Import the Dataset

In [2]:
aviationdata = pd.read_csv('../Data/AviationDataUP.csv')
aviationdata.head()

Unnamed: 0,Event.Id,Investigation.Type,Accident.Number,Event.Date,Location,Country,Latitude,Longitude,Airport.Code,Airport.Name,...,Purpose.of.Flight,Air.Carrier,Total.Fatal.Injuries,Total.Serious.Injuries,Total.Minor.Injuries,Total.Uninjured,Weather.Condition,Broad.Phase.of.Flight,Report.Status,Publication.Date
0,20161117X64217,Accident,WPR17WA022,14/11/2016,"Winton, New Zealand",New Zealand,,,,,...,,,,,,1.0,,MANEUVERING,Foreign,
1,20161116X13203,Accident,CEN17WA040,14/11/2016,"Albrook, Panama",Panama,,,MPMG,Marcos A. Gelabert Intl.,...,,,,,,2.0,VMC,LANDING,Foreign,
2,20161116X62135,Accident,CEN17LA041,13/11/2016,"Port Allen, LA",United States,30.5325,-91.315,LA46,OMNI,...,Personal,,,,,1.0,VMC,TAKEOFF,Preliminary,22/11/2016
3,20161114X41943,Accident,WPR17LA020,12/11/2016,"Taylor, AZ",United States,34.454722,-110.0575,,,...,Instructional,,,,,2.0,VMC,MANEUVERING,Preliminary,22/11/2016
4,20161116X04207,Accident,CEN17LA039,12/11/2016,"Pleasanton, TX",United States,,,,,...,Personal,,,1.0,,1.0,VMC,,Preliminary,16/11/2016


### Rename columns, removing '.'

In [3]:
aviationdata.columns = ['EventID', 'InvestigationType', 'AccidentNumber', 'EventDate', 'Location', 'Country', 'Latitude',
                       'Longitude', 'AirportCode', 'AirportName', 'InjurySeverity', 'AircraftDamage', 'AircraftCategory',
                       'RegistrationNumber', 'Make', 'Model', 'AmateurBuilt', 'NumberOfEngines', 'EngineType', 'FARDescription'
                       , 'Schedule', 'PurposeOfFlight', 'AirCarrier', 'TotalFatalInjuries', 'TotalSeriousInjuries',
                       'TotalMinorInjuries', 'TotalUninjured', 'WeatherCondition', 'BroadPhaseOfFlight', 'ReportStatus',
                       'PublicationDate']
aviationdata.head()

Unnamed: 0,EventID,InvestigationType,AccidentNumber,EventDate,Location,Country,Latitude,Longitude,AirportCode,AirportName,...,PurposeOfFlight,AirCarrier,TotalFatalInjuries,TotalSeriousInjuries,TotalMinorInjuries,TotalUninjured,WeatherCondition,BroadPhaseOfFlight,ReportStatus,PublicationDate
0,20161117X64217,Accident,WPR17WA022,14/11/2016,"Winton, New Zealand",New Zealand,,,,,...,,,,,,1.0,,MANEUVERING,Foreign,
1,20161116X13203,Accident,CEN17WA040,14/11/2016,"Albrook, Panama",Panama,,,MPMG,Marcos A. Gelabert Intl.,...,,,,,,2.0,VMC,LANDING,Foreign,
2,20161116X62135,Accident,CEN17LA041,13/11/2016,"Port Allen, LA",United States,30.5325,-91.315,LA46,OMNI,...,Personal,,,,,1.0,VMC,TAKEOFF,Preliminary,22/11/2016
3,20161114X41943,Accident,WPR17LA020,12/11/2016,"Taylor, AZ",United States,34.454722,-110.0575,,,...,Instructional,,,,,2.0,VMC,MANEUVERING,Preliminary,22/11/2016
4,20161116X04207,Accident,CEN17LA039,12/11/2016,"Pleasanton, TX",United States,,,,,...,Personal,,,1.0,,1.0,VMC,,Preliminary,16/11/2016


In [4]:
print(aviationdata.dtypes)

EventID                  object
InvestigationType        object
AccidentNumber           object
EventDate                object
Location                 object
Country                  object
Latitude                float64
Longitude               float64
AirportCode              object
AirportName              object
InjurySeverity           object
AircraftDamage           object
AircraftCategory         object
RegistrationNumber       object
Make                     object
Model                    object
AmateurBuilt             object
NumberOfEngines         float64
EngineType               object
FARDescription           object
Schedule                 object
PurposeOfFlight          object
AirCarrier               object
TotalFatalInjuries      float64
TotalSeriousInjuries    float64
TotalMinorInjuries      float64
TotalUninjured          float64
WeatherCondition         object
BroadPhaseOfFlight       object
ReportStatus             object
PublicationDate          object
dtype: o

In [5]:
# Check shape of data
aviationdata.shape

(79141, 31)

### Filling in null values
> Only for important columns

#### Fill in null values in WeatherCondition with 'UNK',  BroadPhaseOfFlight with 'UNKNOWN', AircraftDamage with 'Unknown', PurposeOfFlight with 'Unknown', EngineType with 'Unknown'

In [6]:
aviationdata["WeatherCondition"].fillna("UNK", inplace = True)
aviationdata["BroadPhaseOfFlight"].fillna("UNKNOWN", inplace = True) 
aviationdata["AircraftDamage"].fillna("Unknown", inplace = True) 
aviationdata["PurposeOfFlight"].fillna("Unknown", inplace = True) 
aviationdata["EngineType"].fillna("Unknown", inplace = True) 

#### Fill in null values in TotalFatalInjuries, TotalSeriousInjuries, TotalMinorInjuries, TotalUninjured with 0

In [7]:
aviationdata["TotalFatalInjuries"].fillna(value = 0, inplace = True) 
aviationdata["TotalSeriousInjuries"].fillna(value = 0, inplace = True) 
aviationdata["TotalMinorInjuries"].fillna(value = 0, inplace = True) 
aviationdata["TotalUninjured"].fillna(value = 0, inplace = True) 

### Create new columns

#### TotalInjuries = Fatal+Serious+Minor

In [8]:
aviationdata["TotalInjuries"] = aviationdata["TotalFatalInjuries"] + aviationdata["TotalSeriousInjuries"] 
+ aviationdata["TotalMinorInjuries"]

0        0.0
1        0.0
2        0.0
3        0.0
4        0.0
        ... 
79136    0.0
79137    0.0
79138    0.0
79139    0.0
79140    0.0
Name: TotalMinorInjuries, Length: 79141, dtype: float64

#### Extract month out of EventDate

In [10]:
aviationdata["EventDate"] = pd.to_datetime(df['Date'], errors='coerce')
aviationdata["Month"] = aviationdata["EventDate"].dt.month

AttributeError: Can only use .dt accessor with datetimelike values

In [None]:
aviationdata.describe()

### Check for null values

In [None]:
print(aviationdata.isnull().any(), "\n")

### Extracting categorical data
Converting object columns to category.

In [None]:
for col in aviationdata:
    if aviationdata[col].dtype == 'object':
        aviationdata[col] = aviationdata[col].astype('category')

In [None]:
# Checking types
print(aviationdata.dtypes)

### Creating new dataset with needed columns

In [None]:
aviation = pd.DataFrame(aviationdata[["WeatherCondition", "BroadPhaseOfFlight", "AircraftDamage", "PurposeOfFlight",
                                     "EngineType", "TotalFatalInjuries", "TotalSeriousInjuries", "TotalMinorInjuries",
                                     "TotalUninjured", "TotalInjuries", "EventDate", "Month"]])

### Final Dataset

In [None]:
aviation

### Export dataset as .csv file

In [None]:
aviation.to_csv("../Data/AviationFinal.csv", index = 0)