## Data Cleaning 


### Imported Libraries

> NumPy : Library for Numeric Computations in Python  
> Pandas : Library for Data Acquisition and Preparation  
> Matplotlib : Low-level library for Data Visualization  
> Seaborn : Higher-level library for Data Visualization  

In [None]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
# Apply seaborn's default theme so later plots share a consistent look.
sb.set()

### Import the Dataset

In [None]:
# Load the raw aviation dataset from a CSV file given by a relative path.
aviationdata = pd.read_csv('../Data/AviationDataUP.csv')
# Preview the first rows to confirm the file loaded as expected.
aviationdata.head()

### Rename columns, removing '.'

In [None]:
# New column labels, assigned by position: the order here must match the
# column order of the loaded CSV exactly (31 columns).
dotless_column_names = [
    'EventID',
    'InvestigationType',
    'AccidentNumber',
    'EventDate',
    'Location',
    'Country',
    'Latitude',
    'Longitude',
    'AirportCode',
    'AirportName',
    'InjurySeverity',
    'AircraftDamage',
    'AircraftCategory',
    'RegistrationNumber',
    'Make',
    'Model',
    'AmateurBuilt',
    'NumberOfEngines',
    'EngineType',
    'FARDescription',
    'Schedule',
    'PurposeOfFlight',
    'AirCarrier',
    'TotalFatalInjuries',
    'TotalSeriousInjuries',
    'TotalMinorInjuries',
    'TotalUninjured',
    'WeatherCondition',
    'BroadPhaseOfFlight',
    'ReportStatus',
    'PublicationDate',
]
aviationdata.columns = dotless_column_names
# Preview the first rows to verify the new labels.
aviationdata.head()

In [None]:
# Show the dtype pandas inferred for each (renamed) column.
print(aviationdata.dtypes)

In [None]:
# Check the shape of the data: (number of rows, number of columns).
aviationdata.shape

### Filling in null values
> Only for important columns

#### Fill in null values: WeatherCondition with 'UNK', BroadPhaseOfFlight with 'UNKNOWN', and AircraftDamage, PurposeOfFlight and EngineType with 'Unknown'

In [None]:
# Replace missing values in the categorical columns with that column's
# "unknown" label.
#
# The filled Series is assigned back to the DataFrame instead of calling
# fillna(..., inplace=True) on a column selection: the inplace form is a
# chained assignment that emits a FutureWarning on pandas 2.x and, under
# Copy-on-Write, modifies only a temporary object and leaves the DataFrame
# unchanged.
categorical_fill_values = {
    "WeatherCondition": "UNK",
    "BroadPhaseOfFlight": "UNKNOWN",
    "AircraftDamage": "Unknown",
    "PurposeOfFlight": "Unknown",
    "EngineType": "Unknown",
}
for column_name, placeholder in categorical_fill_values.items():
    aviationdata[column_name] = aviationdata[column_name].fillna(placeholder)

#### Fill in null values in TotalFatalInjuries, TotalSeriousInjuries, TotalMinorInjuries, TotalUninjured with 0

In [None]:
# Treat a missing injury/uninjured count as zero.
#
# The filled Series is assigned back to the DataFrame instead of calling
# fillna(..., inplace=True) on a column selection: the inplace form is a
# chained assignment that emits a FutureWarning on pandas 2.x and, under
# Copy-on-Write, leaves the DataFrame unchanged.
injury_count_columns = (
    "TotalFatalInjuries",
    "TotalSeriousInjuries",
    "TotalMinorInjuries",
    "TotalUninjured",
)
for column_name in injury_count_columns:
    aviationdata[column_name] = aviationdata[column_name].fillna(0)

### Create new columns

#### TotalInjuries = Fatal+Serious+Minor

In [None]:
# TotalInjuries = fatal + serious + minor injuries for each record.
# The parentheses are required: without them the statement ends after the
# first line and the minor-injury term on the following line becomes a
# separate, discarded expression, so it is never added to the total.
aviationdata["TotalInjuries"] = (
    aviationdata["TotalFatalInjuries"]
    + aviationdata["TotalSeriousInjuries"]
    + aviationdata["TotalMinorInjuries"]
)

#### Extract month out of EventDate

In [None]:
# Extract the calendar month (1-12) of each event.
# The CSV is loaded without date parsing, so EventDate is not a datetime
# column and the .dt accessor would raise AttributeError on it directly.
# Convert for the extraction only; the EventDate column itself is left as is.
# NOTE(review): assumes every EventDate value is in a format pd.to_datetime
# can parse - confirm against the data.
aviationdata["Month"] = pd.to_datetime(aviationdata["EventDate"]).dt.month

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); by default
# describe() reports on the numeric columns.
aviationdata.describe()

### Check for null values

In [None]:
# For each column, report whether it still contains at least one null value.
columns_with_nulls = aviationdata.isna().any()
print(columns_with_nulls, "\n")

### Extracting categorical data
Converting object columns to category.

In [None]:
# Collect the names of all object-dtype columns first, then convert each of
# them to pandas' category dtype.
object_column_names = [
    name for name in aviationdata.columns
    if aviationdata[name].dtype == 'object'
]
for name in object_column_names:
    aviationdata[name] = aviationdata[name].astype('category')

In [None]:
# Checking types
print(aviationdata.dtypes)

### Creating new dataset with needed columns

In [None]:
# Columns kept for the final dataset: the cleaned categorical fields, the
# injury counts (including the derived total), and the event date and month.
final_columns = [
    "WeatherCondition",
    "BroadPhaseOfFlight",
    "AircraftDamage",
    "PurposeOfFlight",
    "EngineType",
    "TotalFatalInjuries",
    "TotalSeriousInjuries",
    "TotalMinorInjuries",
    "TotalUninjured",
    "TotalInjuries",
    "EventDate",
    "Month",
]
aviation = pd.DataFrame(aviationdata[final_columns])

### Final Dataset

In [None]:
# Display the final DataFrame as the cell output.
aviation

### Export dataset as .csv file

In [None]:
# Write the final dataset to CSV without the row index.
# `index` is documented as a boolean, so pass False explicitly rather than
# relying on the integer 0 being falsy.
aviation.to_csv("../Data/AviationFinal.csv", index=False)