In [3]:
import pandas as pd
#loads the dataset
dataset = pd.read_csv('aviation-accident.csv')
print(dataset.head())


        date             type registration       operator fatalities  \
0  date unk.   Antonov An-12B       T-1206  Indonesian AF        NaN   
1  date unk.   Antonov An-12B       T-1204  Indonesian AF        NaN   
2  date unk.   Antonov An-12B       T-1201  Indonesian AF        NaN   
3  date unk.  Antonov An-12BK          NaN      Soviet AF        NaN   
4  date unk.  Antonov An-12BP   CCCP-11815      Soviet AF          0   

              location          country cat     year  
0                  NaN  Unknown country  U1  unknown  
1                  NaN  Unknown country  U1  unknown  
2                  NaN  Unknown country  U1  unknown  
3  Tiksi Airport (IKS)           Russia  A1  unknown  
4  Massawa Airport ...          Eritrea  A1  unknown  


In [4]:
# attribute selection

print(dataset.columns)

# numeric columns selection for analysis
numericCLs = dataset.select_dtypes(include='number')
print(numericCLs.head())


Index(['date', 'type', 'registration', 'operator', 'fatalities', 'location',
       'country', 'cat', 'year'],
      dtype='object')
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3, 4]


In [11]:
# attribute transformation: empty logs disposal
print("ATTRIBUTES WITH EMPTY LOGS:")
print(dataset.isnull().sum())

#deletes logs with any empty fields to avoid dataset corruption
df = dataset.dropna()

# final check
print("EMPTY LOGS:")
print(df.isnull().sum())
print("REMAINING LOGS:")
print(df.shape)


ATTRIBUTES WITH EMPTY LOGS:
date               0
type               0
registration    1552
operator           4
fatalities      3982
location         958
country            0
cat                0
year               0
dtype: int64
EMPTY LOGS:
date            0
type            0
registration    0
operator        0
fatalities      0
location        0
country         0
cat             0
year            0
dtype: int64
REMAINING LOGS:
(18543, 9)


In [15]:
# New attribute creation: accident severity

# sive realized the fatalities column contains some string values, I need to discard them
dataset['fatalities'] = pd.to_numeric(dataset['fatalities'], errors='coerce')

dataset['Severity'] = dataset['fatalities'].apply(
    lambda x: 'Severe' if x > 10 else 'Moderate' if x > 0 else 'None'
)
#visualize the columns to check if the new column has indeed been created
print(dataset.head(10))

        date             type registration  \
0  date unk.   Antonov An-12B       T-1206   
1  date unk.   Antonov An-12B       T-1204   
2  date unk.   Antonov An-12B       T-1201   
3  date unk.  Antonov An-12BK          NaN   
4  date unk.  Antonov An-12BP   CCCP-11815   
5  date unk.  Antonov An-12BP   CCCP-12172   
6  date unk.     Antonov An-2    CCCP-N574   
7  date unk.     Antonov An-2   CCCP-01216   
8  date unk.   Antonov An-24B     RA-47794   
9  date unk.    Antonov An-26       01 red   

                               operator  fatalities             location  \
0                         Indonesian AF         NaN                  NaN   
1                         Indonesian AF         NaN                  NaN   
2                         Indonesian AF         NaN                  NaN   
3                             Soviet AF         NaN  Tiksi Airport (IKS)   
4                             Soviet AF         0.0  Massawa Airport ...   
5                             Soviet 