In [37]:
import ast
import nltk
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
pd.options.mode.chained_assignment = None

In [38]:
url = 'https://en.wikipedia.org/wiki/List_of_fatal_accidents_and_incidents_involving_commercial_aircraft_in_the_United_States'

In [39]:
df = pd.read_html(url)

In [40]:
df = df[0]
df = pd.DataFrame(df)

In [41]:
df.head(10)

Unnamed: 0,Date,Fatalities,Injuries,Survivors,Flight(s) or incident,Location,State or territory,Aircraft,Summary
0,"January 1, 2024",1,0,0,2024 Salt Lake City passenger death incident,Salt Lake City,Utah,Airbus A220-100,A passenger entered the airport grounds unauth...
1,"June 23, 2023",1,0,0,2023 San Antonio ground crew suicide incident,San Antonio,Texas,Airbus A319,A ground crewman was ingested into an engine o...
2,"December 31, 2022",1,0,0,2022 Montgomery ground crew incident,Montgomery,Alabama,Embraer 175,An airline worker was pulled into the engine o...
3,"September 4, 2022",10,0,0,2022 Mutiny Bay DHC-3 Otter crash,Mutiny Bay near Whidbey Island,Washington,de Havilland Canada DHC-3 Otter,"Carrying tourists, the seaplane nosedived and ..."
4,"May 7, 2020",1,0,58,Southwest Airlines Flight 1392,Austin,Texas,Boeing 737-700,A man who had illegally entered the airfield w...
5,"January 26, 2020",9,0,0,2020 Calabasas helicopter crash,Calabasas,California,Sikorsky S-76B,The chartered helicopter flight crashed in hea...
6,"October 17, 2019",1,12,41,PenAir Flight 3296,Amaknak Island,Alaska,Saab 2000,The aircraft overshot the runway during landin...
7,"May 13, 2019",6,10,10,2019 Alaska mid-air collision,George Inlet,Alaska,"de Havilland Canada DHC-2 Beaver, de Havilland...",Two commercial floatplanes conducting sightsee...
8,"February 23, 2019",3,0,0,Atlas Air Flight 3591,"Trinity Bay, near Anahuac",Texas,Boeing 767-300ER,The cargo aircraft crashed during final approach.
9,"August 10, 2018",1,0,0,2018 Horizon Air Q400 incident,Ketron Island,Washington,Bombardier Dash 8 Q400,A Horizon Air employee stole an aircraft and c...


In [42]:
df.drop(axis=1, labels='Flight(s) or incident')

Unnamed: 0,Date,Fatalities,Injuries,Survivors,Location,State or territory,Aircraft,Summary
0,"January 1, 2024",1,0,0,Salt Lake City,Utah,Airbus A220-100,A passenger entered the airport grounds unauth...
1,"June 23, 2023",1,0,0,San Antonio,Texas,Airbus A319,A ground crewman was ingested into an engine o...
2,"December 31, 2022",1,0,0,Montgomery,Alabama,Embraer 175,An airline worker was pulled into the engine o...
3,"September 4, 2022",10,0,0,Mutiny Bay near Whidbey Island,Washington,de Havilland Canada DHC-3 Otter,"Carrying tourists, the seaplane nosedived and ..."
4,"May 7, 2020",1,0,58,Austin,Texas,Boeing 737-700,A man who had illegally entered the airfield w...
...,...,...,...,...,...,...,...,...
267,"May 6, 1935",5,8,8,near Atlanta,Missouri,Douglas DC-2,The aircraft crashed into terrain due to low v...
268,"February 23, 1934",8,0,0,"Wasatch Mountains, 35 miles east of Salt Lake ...",Utah,Boeing 247,The aircraft crashed into terrain in poor weat...
269,"October 10, 1933",7,0,0,near Chesterton,Indiana,Boeing 247D,The aircraft suffered a mid-air explosion due ...
270,"March 31, 1931",8,0,0,Bazaar Township,Kansas,Fokker F-10,The wooden wing separated from the body of the...


In [43]:
df['Summary']

0      A passenger entered the airport grounds unauth...
1      A ground crewman was ingested into an engine o...
2      An airline worker was pulled into the engine o...
3      Carrying tourists, the seaplane nosedived and ...
4      A man who had illegally entered the airfield w...
                             ...                        
267    The aircraft crashed into terrain due to low v...
268    The aircraft crashed into terrain in poor weat...
269    The aircraft suffered a mid-air explosion due ...
270    The wooden wing separated from the body of the...
271    The aircraft caught fire and crashed, killing ...
Name: Summary, Length: 272, dtype: object

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272 entries, 0 to 271
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Date                   272 non-null    object
 1   Fatalities             272 non-null    object
 2   Injuries               268 non-null    object
 3   Survivors              272 non-null    int64 
 4   Flight(s) or incident  272 non-null    object
 5   Location               272 non-null    object
 6   State or territory     272 non-null    object
 7   Aircraft               272 non-null    object
 8   Summary                272 non-null    object
dtypes: int64(1), object(8)
memory usage: 19.3+ KB


In [45]:
from datetime import datetime as dt

In [46]:
df.describe()

Unnamed: 0,Survivors
count,272.0
mean,19.503676
std,48.236502
min,0.0
25%,0.0
50%,0.0
75%,13.0
max,346.0


In [47]:
df.head()

Unnamed: 0,Date,Fatalities,Injuries,Survivors,Flight(s) or incident,Location,State or territory,Aircraft,Summary
0,"January 1, 2024",1,0,0,2024 Salt Lake City passenger death incident,Salt Lake City,Utah,Airbus A220-100,A passenger entered the airport grounds unauth...
1,"June 23, 2023",1,0,0,2023 San Antonio ground crew suicide incident,San Antonio,Texas,Airbus A319,A ground crewman was ingested into an engine o...
2,"December 31, 2022",1,0,0,2022 Montgomery ground crew incident,Montgomery,Alabama,Embraer 175,An airline worker was pulled into the engine o...
3,"September 4, 2022",10,0,0,2022 Mutiny Bay DHC-3 Otter crash,Mutiny Bay near Whidbey Island,Washington,de Havilland Canada DHC-3 Otter,"Carrying tourists, the seaplane nosedived and ..."
4,"May 7, 2020",1,0,58,Southwest Airlines Flight 1392,Austin,Texas,Boeing 737-700,A man who had illegally entered the airfield w...


In [48]:
for date in df['Date']:
    try:
        date = dt.strptime(date, '%B %d, %Y')
    except:
        date = dt.strptime(date, '%B %d , %Y')
        

In [49]:
df['Date'][23] = 'January 8, 2003'

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['Date'][23] = 'January 8, 2003'


In [50]:
df['Date'] = pd.to_datetime(df['Date'], format='%B %d, %Y')

In [51]:
df['Date']

0     2024-01-01
1     2023-06-23
2     2022-12-31
3     2022-09-04
4     2020-05-07
         ...    
267   1935-05-06
268   1934-02-23
269   1933-10-10
270   1931-03-31
271   1919-07-21
Name: Date, Length: 272, dtype: datetime64[ns]

In [52]:
for item in df['Fatalities']:
    if len(item) > 3:
        print(item)

c. 679 (2,763 total combined with American Airlines Flight 11)
c. 1,700 (2,763 total combined with United Airlines Flight 175)


In [53]:
#plt.plot(df['Date'], df['Fatalities'])
df.loc[df['Date'] == '2001-09-11']

Unnamed: 0,Date,Fatalities,Injuries,Survivors,Flight(s) or incident,Location,State or territory,Aircraft,Summary
25,2001-09-11,44,0,0,United Airlines Flight 93,Shanksville,Pennsylvania,Boeing 757-222,One of four aircraft involved in the September...
26,2001-09-11,189,106,0,American Airlines Flight 77,Arlington,Virginia,Boeing 757-223,One of four aircraft involved in the September...
27,2001-09-11,"c. 679 (2,763 total combined with American Air...","c. 6,000 – c. 25,000 (combined with American A...",0,United Airlines Flight 175,New York City,New York,Boeing 767-200,One of four aircraft involved in the September...
28,2001-09-11,"c. 1,700 (2,763 total combined with United Air...","c. 6,000 – c. 25,000 (combined with United Air...",0,American Airlines Flight 11,New York City,New York,Boeing 767-200ER,One of four aircraft involved in the September...


In [54]:
df['Fatalities'][27] = 679
df['Fatalities'][28] = 1700


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df['Fatalities'][27] = 679
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, beca

In [55]:
df.loc[27, 'Injuries'] = 0
df.loc[28, 'Injuries'] = 0

df.loc[df['Date'] == '2001-09-11']


Unnamed: 0,Date,Fatalities,Injuries,Survivors,Flight(s) or incident,Location,State or territory,Aircraft,Summary
25,2001-09-11,44,0,0,United Airlines Flight 93,Shanksville,Pennsylvania,Boeing 757-222,One of four aircraft involved in the September...
26,2001-09-11,189,106,0,American Airlines Flight 77,Arlington,Virginia,Boeing 757-223,One of four aircraft involved in the September...
27,2001-09-11,679,0,0,United Airlines Flight 175,New York City,New York,Boeing 767-200,One of four aircraft involved in the September...
28,2001-09-11,1700,0,0,American Airlines Flight 11,New York City,New York,Boeing 767-200ER,One of four aircraft involved in the September...


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272 entries, 0 to 271
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Date                   272 non-null    datetime64[ns]
 1   Fatalities             272 non-null    object        
 2   Injuries               268 non-null    object        
 3   Survivors              272 non-null    int64         
 4   Flight(s) or incident  272 non-null    object        
 5   Location               272 non-null    object        
 6   State or territory     272 non-null    object        
 7   Aircraft               272 non-null    object        
 8   Summary                272 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 19.3+ KB


In [57]:
df.loc[90, 'Injuries'] = 5


In [58]:
df['Injuries'] = df['Injuries'].fillna(0)

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272 entries, 0 to 271
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Date                   272 non-null    datetime64[ns]
 1   Fatalities             272 non-null    object        
 2   Injuries               272 non-null    object        
 3   Survivors              272 non-null    int64         
 4   Flight(s) or incident  272 non-null    object        
 5   Location               272 non-null    object        
 6   State or territory     272 non-null    object        
 7   Aircraft               272 non-null    object        
 8   Summary                272 non-null    object        
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 19.3+ KB


In [60]:
df['Fatalities'] = pd.to_numeric(df['Fatalities'])
df['Injuries'] = pd.to_numeric(df['Injuries'])

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 272 entries, 0 to 271
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Date                   272 non-null    datetime64[ns]
 1   Fatalities             272 non-null    int64         
 2   Injuries               272 non-null    int64         
 3   Survivors              272 non-null    int64         
 4   Flight(s) or incident  272 non-null    object        
 5   Location               272 non-null    object        
 6   State or territory     272 non-null    object        
 7   Aircraft               272 non-null    object        
 8   Summary                272 non-null    object        
dtypes: datetime64[ns](1), int64(3), object(5)
memory usage: 19.3+ KB


In [62]:
df.describe()

Unnamed: 0,Date,Fatalities,Injuries,Survivors
count,272,272.0,272.0,272.0
mean,1971-09-17 15:52:56.470588232,43.819853,8.628676,19.503676
min,1919-07-21 00:00:00,1.0,0.0,0.0
25%,1955-10-25 12:00:00,7.0,0.0,0.0
50%,1968-12-25 12:00:00,21.0,0.0,0.0
75%,1987-11-20 12:00:00,46.25,8.0,13.0
max,2024-01-01 00:00:00,1700.0,187.0,346.0
std,,116.605149,22.096359,48.236502


In [63]:
df

Unnamed: 0,Date,Fatalities,Injuries,Survivors,Flight(s) or incident,Location,State or territory,Aircraft,Summary
0,2024-01-01,1,0,0,2024 Salt Lake City passenger death incident,Salt Lake City,Utah,Airbus A220-100,A passenger entered the airport grounds unauth...
1,2023-06-23,1,0,0,2023 San Antonio ground crew suicide incident,San Antonio,Texas,Airbus A319,A ground crewman was ingested into an engine o...
2,2022-12-31,1,0,0,2022 Montgomery ground crew incident,Montgomery,Alabama,Embraer 175,An airline worker was pulled into the engine o...
3,2022-09-04,10,0,0,2022 Mutiny Bay DHC-3 Otter crash,Mutiny Bay near Whidbey Island,Washington,de Havilland Canada DHC-3 Otter,"Carrying tourists, the seaplane nosedived and ..."
4,2020-05-07,1,0,58,Southwest Airlines Flight 1392,Austin,Texas,Boeing 737-700,A man who had illegally entered the airfield w...
...,...,...,...,...,...,...,...,...,...
267,1935-05-06,5,8,8,TWA Flight 6,near Atlanta,Missouri,Douglas DC-2,The aircraft crashed into terrain due to low v...
268,1934-02-23,8,0,0,1934 United Airlines Boeing 247 crash,"Wasatch Mountains, 35 miles east of Salt Lake ...",Utah,Boeing 247,The aircraft crashed into terrain in poor weat...
269,1933-10-10,7,0,0,1933 United Airlines Boeing 247 mid-air explosion,near Chesterton,Indiana,Boeing 247D,The aircraft suffered a mid-air explosion due ...
270,1931-03-31,8,0,0,1931 Transcontinental & Western Air Fokker F-1...,Bazaar Township,Kansas,Fokker F-10,The wooden wing separated from the body of the...


In [64]:
df['State or territory'].value_counts()

State or territory
California              28
New York                28
Pennsylvania            14
Alaska                  12
Florida                 12
Texas                   11
Illinois                11
Virginia                 9
Michigan                 8
Kentucky                 8
Missouri                 8
Massachusetts            8
Utah                     7
Alabama                  7
Colorado                 7
Hawaii                   6
North Carolina           6
Indiana                  5
New Jersey               5
Puerto Rico              5
Nevada                   4
Washington               4
Maryland                 4
Oregon                   4
Tennessee                4
Ohio                     4
Georgia                  4
Wisconsin                3
Maine                    3
Arkansas                 3
Minnesota                3
Iowa                     2
West Virginia            2
New Mexico               2
American Samoa           2
Nebraska                 2
Wyoming  

In [65]:
df.head()

Unnamed: 0,Date,Fatalities,Injuries,Survivors,Flight(s) or incident,Location,State or territory,Aircraft,Summary
0,2024-01-01,1,0,0,2024 Salt Lake City passenger death incident,Salt Lake City,Utah,Airbus A220-100,A passenger entered the airport grounds unauth...
1,2023-06-23,1,0,0,2023 San Antonio ground crew suicide incident,San Antonio,Texas,Airbus A319,A ground crewman was ingested into an engine o...
2,2022-12-31,1,0,0,2022 Montgomery ground crew incident,Montgomery,Alabama,Embraer 175,An airline worker was pulled into the engine o...
3,2022-09-04,10,0,0,2022 Mutiny Bay DHC-3 Otter crash,Mutiny Bay near Whidbey Island,Washington,de Havilland Canada DHC-3 Otter,"Carrying tourists, the seaplane nosedived and ..."
4,2020-05-07,1,0,58,Southwest Airlines Flight 1392,Austin,Texas,Boeing 737-700,A man who had illegally entered the airfield w...


In [79]:
df.Summary.iloc[3]

'Carrying tourists, the seaplane nosedived and crashed for unknown and undetermined reasons.'

In [66]:
df.set_index(['Date'], inplace=True)

In [68]:
parser = PorterStemmer()

In [69]:
def stem(text):
    y = []
    for i in text.split():
        y.append(parser.stem(i))
    return ' '.join(y)

In [70]:
df_sum = df['Summary'].apply(lambda x: x.lower())

In [71]:
df_sum = df['Summary'].apply(stem)

In [72]:
df_sum

Date
2024-01-01    a passeng enter the airport ground unauthor fr...
2023-06-23    a ground crewman wa ingest into an engin of th...
2022-12-31    an airlin worker wa pull into the engin of the...
2022-09-04    carri tourists, the seaplan nosediv and crash ...
2020-05-07    a man who had illeg enter the airfield wa stru...
                                    ...                        
1935-05-06    the aircraft crash into terrain due to low vis...
1934-02-23     the aircraft crash into terrain in poor weather.
1933-10-10    the aircraft suffer a mid-air explos due to a ...
1931-03-31    the wooden wing separ from the bodi of the air...
1919-07-21    the aircraft caught fire and crashed, kill 3 a...
Name: Summary, Length: 272, dtype: object

In [73]:
cv = CountVectorizer(max_features = 5000, stop_words='english')

In [74]:
vectors = cv.fit_transform(df_sum).toarray()

In [75]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]], dtype=int64)

In [76]:
df_sum[0]

  df_sum[0]


'a passeng enter the airport ground unauthor from the termin and climb insid the engin of delta air line flight 2348, which wa be de-ic prior to departure.'