In [1]:
import pandas as pd
import datetime

In [2]:
# Load the raw tornado records from the project's Resources folder
tornado_csv_path = 'Resources/us_tornado_dataset_1950_2021.csv'
df = pd.read_csv(tornado_csv_path)

In [3]:
# Print the column names, non-null counts and dtypes of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67558 entries, 0 to 67557
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   yr      67558 non-null  int64  
 1   mo      67558 non-null  int64  
 2   dy      67558 non-null  int64  
 3   date    67558 non-null  object 
 4   st      67558 non-null  object 
 5   mag     67558 non-null  int64  
 6   inj     67558 non-null  int64  
 7   fat     67558 non-null  int64  
 8   slat    67558 non-null  float64
 9   slon    67558 non-null  float64
 10  elat    67558 non-null  float64
 11  elon    67558 non-null  float64
 12  len     67558 non-null  float64
 13  wid     67558 non-null  int64  
dtypes: float64(5), int64(7), object(2)
memory usage: 7.2+ MB


In [4]:
# Parse the "mm/dd/yy" strings into timestamps. The two-digit "%y" directive
# follows the POSIX pivot (69-99 -> 1969-1999, 00-68 -> 2000-2068), so any
# record from before 1969 initially lands a century too late.
parsed_dates = pd.to_datetime(df["date"], format="%m/%d/%y")

# Flag the dates that were placed in 2050-2099 and move them back 100 years.
# This is done on the whole column at once instead of row by row.
parsed_years = parsed_dates.dt.year
wrong_century = (parsed_years >= 2050) & (parsed_years < 2100)
df["date"] = parsed_dates.where(
    ~wrong_century, parsed_dates - pd.DateOffset(years=100)
)

# Confirm that "date" is now a datetime64 column
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67558 entries, 0 to 67557
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   yr      67558 non-null  int64         
 1   mo      67558 non-null  int64         
 2   dy      67558 non-null  int64         
 3   date    67558 non-null  datetime64[ns]
 4   st      67558 non-null  object        
 5   mag     67558 non-null  int64         
 6   inj     67558 non-null  int64         
 7   fat     67558 non-null  int64         
 8   slat    67558 non-null  float64       
 9   slon    67558 non-null  float64       
 10  elat    67558 non-null  float64       
 11  elon    67558 non-null  float64       
 12  len     67558 non-null  float64       
 13  wid     67558 non-null  int64         
dtypes: datetime64[ns](1), float64(5), int64(7), object(1)
memory usage: 7.2+ MB


In [5]:
# Preview the first rows of the frame
df.head()

Unnamed: 0,yr,mo,dy,date,st,mag,inj,fat,slat,slon,elat,elon,len,wid
0,1950,1,3,1950-01-03,IL,3,3,0,39.1,-89.3,39.12,-89.23,3.6,130
1,1950,1,3,1950-01-03,MO,3,3,0,38.77,-90.22,38.83,-90.03,9.5,150
2,1950,1,3,1950-01-03,OH,1,1,0,40.88,-84.58,0.0,0.0,0.1,10
3,1950,1,13,1950-01-13,AR,3,1,1,34.4,-94.37,0.0,0.0,0.6,17
4,1950,1,25,1950-01-25,IL,2,0,0,41.17,-87.33,0.0,0.0,0.1,100


In [6]:
# Report whether any column holds at least one missing value
columns_with_nulls = df.isna().any()
print(columns_with_nulls.any())

False


In [7]:
# Drop the columns that are not needed downstream: month, day and path length.
# They are addressed by name rather than by position so the cell keeps working
# if the column order changes, and errors="ignore" makes the cell safe to
# re-run after the columns are already gone.
unused_columns = ["mo", "dy", "len"]
df = df.drop(columns=unused_columns, errors="ignore")

# Display DataFrame
df.head()

Unnamed: 0,yr,date,st,mag,inj,fat,slat,slon,elat,elon,wid
0,1950,1950-01-03,IL,3,3,0,39.1,-89.3,39.12,-89.23,130
1,1950,1950-01-03,MO,3,3,0,38.77,-90.22,38.83,-90.03,150
2,1950,1950-01-03,OH,1,1,0,40.88,-84.58,0.0,0.0,10
3,1950,1950-01-13,AR,3,1,1,34.4,-94.37,0.0,0.0,17
4,1950,1950-01-25,IL,2,0,0,41.17,-87.33,0.0,0.0,100


In [8]:
# Map the terse source headers onto human-readable column labels
column_labels = {
    "yr": "Year",
    "date": "Date",
    "st": "State",
    "mag": "Rating",
    "inj": "Injuries",
    "fat": "Death",
    "slat": "Start Lat",
    "slon": "Start Lon",
    "elat": "End Lat",
    "elon": "End Lon",
    "wid": "W in Yds",
}
df = df.rename(columns=column_labels)

# Show the first rows under the new labels
df.head()

Unnamed: 0,Year,Date,State,Rating,Injuries,Death,Start Lat,Start Lon,End Lat,End Lon,W in Yds
0,1950,1950-01-03,IL,3,3,0,39.1,-89.3,39.12,-89.23,130
1,1950,1950-01-03,MO,3,3,0,38.77,-90.22,38.83,-90.03,150
2,1950,1950-01-03,OH,1,1,0,40.88,-84.58,0.0,0.0,10
3,1950,1950-01-13,AR,3,1,1,34.4,-94.37,0.0,0.0,17
4,1950,1950-01-25,IL,2,0,0,41.17,-87.33,0.0,0.0,100


In [9]:
# States to keep for the analysis.
# NOTE(review): "KY" was listed twice originally; the duplicate is removed
# here. Confirm whether a different state was intended in its place.
states_to_include = ["KS", "NE", "IA", "OK", "KY",
                     "AL", "MS", "TN", "LA"]

# Keep only the tornadoes recorded in the selected states
states_df = df[df["State"].isin(states_to_include)]

# Subset of the selected states whose starting latitude lies between 33.5 and
# 36.5 degrees (inclusive).
# NOTE(review): this step was previously labelled "Filter out Northern Texas",
# but "TX" is not in states_to_include and northern_df is not referenced by
# the cells visible after this one. Confirm the intent before relying on it.
northern_df = states_df[(states_df["Start Lat"] >= 33.5)
                        & (states_df["Start Lat"] <= 36.5)]

# Display the state-filtered frame (not the latitude subset)
states_df.head()

Unnamed: 0,Year,Date,State,Rating,Injuries,Death,Start Lat,Start Lon,End Lat,End Lon,W in Yds
13,1950,1950-02-12,LA,1,0,0,32.6,-91.33,0.0,0.0,33
14,1950,1950-02-12,LA,2,10,5,32.2,-93.58,32.97,-93.17,100
15,1950,1950-02-12,LA,3,25,5,31.63,-93.65,32.55,-93.03,100
16,1950,1950-02-12,LA,4,77,18,31.97,-94.0,33.0,-93.3,100
17,1950,1950-02-12,MS,1,0,0,34.6,-89.12,0.0,0.0,10


In [10]:
# Keep only the records from 1979 onwards
is_1979_or_later = states_df["Year"] >= 1979
year_df = states_df.loc[is_1979_or_later]

# Show the first rows of the year-filtered frame
year_df.head()

Unnamed: 0,Year,Date,State,Rating,Injuries,Death,Start Lat,Start Lon,End Lat,End Lon,W in Yds
19344,1979,1979-02-23,KY,1,0,0,36.62,-88.28,0.0,0.0,30
19348,1979,1979-03-02,AL,1,0,0,30.73,-88.05,0.0,0.0,17
19349,1979,1979-03-02,LA,1,0,0,30.4,-90.98,0.0,0.0,10
19360,1979,1979-03-18,IA,1,0,0,41.38,-95.03,0.0,0.0,30
19361,1979,1979-03-18,KS,1,0,0,39.93,-96.72,39.98,-96.65,150


In [11]:
# Print the row count, index range and dtypes of the year-filtered frame
year_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17056 entries, 19344 to 67555
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Year       17056 non-null  int64         
 1   Date       17056 non-null  datetime64[ns]
 2   State      17056 non-null  object        
 3   Rating     17056 non-null  int64         
 4   Injuries   17056 non-null  int64         
 5   Death      17056 non-null  int64         
 6   Start Lat  17056 non-null  float64       
 7   Start Lon  17056 non-null  float64       
 8   End Lat    17056 non-null  float64       
 9   End Lon    17056 non-null  float64       
 10  W in Yds   17056 non-null  int64         
dtypes: datetime64[ns](1), float64(4), int64(5), object(1)
memory usage: 1.6+ MB


In [12]:
# Keep only the rows where neither end coordinate is 0.00 (presumably a
# placeholder for an unrecorded end point; verify against the data source).
# Both masks are built from year_df so they share its index; mixing in a mask
# from the full df makes pandas reindex the boolean key and emit a warning.
has_end_lat = year_df["End Lat"] != 0.00
has_end_lon = year_df["End Lon"] != 0.00
filtered_df = year_df[has_end_lat & has_end_lon]

# Display DataFrame
filtered_df.head()

  filtered_df = year_df[(year_df["End Lat"] != 0.00) & (df["End Lon"] != 0.00)]


Unnamed: 0,Year,Date,State,Rating,Injuries,Death,Start Lat,Start Lon,End Lat,End Lon,W in Yds
19361,1979,1979-03-18,KS,1,0,0,39.93,-96.72,39.98,-96.65,150
19363,1979,1979-03-18,OK,2,0,0,36.17,-95.78,36.17,-95.75,33
19364,1979,1979-03-18,OK,3,0,0,36.9,-95.93,36.92,-95.8,30
19372,1979,1979-03-19,OK,1,0,0,34.02,-95.63,34.08,-95.48,30
19388,1979,1979-03-29,IA,2,0,0,41.43,-93.5,41.52,-93.35,60


In [13]:
# Persist the filtered tornado records (row index included) as a CSV file
output_csv_path = 'Tornado_data.csv'
filtered_df.to_csv(output_csv_path)

In [14]:
# Print the row count, index range and dtypes of the final filtered frame
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13330 entries, 19361 to 67555
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Year       13330 non-null  int64         
 1   Date       13330 non-null  datetime64[ns]
 2   State      13330 non-null  object        
 3   Rating     13330 non-null  int64         
 4   Injuries   13330 non-null  int64         
 5   Death      13330 non-null  int64         
 6   Start Lat  13330 non-null  float64       
 7   Start Lon  13330 non-null  float64       
 8   End Lat    13330 non-null  float64       
 9   End Lon    13330 non-null  float64       
 10  W in Yds   13330 non-null  int64         
dtypes: datetime64[ns](1), float64(4), int64(5), object(1)
memory usage: 1.2+ MB


In [15]:
# Pull the start/end coordinates out as a plain 2-D NumPy array, with the
# columns ordered: start latitude, end latitude, start longitude, end longitude
coordinate_columns = ["Start Lat", "End Lat", "Start Lon", "End Lon"]
lat_lon_df = filtered_df[coordinate_columns]
lat_lon_values = lat_lon_df.values

# Display the array
lat_lon_values

array([[ 39.93  ,  39.98  , -96.72  , -96.65  ],
       [ 36.17  ,  36.17  , -95.78  , -95.75  ],
       [ 36.9   ,  36.92  , -95.93  , -95.8   ],
       ...,
       [ 34.5167,  34.5115, -85.9429, -85.8375],
       [ 33.0444,  33.0492, -86.7754, -86.5879],
       [ 34.2875,  34.2998, -85.7878, -85.7805]])