In [106]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
dataset = 'Meteorite Landing.csv'

# Step 1: The Setup

In [107]:
df = pd.read_csv(dataset)

In [122]:
df.shape

(31928, 10)

Inspect the first few rows

In [108]:
df.head()

Unnamed: 0,Name,ID,NameType,Classification,Mass,Fall,Year,Coordinates
0,Aachen,1,Valid,L5,"Quantity[21, ""Grams""]",Fell,"DateObject[{1880}, ""Year"", ""Gregorian"", -5.]","GeoPosition[{50.775, 6.08333}]"
1,Aarhus,2,Valid,H6,"Quantity[720, ""Grams""]",Fell,"DateObject[{1951}, ""Year"", ""Gregorian"", -5.]","GeoPosition[{56.18333, 10.23333}]"
2,Abee,6,Valid,EH4,"Quantity[107000, ""Grams""]",Fell,"DateObject[{1952}, ""Year"", ""Gregorian"", -5.]","GeoPosition[{54.21667, -113.}]"
3,Acapulco,10,Valid,Acapulcoite,"Quantity[1914, ""Grams""]",Fell,"DateObject[{1976}, ""Year"", ""Gregorian"", -5.]","GeoPosition[{16.88333, -99.9}]"
4,Achiras,370,Valid,L6,"Quantity[780, ""Grams""]",Fell,"DateObject[{1902}, ""Year"", ""Gregorian"", -5.]","GeoPosition[{-33.16667, -64.95}]"


Get a summary of the data

In [109]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45716 entries, 0 to 45715
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Name            45716 non-null  object
 1   ID              45716 non-null  int64 
 2   NameType        45716 non-null  object
 3   Classification  45716 non-null  object
 4   Mass            45716 non-null  object
 5   Fall            45716 non-null  object
 6   Year            45716 non-null  object
 7   Coordinates     45716 non-null  object
dtypes: int64(1), object(7)
memory usage: 2.8+ MB


Handle Missing Values

In [110]:
df.isna().sum()

Name              0
ID                0
NameType          0
Classification    0
Mass              0
Fall              0
Year              0
Coordinates       0
dtype: int64

**Get latitude and longitude from coordinates**
and
**Get year and mass in standard format using regex**

In [111]:
df[['lat', 'long']] = df['Coordinates'].str.extract(r'\{([\-0-9\.]+),\s*([\-0-9\.]+)\}')
df['Year'] = df['Year'].str.extract(r'(\d{4})')
df['Mass'] = df['Mass'].str.extract(r'Quantity\[(\-?\d+\.?\d*),').astype(float)
df.head()

Unnamed: 0,Name,ID,NameType,Classification,Mass,Fall,Year,Coordinates,lat,long
0,Aachen,1,Valid,L5,21.0,Fell,1880,"GeoPosition[{50.775, 6.08333}]",50.775,6.08333
1,Aarhus,2,Valid,H6,720.0,Fell,1951,"GeoPosition[{56.18333, 10.23333}]",56.18333,10.23333
2,Abee,6,Valid,EH4,107000.0,Fell,1952,"GeoPosition[{54.21667, -113.}]",54.21667,-113.0
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976,"GeoPosition[{16.88333, -99.9}]",16.88333,-99.9
4,Achiras,370,Valid,L6,780.0,Fell,1902,"GeoPosition[{-33.16667, -64.95}]",-33.16667,-64.95


Generate descriptive statistics for year, lat, and long

In [112]:
df[['Year', 'lat', 'long']].describe()

Unnamed: 0,Year,lat,long
count,45422,32187.0,32187.0
unique,262,12738.0,14639.0
top,2003,-71.5,35.66667
freq,3323,4761.0,4985.0


Generate descriptive statistics for mass

In [113]:
df[['Mass']].describe()

Unnamed: 0,Mass
count,45585.0
mean,13278.08
std,574988.9
min,0.0
25%,7.2
50%,32.6
75%,202.6
max,60000000.0


# Step 2: The Cleanup Crew

Finding null values

In [117]:
df.isna()

Unnamed: 0,Name,ID,NameType,Classification,Mass,Fall,Year,Coordinates,lat,long
0,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
45711,False,False,False,False,False,False,False,False,False,False
45712,False,False,False,False,False,False,False,False,False,False
45713,False,False,False,False,False,False,False,False,False,False
45714,False,False,False,False,False,False,False,False,False,False


Finding total number of null values in each columns

In [114]:
df.isna().sum()

Name                  0
ID                    0
NameType              0
Classification        0
Mass                131
Fall                  0
Year                294
Coordinates           0
lat               13529
long              13529
dtype: int64

Dropping columns with null values

In [125]:
df.dropna(subset = ['Mass', 'Year', 'lat', 'long'], inplace=True)
df.isna().sum()

Name              0
ID                0
NameType          0
Classification    0
Mass              0
Fall              0
Year              0
Coordinates       0
lat               0
long              0
dtype: int64

In [126]:
df.shape

(31928, 10)

Filter Out Erroneous Data
