### Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

### Import dataset

In [None]:
# Load the meteorite-landings table from the CSV (path is relative to the
# notebook's working directory) and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='Meteorite_Landings.csv')
df.head(n=5)

### Details about the data

In [None]:
# info() prints the column dtypes and non-null counts; describe() is the last
# expression, so its summary statistics are rendered as the cell output.
df.info()
df.describe()

In [None]:
# Capture the raw shape now; `rows` / `cols` are reused later for the
# before-vs-after cleaning comparison.
rows, cols = df.shape

print("Column Names:", list(df.columns))
# Before cleaning
print(f"The DataFrame has {rows} rows and {cols} columns.")

In [None]:
from plotly import io

# Render plotly figures through an iframe so they show up in the notebook output.
io.renderers.default = 'iframe'

# Count records per class and name both columns explicitly, so the plot does not
# depend on how the installed pandas version labels the output of value_counts().
class_counts = (
    df['recclass']
    .value_counts()
    .head(40)
    .rename_axis('recclass')
    .reset_index(name='count')
)
px.bar(data_frame=class_counts, x='recclass', y='count',
       title='40 Most Frequent Meteorite Classes')

## Data Cleaning and Preprocessing

In [None]:
# Number of missing entries in each column
df.isna().sum()

In [None]:
# 'Unnamed: 10' appears to contain only missing values, so it is dropped.
# errors='ignore' keeps this cell re-runnable after the column is already gone.
df = df.drop(columns=['Unnamed: 10'], errors='ignore')

# Keep only rows that have every field the later analysis relies on.
# .copy() makes the cleaned frame independent, so later column assignments
# operate on their own data rather than on a derived slice.
df = df.dropna(subset=['mass (g)', 'year', 'reclat', 'reclong']).copy()

df

In [None]:
# Verify the cleaning. info() prints on its own; the missing-value counts are
# placed last because a notebook only displays the final expression of a cell.
df.info()
df.isnull().sum()

In [None]:
# Shape after cleaning, reported next to the raw shape stored in `rows` / `cols`
af_rows, af_cols = df.shape

print(
    "DataFrame Shape Summary",
    "-" * 30,
    f"Before Cleaning : {rows:,} rows × {cols:,} columns",
    f"After Cleaning  : {af_rows:,} rows × {af_cols:,} columns",
    sep="\n",
)


### Convert 'year' to datetime

In [None]:
# Show the dtype of 'year' before and after parsing it as a calendar year.
print(f"Before Conversion: {df['year'].dtype}")

# Parse with an explicit 4-digit-year format; errors='coerce' replaces any value
# that cannot be parsed with NaT instead of raising.
# NOTE(review): assumes the raw 'year' values match '%Y' exactly. Values in another
# layout, or years outside the range pandas timestamps can represent, would silently
# become NaT -- check df['year'].isna().sum() after this cell to confirm nothing is lost.
df['year'] = pd.to_datetime(df['year'], format='%Y', errors='coerce')
print(f"After Conversion : {df['year'].dtype}")

### Convert 'fall' column to binary encoding

In [None]:
# Normalise the text labels (trim whitespace, capitalise) so the mapping below
# matches them regardless of stray spaces or letter case.
trimmed_fall = df['fall'].str.strip()
df['fall'] = trimmed_fall.str.capitalize()
print(df['fall'].unique())

# Binary encoding: Fell -> 1, Found -> 0. Any label outside this mapping becomes NaN.
fall_codes = {'Fell': 1, 'Found': 0}
df['fall'] = df['fall'].map(fall_codes)

# Sanity check: show the encoded column and the number of values left unmapped (NaN)
print(df['fall'])
print(f"After Conversion : {df['fall'].isnull().sum()}")

In [None]:
# Display the cleaned data (the notebook renders a truncated preview of the frame)
df

## Exploratory Data Analysis (EDA)

In [None]:
# Frequency tables for 'nametype' and for 'fall'.
# 'fall' is already binary-encoded at this point: 1 = Fell, 0 = Found.
print(
    df['nametype'].value_counts(),
    "-" * 30,
    df['fall'].value_counts(),
    sep="\n",
)


### Univariate Analysis

#### Fell vs Found Comparison

In [None]:
# Count plot for fall status.
# 'fall' was encoded earlier as 1 = Fell / 0 = Found; map the codes back to their
# names for plotting only, so the axis shows readable categories instead of 0 and 1.
fall_names = df['fall'].map({0: 'Found', 1: 'Fell'})
ax = sns.countplot(x=fall_names, order=['Found', 'Fell'])
ax.set_xlabel('fall')
ax.set_title("Fell vs Found Meteorites")
plt.show()

#### Histogram for mass (g)

In [None]:
# Histogram of log(1 + mass); the log transform compresses the very wide mass range.
fig, ax = plt.subplots(figsize=(10, 6))
log_mass = np.log1p(df['mass (g)'])
sns.histplot(log_mass, bins=50, kde=True, color='blue', ax=ax)
ax.set_xlabel('Log Mass (g)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Meteorite Mass (Log Scale)')
plt.show()

In [None]:
# Interactive world map: one point per meteorite, coloured by the encoded fall status.
fig = px.scatter_geo(
    data_frame=df,
    lat='reclat',
    lon='reclong',
    color='fall',
    hover_name='name',
    hover_data=['mass (g)', 'recclass'],
    projection='natural earth',
    title='Meteorite Falls vs Found Map',
)

# Base-map styling (country borders, land and ocean fill) and figure size / margins
geo_style = dict(
    showcountries=True,
    showland=True,
    landcolor="lightgray",
    showocean=True,
    oceancolor="lightblue",
)
fig.update_geos(**geo_style)
fig.update_layout(height=600, margin=dict(r=0, t=30, l=0, b=0))

fig.show()

In [None]:
# Pairwise correlation between all numeric columns, shown as an annotated heatmap
# and then printed as a plain table.
numeric_df = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_df.corr()

ax = sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')
ax.set_title('Correlation Heatmap')
plt.show()

print(correlation_matrix)

In [None]:
# Make sure 'year' is datetime, then extract the calendar year as a number so it
# can be used both for filtering and as a continuous colour scale.
df['year'] = pd.to_datetime(df['year'], errors='coerce')
df['year_int'] = df['year'].dt.year

# Inclusive bounds, so both 1970 and 2024 are kept and the data matches the
# "1970 - 2024" range announced in the figure title.
df_recent = df[(df['year_int'] >= 1970) & (df['year_int'] <= 2024)]

# Map of recent meteorites: colour encodes the year, marker size encodes the mass.
fig = px.scatter_mapbox(
    data_frame=df_recent,
    lat='reclat',
    lon='reclong',
    color='year_int',
    size='mass (g)',
    size_max=15,
    hover_name='name',
    hover_data={
        'recclass': True,
        'mass (g)': True,
        'year_int': True
    },
    mapbox_style='carto-darkmatter',
    zoom=1,
    center={'lat': 0, 'lon': 0},
    title="Meteorite Falls (1970 - 2024) Visualized on Map",
    height=800,
    color_continuous_scale='Viridis'
)

fig.show()

In [None]:
# NOTE(review): looks like a leftover debugging cell -- it only prints a fixed string
# and nothing else in the visible notebook uses it; consider deleting it.
print("Hello")

In [None]:
# Number of records per calendar year. Every row of df is counted, i.e. both
# observed falls and later finds.
falls_over_time = df.groupby(df['year'].dt.year).size()

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(falls_over_time.index, falls_over_time.values, marker='o')
ax.set_title('Number of Meteorite Falls Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Meteorites')
plt.show()

In [None]:
# Mass distribution split by the encoded fall status (0 = Found, 1 = Fell)
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(x='fall', y='mass (g)', data=df, ax=ax)
ax.set_title('Mass of Meteorites by Fall Type')
ax.set_xlabel('Fall Type')
ax.set_ylabel('Mass in grams')
fig.tight_layout()
plt.show()

## Question 1: What is the distribution of meteorite classifications (recclass) and how does it differ between observed falls (Fell) and discovered finds (Found)? (descriptive)


### Preprocess for the visualizations

In [None]:
# Minimum total occurrences for a class to be kept in the comparison
min_count = 10

# Long table: one row per (class, fall code) pair with its record count
recclass_fall_counts = df.groupby(['recclass', 'fall']).size().reset_index(name='count')

# Wide table: one row per class with 'Found' / 'Fell' count columns.
# The fall codes are renamed through an explicit mapping (0 = Found, 1 = Fell) rather
# than by position, and reindex() guarantees both columns exist even if one code is
# absent from the data, so a column can never be mislabelled.
fall_labels = {0: 'Found', 1: 'Fell'}
recclass_pivot = (
    recclass_fall_counts
    .pivot(index='recclass', columns='fall', values='count')
    .rename(columns=fall_labels)
    .reindex(columns=['Found', 'Fell'])
    .fillna(0)
    .rename_axis(columns=None)
)
recclass_pivot['total'] = recclass_pivot[['Found', 'Fell']].sum(axis=1)

# Drop the rare classes (fewer than min_count records) and order by frequency
recclass_pivot = recclass_pivot[recclass_pivot['total'] >= min_count].sort_values('total', ascending=False)

# Normalised metrics: share of each class that was an observed fall vs a find
recclass_pivot['fell_ratio'] = recclass_pivot['Fell'] / recclass_pivot['total']
recclass_pivot['found_ratio'] = 1 - recclass_pivot['fell_ratio']

### Visualize the top classes

In [None]:
# Dual-axis figure: stacked Fell/Found counts on the left axis, with the share of
# observed falls overlaid as a line on the right axis.
fig, ax1 = plt.subplots(figsize=(14, 7))
ax2 = ax1.twinx()
top20 = recclass_pivot.head(20)

# Absolute counts (stacked bars). pandas labels each bar series with its column
# name, so no explicit label argument is needed for the legend.
top20[['Fell', 'Found']].plot.bar(stacked=True, ax=ax1, color=['#ff6b6b', '#4ecdc4'])
ax1.set_ylabel('Total Count', color='black')

# Fell ratio (line plot), drawn at the integer positions 0..n-1 where pandas
# places the bars, so every marker sits on top of its class.
line, = ax2.plot(range(len(top20)), top20['fell_ratio'].to_numpy(),
                 marker='o', color='#ff0000', label='Fell Ratio (Fell/Total)')
ax2.set_ylabel('Fell Ratio (Fell/Total)', color='#ff0000')
ax2.set_ylim(0, 1)

# One combined legend: the bar handles from the left axis plus the ratio line
bar_handles, bar_labels = ax1.get_legend_handles_labels()
handles = bar_handles + [line]
labels = bar_labels + [line.get_label()]
ax1.legend(handles, labels, loc='upper left', bbox_to_anchor=(0, 1.15), ncol=3, fontsize=10)

ax1.set_title('Top Meteorite Classes: Counts vs Fall Ratios', fontsize=16)
ax1.tick_params(axis='x', rotation=80)
plt.tight_layout()
plt.show()


### Showing the classes with bias

In [None]:
# Five classes with the highest share of observed falls, then the five with the
# highest share of finds.
most_fell = recclass_pivot.sort_values(by='fell_ratio', ascending=False).head(5)
print("\nClasses Most Biased Toward Falls:")
display(most_fell[['Fell', 'total', 'fell_ratio']])

most_found = recclass_pivot.sort_values(by='found_ratio', ascending=False).head(5)
print("\nClasses Most Biased Toward Finds:")
display(most_found[['Found', 'total', 'found_ratio']])

### Interactive treemap of the less common classes (those not covered by the chart above)

In [None]:
# Treemap for rare classes (complementary insight).
# `recclass_pivot` only retains classes with at least `min_count` records, so the
# rare classes are exactly those absent from its index. Filtering against the
# top-20 subset instead would also pull in frequent classes that merely rank below
# 20th, contradicting the "Total < min_count" title.
rare_classes = recclass_fall_counts[~recclass_fall_counts['recclass'].isin(recclass_pivot.index)]
px.treemap(rare_classes, path=['recclass'], values='count',
           title=f'Rare Meteorite Classifications (Total < {min_count})')

## Question 2: Investigate whether meteorites with a higher mass (grams) exhibit distinct classification patterns or geographic clustering compared to meteorites with a lower mass (Exploratory)

Explore relationships between:
- mass
- classification (recclass)
- geographic location (reclat, reclong)

In [None]:
# Per-class mass statistics, ordered from the heaviest average class downwards
class_stats = (
    df.groupby('recclass')['mass (g)']
    .agg(['count', 'mean', 'median', 'max'])
    .sort_values(by='mean', ascending=False)
)

# Bar chart: the ten classes with the highest average mass
ax = class_stats.head(10).plot(kind='bar', y='mean', title='Top 10 Meteorite Classes by Average Mass')
ax.set_ylabel('Average Mass (g)')
plt.tight_layout()
plt.show()

# Box plot: mass distribution for the ten most frequent classes
top_classes = df['recclass'].value_counts().head(10).index
df_top_classes = df[df['recclass'].isin(top_classes)]
sns.boxplot(x='recclass', y='mass (g)', data=df_top_classes)
plt.xticks(rotation=45)
plt.yscale('log')  # log axis because the mass distribution is expected to be long-tailed
plt.title('Mass Distribution by Recclass')
plt.tight_layout()
plt.show()

In [None]:
# NOTE(review): every line of this cell is commented out, so it does nothing when run.
# It sketches a static geopandas/matplotlib map of mass by location. Either restore it
# (it needs geopandas and downloads the Natural Earth shapefile from the URL below)
# or delete the cell.

# import geopandas as gpd
# import matplotlib.pyplot as plt

# # Filter for valid geolocations and mass
# df_geo = df[df['reclat'].notna() & df['reclong'].notna() & df['mass (g)'].notna()]

# # Scatter plot on map
# # world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# url = "https://naturalearth.s3.amazonaws.com/110m_cultural/ne_110m_admin_0_countries.zip"
# world = gpd.read_file(url)
# fig, ax = plt.subplots(figsize=(15, 10))
# world.plot(ax=ax, color='lightgray')

# # Normalize mass for better visualization
# sizes = (df_geo['mass (g)'] / df_geo['mass (g)'].max()) * 100

# plt.scatter(df_geo['reclong'], df_geo['reclat'], s=sizes, c=df_geo['mass (g)'], cmap='viridis', alpha=0.5)
# plt.colorbar(label='Mass (g)')
# plt.title('Meteorite Landings: Mass and Geographic Distribution')
# plt.xlabel('Longitude')
# plt.ylabel('Latitude')
# plt.tight_layout()
# plt.show()

In [None]:
# Mass thresholds in grams. The last bin is open-ended (infinity) instead of ending at
# the observed maximum: pd.cut requires strictly increasing edges, so using the data
# maximum would fail whenever it is not above 50000.
mass_bins = [0, 1000, 10000, 50000, float('inf')]
bin_labels = ['Small', 'Medium', 'Large', 'Very Large']

# New categorical column; include_lowest=True keeps a mass of exactly 0 in 'Small'
df['mass_category'] = pd.cut(df['mass (g)'], bins=mass_bins, labels=bin_labels, include_lowest=True)

# Map of all meteorites: colour encodes the mass category, marker size the raw mass
fig = px.scatter_mapbox(
    df,
    lat='reclat',
    lon='reclong',
    color='mass_category',
    size='mass (g)',
    size_max=20,
    mapbox_style='carto-positron',
    title='Meteorite Locations by Mass Category',
    zoom=1,
    height=700,
    hover_name='name',
    hover_data=['mass (g)', 'recclass'],
    category_orders={
        'mass_category': bin_labels
    }
)
fig.show()

In [None]:
# Restrict to the ten most frequent classes, then count records per class
# split by mass category.
top_classes = df['recclass'].value_counts().nlargest(10).index
df_top = df[df['recclass'].isin(top_classes)]

fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df_top, x='recclass', hue='mass_category', ax=ax)
ax.set_title('Meteorite Classification by Mass Category')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()


## Question 4: How accurately can classification models predict whether a meteorite fell or was found, using its mass, classification type, and geographic coordinates as input features?

### Predicting Meteorite Origin: Classification Models

#### Objective
**Question**: How accurately can classification models predict whether a meteorite **fell** or was **found**, using its:
- `mass (g)`
- `classification type` (`recclass`)
- `geographic coordinates` (`reclat`, `reclong`)

The goal is to apply and compare different classification algorithms to determine which model best predicts the fall status (`fall` column) of meteorites.

We will explore the following models:
1. **Logistic Regression**
2. **Random Forest Classifier**
3. **Support Vector Machine (SVM)**

Each model will be evaluated using:
- Confusion Matrix
- Classification Report (Precision, Recall, F1-score)
- Visualizations


### Machine Learning Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix


# Keep only the model inputs and the target, and drop rows with any missing value.
# .copy() gives an independent frame so the column assignment below is unambiguous.
df = df[['mass (g)', 'recclass', 'reclat', 'reclong', 'fall']].dropna().copy()

# Encode the class name as integer codes.
# NOTE(review): integer codes impose an arbitrary ordering on a nominal feature. That is
# harmless for tree models, but linear / kernel models may read it as a magnitude --
# consider one-hot encoding for those.
df['recclass'] = LabelEncoder().fit_transform(df['recclass'])

# Features and target (fall: 1 = Fell, 0 = Found)
X = df[['mass (g)', 'recclass', 'reclat', 'reclong']]
y = df['fall']

# 70/30 train-test split. stratify=y keeps the Fell/Found proportion the same in both
# parts, which matters because "Fell" is the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Feature scaling: fit on the training part only, then apply the same transform to the
# test part, so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


#### Logistic Regression

In [None]:
# Fit a logistic-regression classifier (default settings) on the scaled training data
lr_model = LogisticRegression().fit(X_train_scaled, y_train)

# Predictions for the held-out test set
lr_pred = lr_model.predict(X_test_scaled)

# Classification report in dict form, stored under `lr_report`
lr_report = classification_report(
    y_test, lr_pred, target_names=["Found", "Fell"], output_dict=True
)

# Confusion matrix wrapped in a labelled frame (rows = actual, columns = predicted)
lr_cm = confusion_matrix(y_test, lr_pred)
lr_cm_df = pd.DataFrame(
    data=lr_cm,
    index=['Actual: Found (0)', 'Actual: Fell (1)'],
    columns=['Predicted: Found (0)', 'Predicted: Fell (1)'],
)

# Text output: confusion matrix followed by the human-readable report
print("📘 Logistic Regression Confusion Matrix:", lr_cm_df, "\n📊 Classification Report:", sep="\n")
print(classification_report(y_test, lr_pred, target_names=["Found", "Fell"]))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest with a fixed seed on the scaled training data
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)

# Predictions for the held-out test set
rf_pred = rf_model.predict(X_test_scaled)

# Classification report in dict form, stored under `rf_report`
rf_report = classification_report(
    y_test, rf_pred, target_names=["Found", "Fell"], output_dict=True
)

# Confusion matrix wrapped in a labelled frame (rows = actual, columns = predicted)
rf_cm = confusion_matrix(y_test, rf_pred)
rf_cm_df = pd.DataFrame(
    data=rf_cm,
    index=['Actual: Found (0)', 'Actual: Fell (1)'],
    columns=['Predicted: Found (0)', 'Predicted: Fell (1)'],
)

# Text output: confusion matrix followed by the human-readable report
print("🌳 Random Forest Confusion Matrix:", rf_cm_df, "\n📊 Classification Report:", sep="\n")
print(classification_report(y_test, rf_pred, target_names=["Found", "Fell"]))


### SVM

In [None]:
# SVC is not imported anywhere else in the notebook, so import it here; without
# this line the cell fails with a NameError on a fresh kernel.
from sklearn.svm import SVC

# Train an RBF-kernel SVM on the scaled training data
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train_scaled, y_train)

# Predictions for the held-out test set
svm_pred = svm_model.predict(X_test_scaled)

# Classification report in dict form, stored under `svm_report`
svm_report = classification_report(y_test, svm_pred, target_names=["Found", "Fell"], output_dict=True)

# Confusion matrix wrapped in a labelled frame (rows = actual, columns = predicted)
svm_cm = confusion_matrix(y_test, svm_pred)
svm_cm_df = pd.DataFrame(svm_cm, index=['Actual: Found (0)', 'Actual: Fell (1)'],
                                     columns=['Predicted: Found (0)', 'Predicted: Fell (1)'])

# Print results
print("🔲 SVM Confusion Matrix:")
print(svm_cm_df)
print("\n📊 Classification Report:")
print(classification_report(y_test, svm_pred, target_names=["Found", "Fell"]))

### Visual Comparison of Confusion Matrices

Below are the confusion matrices for the three models: Logistic Regression, Random Forest, and SVM. These matrices visually represent how well each model performed in classifying the meteorites as "Found" or "Fell".

In [None]:
# Side-by-side confusion-matrix heatmaps, one panel per model.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (panel title, labelled confusion matrix, colour map) for each model.
# The logistic-regression matrix is `lr_cm_df`; no variable named `cm_df` exists.
panels = [
    ("Logistic Regression", lr_cm_df, 'Blues'),
    ("Random Forest", rf_cm_df, 'Greens'),
    ("SVM", svm_cm_df, 'Purples'),
]

for position, (ax, (panel_title, cm_frame, cmap)) in enumerate(zip(axes, panels)):
    sns.heatmap(cm_frame, annot=True, fmt='d', cmap=cmap, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel("Predicted")
    # Only the left-most panel carries the shared "Actual" axis label
    ax.set_ylabel("Actual" if position == 0 else "")

plt.tight_layout()
plt.show()

In [None]:
# Collect the per-model report dicts, then build one row per model with overall
# accuracy and the precision / recall / F1 of the "Fell" class, rounded to 3 decimals.
reports = {
    "Logistic Regression": lr_report,
    "Random Forest": rf_report,
    "SVM": svm_report,
}

comparison_df = pd.DataFrame(
    {
        "Accuracy": [rep['accuracy'] for rep in reports.values()],
        "Precision (Fell)": [rep['Fell']['precision'] for rep in reports.values()],
        "Recall (Fell)": [rep['Fell']['recall'] for rep in reports.values()],
        "F1-Score (Fell)": [rep['Fell']['f1-score'] for rep in reports.values()],
    },
    index=pd.Index(list(reports), name="Model"),
).round(3)

display(comparison_df)

### Interpretation:

- **Logistic Regression** shows high overall accuracy but fails to capture "Fell" cases — very low recall and F1-score.
- **SVM** improves on logistic regression but still underperforms in recall for the "Fell" class.
- **Random Forest** clearly outperforms the others in all metrics related to the minority class ("Fell").

---

### Final Verdict:
**Random Forest Classifier** is the best model for this classification task. It balances precision and recall effectively, especially for the underrepresented class, and delivers the most robust results.
