# 🧹 PART 1: Data Wrangling & Cleaning

In [1]:
# 🧾 1. Import libraries
import pandas as pd

# 📥 2. Load dataset
# The source location lives in one named constant so it is easy to find and change.
# NOTE(review): this is an absolute, machine-specific path — the notebook will only
# run on another machine after this value is adjusted.
RAW_CSV_PATH = "C:/Users/manav/Desktop/dataset.csv"
df = pd.read_csv(RAW_CSV_PATH)
print("Initial shape:", df.shape)

# Preview the data (first five rows, rendered as the cell's output)
df.head()

Initial shape: (90, 17)


Unnamed: 0,FlightNumber,Date,BoosterVersion,PayloadMass,Orbit,LaunchSite,Outcome,Flights,GridFins,Reused,Legs,LandingPad,Block,ReusedCount,Serial,Longitude,Latitude
0,1,6/4/2010,Falcon 9,6123.547647,LEO,CCSFS SLC 40,None None,1,False,False,False,,1,0,B0003,-80.577366,28.561857
1,2,5/22/2012,Falcon 9,525.0,LEO,CCSFS SLC 40,None None,1,False,False,False,,1,0,B0005,-80.577366,28.561857
2,3,3/1/2013,Falcon 9,677.0,ISS,CCSFS SLC 40,None None,1,False,False,False,,1,0,B0007,-80.577366,28.561857
3,4,9/29/2013,Falcon 9,500.0,PO,VAFB SLC 4E,False Ocean,1,False,False,False,,1,0,B1003,-120.610829,34.632093
4,5,12/3/2013,Falcon 9,3170.0,GTO,CCSFS SLC 40,None None,1,False,False,False,,1,0,B1004,-80.577366,28.561857


## 🔍 Step 3: Basic Data Cleaning

We'll now:
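- Remove duplicate rows
- Drop columns where more than 50% of the values are missing
- Fill the remaining missing values with a forward fill (each gap takes the last valid value above it)
- Clean column names (trim whitespace, lowercase, replace spaces with underscores)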


In [2]:
# 🧼 Remove duplicate rows (builds a new frame instead of mutating `df` in place)
deduped = df.drop_duplicates()

# ❌ Drop columns with over 50% missing values
# `thresh` is the minimum number of non-null values a column needs in order to be
# kept. Half the row count rounded up gives the same cut-off as `len * 0.5`
# (non-null counts are whole numbers) while passing the integer `dropna` documents.
min_non_null = (len(deduped) + 1) // 2
dense = deduped.dropna(thresh=min_non_null, axis=1)

# 🧯 Fill missing values using forward fill (customize if needed)
# `DataFrame.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
# Forward fill copies the last valid value downward, so NaNs at the very top of a
# column have nothing above them to copy and remain NaN.
df = dense.ffill()

# 🧽 Clean column names: lowercase, remove spaces
df.columns = [col.strip().lower().replace(" ", "_") for col in df.columns]

# 🧾 Final shape after cleaning
print("Cleaned shape:", df.shape)
df.head()


Cleaned shape: (90, 17)


  df.fillna(method='ffill', inplace=True)


Unnamed: 0,flightnumber,date,boosterversion,payloadmass,orbit,launchsite,outcome,flights,gridfins,reused,legs,landingpad,block,reusedcount,serial,longitude,latitude
0,1,6/4/2010,Falcon 9,6123.547647,LEO,CCSFS SLC 40,None None,1,False,False,False,,1,0,B0003,-80.577366,28.561857
1,2,5/22/2012,Falcon 9,525.0,LEO,CCSFS SLC 40,None None,1,False,False,False,,1,0,B0005,-80.577366,28.561857
2,3,3/1/2013,Falcon 9,677.0,ISS,CCSFS SLC 40,None None,1,False,False,False,,1,0,B0007,-80.577366,28.561857
3,4,9/29/2013,Falcon 9,500.0,PO,VAFB SLC 4E,False Ocean,1,False,False,False,,1,0,B1003,-120.610829,34.632093
4,5,12/3/2013,Falcon 9,3170.0,GTO,CCSFS SLC 40,None None,1,False,False,False,,1,0,B1004,-80.577366,28.561857


## 💾 Step 4: Save the Cleaned Dataset

We'll save the cleaned version as `cleaned_data.csv` to use in later steps.


In [5]:
# 💽 Save cleaned dataset
# A single constant feeds both the write and the confirmation message, so the
# message always reports the location that was actually written.
# NOTE(review): this is an absolute, machine-specific path — adjust it before
# running the notebook on another machine.
CLEANED_CSV_PATH = "C:/Users/manav/Desktop/cleaned_data.csv"

# index=False keeps the DataFrame's row index out of the saved file.
df.to_csv(CLEANED_CSV_PATH, index=False)
print(f"✅ Cleaned data saved to '{CLEANED_CSV_PATH}'")

✅ Cleaned data saved to 'C:/Users/manav/Desktop/cleaned_data.csv'
