<a href="https://colab.research.google.com/github/fidaasma/groundwater-management-analysis-ai-ml/blob/main/week1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# STEP 1: Upload your CSV file
from google.colab import files
import pandas as pd
import numpy as np

# Prompt for a file through the Colab upload widget. The result is a dict keyed
# by file name; only the first entry is used, any additional files are ignored.
uploaded = files.upload()
if not uploaded:
    # A cancelled upload yields an empty dict; fail with a clear message
    # instead of an IndexError on the key lookup below.
    raise ValueError("No file was uploaded. Re-run this cell and choose a CSV file.")
filename = next(iter(uploaded))
# The file is read back from disk by name rather than from the in-memory bytes.
df = pd.read_csv(filename)

print("🔹 Original Data Shape:", df.shape)
print(df.head())

# STEP 2: Handle missing values
# Numeric columns are filled with their mean; every other column (text,
# categorical, ...) is filled with its most frequent value.
for col in df.columns:
    if not df[col].isna().any():
        continue  # nothing to fill in this column

    if pd.api.types.is_numeric_dtype(df[col]):
        # For an entirely-missing column the mean is NaN, so this is a no-op.
        df[col] = df[col].fillna(df[col].mean())
    else:
        modes = df[col].mode()
        # mode() is empty when the column holds only NaN; leave such a column
        # untouched instead of failing on the first-element lookup.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])

print("\n✅ Missing values handled")

# STEP 3: Drop columns with no variance
# nunique() ignores NaN, so an entirely-missing column reports 0 distinct values
# and a constant column reports 1. Neither carries information, so both are
# removed (hence `<= 1` rather than `== 1`).
nunique = df.nunique()
drop_cols = nunique[nunique <= 1].index.tolist()
df = df.drop(columns=drop_cols)
print("\n🚫 Dropped columns with no variance:", drop_cols)

# STEP 4: Convert Month names to numbers and cyclic encoding
# Month is projected onto the unit circle (sin/cos of 2*pi*month/12) so that
# December and January end up adjacent instead of 11 units apart.
if "Month" in df.columns:
    month_map = {
        "Jan":1,"Feb":2,"Mar":3,"Apr":4,"May":5,"Jun":6,
        "Jul":7,"Aug":8,"Sep":9,"Oct":10,"Nov":11,"Dec":12
    }
    # Normalise labels so "january", " JAN " or "Sept" resolve like "Jan"/"Sep".
    month_labels = df["Month"].astype(str).str.strip().str[:3].str.title()
    month_num = month_labels.map(month_map)

    # Fall back to months that are already numeric (1-12).
    numeric_months = pd.to_numeric(df["Month"], errors="coerce")
    month_num = month_num.fillna(numeric_months.where(numeric_months.between(1, 12)))

    # Anything still unmapped would become NaN in both features; report it
    # rather than letting it slip silently into the saved dataset.
    unmapped = df.loc[month_num.isna(), "Month"].unique().tolist()
    if unmapped:
        print("\n⚠️ Unrecognised Month values left as NaN:", unmapped)

    df["Month_sin"] = np.sin(2 * np.pi * month_num / 12)
    df["Month_cos"] = np.cos(2 * np.pi * month_num / 12)
    df = df.drop(columns=["Month"])
    print("\n🔄 Added cyclic features for Month")

# STEP 5: One-Hot Encoding for categorical features
# Only the listed columns that actually exist in the frame are encoded; the
# first level of each is dropped so the dummies are not perfectly collinear.
cat_cols = ["Block","Soil_Type"]
present_cat_cols = [name for name in cat_cols if name in df.columns]
if present_cat_cols:
    df = pd.get_dummies(df, columns=present_cat_cols, drop_first=True)

print("\n🟢 After Encoding Shape:", df.shape)
print(df.head())

# STEP 6: Scale numerical features
from sklearn.preprocessing import StandardScaler

# Every numeric column is standardised; the scaler is fitted on the whole
# frame and kept in `scaler` so the same transform can be reused later.
num_cols = df.select_dtypes(include=[np.number]).columns
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[num_cols])
df[num_cols] = scaled_values

print("\n📊 Final Preprocessed Data (first 5 rows):", df.head(), sep="\n")

# STEP 7: Save the cleaned dataset
# Written to the working directory without the index column.
output_path = "cleaned_data.csv"
df.to_csv(output_path, index=False)
print(f"\n✅ Cleaned dataset saved as '{output_path}'")


Saving alappuzha_groundwater_raw_dataset.csv to alappuzha_groundwater_raw_dataset.csv
🔹 Original Data Shape: (864, 17)
   Year Month  Rainfall_mm  Max_Temp_C  Min_Temp_C         Block  \
0  2006   Jan         11.7        30.7        23.9   Ambalapuzha   
1  2006   Jan         11.7        30.7        23.9         Aryad   
2  2006   Jan         11.7        30.7        23.9  Bharanikkavu   
3  2006   Jan         11.7        30.7        23.9   Champakulam   
4  2006   Jan         11.7        30.7        23.9    Chengannur   

   DecadalAvg_Pre_m  DecadalAvg_Post_m          Soil_Type  pH  EC_µS_cm  \
0              2.20               1.10   Coastal Alluvium NaN       NaN   
1              1.62               0.88   Coastal Alluvium NaN       NaN   
2              7.91               6.27     Lateritic Soil NaN       NaN   
3              1.50               0.44  Riverine Alluvium NaN       NaN   
4              4.36               2.78     Lateritic Soil NaN       NaN   

   Ca_mgL  Mg_mgL  Cl

In [None]:
# Mount Google Drive
import shutil

from google.colab import drive
drive.mount('/content/drive')

# Copy your cleaned dataset to Drive
# shutil.copy raises if the source file is missing or the destination cannot be
# written, so the success message below is printed only after a real copy.
shutil.copy("cleaned_data.csv", "/content/drive/MyDrive/cleaned_data.csv")
print("✅ File uploaded to Google Drive: /MyDrive/cleaned_data.csv")



Mounted at /content/drive
✅ File uploaded to Google Drive: /MyDrive/cleaned_data.csv
