In [1]:
# 1. Import libraries
from pathlib import Path

import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# 2. Load the dataset
# Read wildlife_mock.csv (relative path) into the working DataFrame `df`
df = pd.read_csv(filepath_or_buffer="../data/wildlife_mock.csv")
print("Dataset loaded successfully!")
# Preview the first five rows as the cell's displayed output
df.head(n=5)

Dataset loaded successfully!


Unnamed: 0,State,Month,Year,Incident,Animal
0,Karnataka,10,2020,Trap/Accident,Elephant
1,Maharashtra,4,2020,Conflict with humans,Bear
2,Rajasthan,8,2019,Trap/Accident,Snake
3,Uttar Pradesh,8,2018,Trap/Accident,Rhino
4,Uttar Pradesh,11,2022,Illegal trade,Monkey


In [3]:
# 3. Encode categorical features for ML/clustering
# One encoder per column, kept under its own name so the fitted mapping
# stays available after this cell has run.
le_state, le_incident, le_animal = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Fit each encoder on its source column and store the integer codes
# in a sibling "<column>_encoded" column.
for source_col, encoder in (
    ('State', le_state),
    ('Incident', le_incident),
    ('Animal', le_animal),
):
    df[f'{source_col}_encoded'] = encoder.fit_transform(df[source_col])

print("\nEncoded columns added: State_encoded, Incident_encoded, Animal_encoded")


Encoded columns added: State_encoded, Incident_encoded, Animal_encoded


In [4]:
# 4. Optional: Create time-based features
# Calendar quarter: months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4
df['Quarter'] = 1 + (df['Month'] - 1) // 3

# Example: Season (Northern India)
def get_season(month):
    """Return the season label for a month number.

    Months 12, 1, 2 map to 'Winter'; 3, 4, 5 to 'Spring'; 6, 7, 8 to
    'Monsoon'. Every other value falls through to 'Autumn' -- that covers
    9, 10, 11 and also anything that is not a listed month number.
    """
    season_months = (
        ('Winter', (12, 1, 2)),
        ('Spring', (3, 4, 5)),
        ('Monsoon', (6, 7, 8)),
    )
    for label, months in season_months:
        if month in months:
            return label
    # Catch-all branch: no explicit check for 9-11, so unlisted values land here too
    return 'Autumn'

# Label each row with its season, then integer-encode the labels
df['Season'] = df['Month'].map(get_season)
df['Season_encoded'] = LabelEncoder().fit_transform(df['Season'])

In [5]:
# 5. Scaling numeric features for clustering
scaler = StandardScaler()
# Fit the scaler on Year and Month together, then store the scaled values
# as new columns next to the originals.
year_month_scaled = scaler.fit_transform(df[['Year', 'Month']])
df[['Year_scaled', 'Month_scaled']] = year_month_scaled

In [6]:
# 6. Save processed dataset
# DataFrame.to_csv does not create missing parent folders and raises OSError
# if data/processed/ is absent, so create it first (no-op when it exists).
processed_path = Path("../data/processed/wildlife_processed.csv")
processed_path.parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the row index out of the written file
df.to_csv(processed_path, index=False)
print("Processed dataset saved as 'wildlife_processed.csv' in data/processed/")

Processed dataset saved as 'wildlife_processed.csv' in data/processed/
