# SVM Model

In [202]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

## 1. Reading Dataset & Exploring

In [205]:
# Load the threat-assessment table from the CSV in the working directory
series = pd.read_csv('extinction-threat-to-indigenous-land-species.csv')

# Overview: dtypes and non-null counts, a blank separator line,
# then the number of missing values per column
series.info()
print(f"\n{series.isna().sum()}")

# Keep only the first 2000 rows for the rest of the notebook
series_small = series.head(2000)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11861 entries, 0 to 11860
Data columns (total 35 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   report_name                 11861 non-null  object 
 1   common_name                 8734 non-null   object 
 2   m_ori_name                  700 non-null    object 
 3   species_name                11861 non-null  object 
 4   category                    11861 non-null  object 
 5   subcategory                 11861 non-null  object 
 6   status_change               9469 non-null   object 
 7   subcategory_change          9420 non-null   object 
 8   assessment_id               11861 non-null  int64  
 9   report_id                   11861 non-null  int64  
 10  species_id                  11861 non-null  int64  
 11  genus                       11861 non-null  object 
 12  family                      11861 non-null  object 
 13  order                       118

## 2. Preprocessing - Handling Missing Data

In [208]:
# Columns used as model features
selected_cols = [
    'order',
    'class',
    'domain',
    'native_resident_living',
    'conservation_dependent',
    'population_trend',
]

# Target variable: the threat category of each assessment
y = series_small['category']

# Feature matrix, with every missing value replaced by the literal
# placeholder 'Unknown' so it is treated as its own level when encoded
X = series_small[selected_cols].fillna('Unknown')

### Encoding 

In [227]:
# Encode every object-typed feature column as integer codes.
# NOTE(review): integer codes impose an arbitrary order on nominal
# categories, and the encoders are fitted on all rows before the
# train/test split — consider one-hot encoding inside a Pipeline.
for col in X.select_dtypes(include='object').columns:
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

# Label-encode the target. The fitted encoder is kept (instead of being
# discarded) so integer predictions can be mapped back to category names
# with `target_encoder.inverse_transform(...)` / `target_encoder.classes_`.
target_encoder = LabelEncoder()
y_encoded = target_encoder.fit_transform(y)

# New Modified Data

## 3. Splitting Data

In [None]:
# Hold out 20% of the rows for testing; stratify on the encoded target so
# both splits keep the same class proportions, and fix the seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

## 4. Scaling Features

In [None]:
# Standardise features for the SVM. The scaler is fitted on the training
# split only, and the same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 5. Tuning Parameters Using Grid Search

In [None]:
# Hyper-parameter search space, given as a list of sub-grids.
# `gamma` is a kernel coefficient for the 'rbf' and 'poly' kernels and has
# no effect on the linear kernel, so the linear kernel gets its own
# sub-grid without it. This avoids fitting duplicate linear candidates
# (15 candidates instead of 18, each cross-validated 5 times).
param_grid = [
    {
        'kernel': ['linear'],
        'C': [10, 20, 50],
    },
    {
        'kernel': ['rbf', 'poly'],
        'C': [10, 20, 50],
        'gamma': ['scale', 'auto'],
    },
]

# 5-fold cross-validated exhaustive search over the grid, using all
# available CPU cores (n_jobs=-1) and printing progress (verbose=1).
grid = GridSearchCV(SVC(), param_grid, cv=5, verbose=1, n_jobs=-1)
grid.fit(X_train_scaled, y_train)

# Note: if a linear model wins, the reported parameters have no 'gamma' key.
print(f"Best Parameters: {grid.best_params_}")

## 6. Model Prediction

In [297]:
# Predict on the held-out (scaled) test features with the fitted grid search
y_pred = grid.predict(X_test_scaled)

# Fraction of test rows whose predicted class matches the true class
final_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Final Accuracy: {final_accuracy:.4f}")

Final Accuracy: 0.6675
