In [11]:
import pandas as pd
import numpy as np

# Set the random seed for reproducibility
np.random.seed(42)

# Number of samples in the dataset
n_samples = 100000

# Generate the synthetic data
data = pd.DataFrame({
    'Altitude': np.random.uniform(1200, 3400, n_samples),  # Uniform distribution
    'Distance_from_Human_Paths': np.random.exponential(500, n_samples),  # Exponential distribution
    'Livestock_Density': np.random.gamma(2, 0.5, n_samples),  # Gamma distribution for skewed data
    'Vegetation_Diversity_Index': np.random.uniform(0, 1, n_samples),  # Uniform distribution
    'Water_Source_Availability': np.random.choice([0, 1], n_samples, p=[0.3, 0.7]),  # Binary, with more 1s
    'Human_Disturbance_Index': np.random.uniform(0, 1, n_samples),  # Uniform distribution
    'Slope': np.random.uniform(0, 30, n_samples),  # Uniform distribution
    'Annual_Rainfall': np.random.normal(1500, 250, n_samples)  # Normal distribution
})

# Add noise function
def add_noise(factor, size):
    return np.random.normal(1, factor, size)

# Correlation 1: Altitude and Bamboo Coverage (with noise)
data['Bamboo_Coverage'] = np.interp(data['Altitude'], [1200, 3400], [0, 100])
data['Bamboo_Coverage'] *= add_noise(0.1, n_samples)  # Adding 10% noise

# Correlation 2: Livestock Density and Vegetation Diversity Index (with noise)
data['Vegetation_Diversity_Index'] *= np.exp(-data['Livestock_Density'])
data['Vegetation_Diversity_Index'] *= add_noise(0.05, n_samples)  # Adding 5% noise

# Correlation 3: Water Source Availability and Annual Rainfall (with noise)
data['Water_Source_Availability'] = np.where(data['Annual_Rainfall'] > data['Annual_Rainfall'].mean(), 1, 0)
data['Water_Source_Availability'] = np.where(np.random.rand(n_samples) < 0.1, 1 - data['Water_Source_Availability'], data['Water_Source_Availability'])  # Inverting 10% to add noise

# Correlation 4: Distance from Human Paths and Human Disturbance Index (with noise)
data['Human_Disturbance_Index'] = np.interp(data['Distance_from_Human_Paths'], [0, max(data['Distance_from_Human_Paths'])], [1, 0])
data['Human_Disturbance_Index'] *= add_noise(0.1, n_samples)  # Adding 10% noise

# Correlation 5: Slope and Bamboo Coverage (with noise)
data['Bamboo_Coverage'] *= np.where(data['Slope'] < 20, 1, 0.5)  # Reduce bamboo coverage on steep slopes
data['Bamboo_Coverage'] *= add_noise(0.1, n_samples)  # Adding 10% noise

# Define the suitability based on simplified conditions (from the previous example)
# Adjust thresholds if needed to accommodate correlations
data['Suitable'] = np.where(
    (data['Altitude'] > 1500) & 
    (data['Altitude'] < 3000) &
    (data['Distance_from_Human_Paths'] > 500) &
    (data['Livestock_Density'] < 5) &
    (data['Bamboo_Coverage'] > 20) &
    (data['Vegetation_Diversity_Index'] > 0.3) &
    (data['Water_Source_Availability'] == 1) &
    (data['Human_Disturbance_Index'] < 0.5) &
    (data['Slope'] < 20) &
    (data['Annual_Rainfall'] > 1000),
    1, 0)

# Mean Annual Temperature is derived with some noise, considering its correlation with Altitude
data['Mean_Annual_Temperature'] = 20 - (data['Altitude'] / 200)  # Simplified linear relation
data['Mean_Annual_Temperature'] += np.random.normal(0, 1, n_samples)  # Adding noise

# Output the first few rows of the dataset
print(data.head())

# Save the dataset to


      Altitude  Distance_from_Human_Paths  Livestock_Density  \
0  2023.988261                 434.678578           0.634095   
1  3291.571474                 374.299977           0.132676   
2  2810.386672                 216.189749           0.477984   
3  2517.048665                 339.831903           0.386706   
4  1543.241009                 227.141237           0.654869   

   Vegetation_Diversity_Index  Water_Source_Availability  \
0                    0.319653                          0   
1                    0.604097                          1   
2                    0.157106                          0   
3                    0.560503                          1   
4                    0.349722                          1   

   Human_Disturbance_Index      Slope  Annual_Rainfall  Bamboo_Coverage  \
0                 1.025552   1.902428      1364.685583        30.270545   
1                 1.032245  16.484248      1616.727901        71.382519   
2                 0.963946  2

In [12]:
from sklearn.cluster import KMeans

# Assuming `data` is your DataFrame with all the features but without the 'Suitable' column

# Normalize the data for clustering
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Use KMeans clustering
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(data_scaled)

# Add the cluster information to the original data
data['Cluster'] = clusters

# Check the centroid values to infer which cluster might be 'suitable'
print(kmeans.cluster_centers_)

# Assume cluster 1 is 'suitable' based on centroid analysis
data['Suitable'] = (data['Cluster'] == 1).astype(int)

# Drop the 'Cluster' column if it's no longer needed
data.drop('Cluster', axis=1, inplace=True)

# Now, the 'Suitable' column is derived from clustering, which may yield a more nuanced distribution
print(data['Suitable'].value_counts())


[[ 8.66623982e-01  1.06349065e-03  7.57121241e-04  2.45211983e-03
  -5.34520953e-03  2.06043447e-03 -9.66106843e-02 -1.71885032e-03
   7.87332850e-01  7.38661630e-03 -8.45277735e-01]
 [-8.35174742e-01 -1.02489724e-03 -7.29645787e-04 -2.36313394e-03
   5.15123523e-03 -1.98566259e-03  9.31047432e-02  1.65647432e-03
  -7.58761035e-01 -7.11856063e-03  8.14603137e-01]]
1    50941
0    49059
Name: Suitable, dtype: int64


In [13]:
data.Suitable.value_counts()

1    50941
0    49059
Name: Suitable, dtype: int64

In [14]:
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np

# Assuming `data` is your DataFrame with all the features but without the 'Suitable' column

# Normalize the data for clustering
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)

# Use KMeans clustering
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(data_scaled)

# Add the cluster information to the original data
data['Cluster'] = clusters

# Identify the centroids for each cluster
centroids = kmeans.cluster_centers_

# Calculate the distance of each point to the centroids
distances = kmeans.transform(data_scaled)

# Get the distances to the "farthest" centroid (assuming cluster 0 is 'unsuitable', cluster 1 is 'suitable')
data['Distance_to_Unsuitable'] = distances[:, 0]
data['Distance_to_Suitable'] = distances[:, 1]

# Determine the 30th percentile threshold for the distance to the 'suitable' centroid
threshold = np.percentile(data['Distance_to_Suitable'], 70)  # since we want the closest 30%

# Label the data based on the threshold
data['Suitable'] = (data['Distance_to_Suitable'] <= threshold).astype(int)

# Now, 30% of your data should be labeled as suitable
print(data['Suitable'].value_counts(normalize=True))  # Check the proportion of '1's and '0's

# Drop the columns not needed anymore
data.drop(['Cluster', 'Distance_to_Unsuitable', 'Distance_to_Suitable'], axis=1, inplace=True)

1    0.7
0    0.3
Name: Suitable, dtype: float64


In [15]:
data.Suitable.value_counts()

1    70000
0    30000
Name: Suitable, dtype: int64

In [16]:
data

Unnamed: 0,Altitude,Distance_from_Human_Paths,Livestock_Density,Vegetation_Diversity_Index,Water_Source_Availability,Human_Disturbance_Index,Slope,Annual_Rainfall,Bamboo_Coverage,Suitable,Mean_Annual_Temperature
0,2023.988261,434.678578,0.634095,0.319653,0,1.025552,1.902428,1364.685583,30.270545,1,9.912647
1,3291.571474,374.299977,0.132676,0.604097,1,1.032245,16.484248,1616.727901,71.382519,1,1.701986
2,2810.386672,216.189749,0.477984,0.157106,0,0.963946,29.466434,1373.229653,31.969303,1,6.258988
3,2517.048665,339.831903,0.386706,0.560503,1,1.142016,9.966878,1747.708413,50.076009,1,7.423174
4,1543.241009,227.141237,0.654869,0.349722,1,1.050573,10.647234,1500.073131,17.449099,0,12.331676
...,...,...,...,...,...,...,...,...,...,...,...
99995,2943.070627,486.628948,1.059066,0.097482,0,1.020356,27.083635,1122.526593,40.315010,1,2.828343
99996,2914.356369,496.073871,1.447014,0.163341,1,0.932492,10.520047,1598.369927,80.260443,1,4.820152
99997,2683.797496,106.722664,0.262224,0.680997,1,1.095725,11.544332,1523.291354,68.622167,1,6.417745
99998,2298.783938,529.334414,0.840479,0.315498,1,0.879566,23.171634,1468.113298,22.619058,1,5.281059


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 1: Prepare the data
# Features matrix 'X' and target variable 'y'
X = data.drop('Suitable', axis=1)
y = data['Suitable']

# Step 2: Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 3: Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 4: Initialize and train the Logistic Regression model
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train_scaled, y_train)

# Step 5: Predictions and evaluation
y_pred = log_reg.predict(X_test_scaled)

# Model evaluation
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.90

Confusion Matrix:
 [[ 7459  1482]
 [ 1478 19581]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.83      0.83      8941
           1       0.93      0.93      0.93     21059

    accuracy                           0.90     30000
   macro avg       0.88      0.88      0.88     30000
weighted avg       0.90      0.90      0.90     30000

