#### Notebook to generate the dummy data and save it to CSV

In [48]:
import pandas as pd
import numpy as np

##### First generate the 10 dummy features

In [47]:
# Seed NumPy's global random generator so that Restart & Run All regenerates
# exactly the same dataset. Later cells that draw from np.random (missing
# values, outliers) become deterministic as well.
SEED = 42
np.random.seed(SEED)

# Number of synthetic rows; every feature column is generated with this length.
N_SAMPLES = 1000

dummy_data = {
    # Soil features: one categorical colour plus uniformly distributed pH, N and P
    'soil_color': np.random.choice(['dark brown', 'reddish', 'light brown'], N_SAMPLES),
    'soil_ph': np.random.uniform(5.5, 7.5, N_SAMPLES),
    'soil_n': np.random.uniform(10, 50, N_SAMPLES),
    'soil_p': np.random.uniform(10, 50, N_SAMPLES),
    # Weather features: current and forecast values share the same ranges
    'temp': np.random.uniform(25, 35, N_SAMPLES),
    'rainfall': np.random.uniform(0, 300, N_SAMPLES),
    'forecast_temp': np.random.uniform(25, 35, N_SAMPLES),
    'forecast_rainfall': np.random.uniform(0, 300, N_SAMPLES),
    # Crop features: both categorical
    'crop_type': np.random.choice(['wheat', 'corn', 'rice'], N_SAMPLES),
    'plant_health': np.random.choice(['healthy', 'yellowing', 'wilting'], N_SAMPLES)
}

df = pd.DataFrame(dummy_data)
df.head()

Unnamed: 0,soil_color,soil_ph,soil_n,soil_p,temp,rainfall,forecast_temp,forecast_rainfall,crop_type,plant_health
0,dark brown,5.795332,43.818991,23.462286,28.271618,230.795307,29.702496,195.875237,corn,healthy
1,dark brown,7.075015,20.05303,43.725556,32.992559,75.336117,25.187106,26.551177,corn,wilting
2,reddish,6.79618,23.459571,21.153098,31.58878,214.583842,34.234389,271.540611,wheat,healthy
3,dark brown,7.026346,19.010482,41.065633,32.267196,159.075276,26.590379,188.229598,wheat,healthy
4,reddish,7.047369,10.303615,13.855357,28.453905,243.529369,33.440398,157.20139,wheat,healthy


##### Then generate the target variable


In [40]:
def calculate_optimal_fertilizer(row):
    """Compute the synthetic target (fertilizer amount, kg/ha) for one record.

    The amount starts at a base rate of 100 and is increased by additive
    corrections for soil nitrogen, soil phosphorus and soil pH. The total is
    then scaled by a weather factor and a crop-specific factor.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row or a dict)
        Must provide 'soil_n', 'soil_p', 'soil_ph', 'temp' and 'crop_type'.
        'rainfall' is read only when 'temp' lies within [25, 30].

    Returns
    -------
    float
        The fertilizer amount for this record.

    Raises
    ------
    KeyError
        If 'crop_type' is not one of 'wheat', 'corn' or 'rice'.
    """
    # Base rate in kg/ha
    total = 100

    # The further nitrogen and phosphorus fall below 50, the more is added
    total = total + (50 - row['soil_n']) * 0.5
    total = total + (50 - row['soil_p']) * 0.3

    # Flat surcharge when the pH lies outside the 6-7 band
    if row['soil_ph'] < 6 or row['soil_ph'] > 7:
        total = total + 10

    # 10% reduction when both temperature and rainfall are in their favourable ranges
    if 25 <= row['temp'] <= 30 and 50 <= row['rainfall'] <= 150:
        total = total * 0.9

    # Crop-specific scaling; an unknown crop type raises KeyError
    scale_by_crop = {'wheat': 1.0, 'corn': 1.2, 'rice': 0.9}
    return total * scale_by_crop[row['crop_type']]

# Evaluate the rule-based target row by row on the unrounded features, attach
# it as a new column, then round every numeric column to two decimal places.
df = df.assign(
    optimal_fertilizer_amount=df.apply(calculate_optimal_fertilizer, axis=1)
).round(2)

df.head()

Unnamed: 0,soil_color,soil_ph,soil_n,soil_p,temp,rainfall,forecast_temp,forecast_rainfall,crop_type,plant_health,optimal_fertilizer_amount
0,light brown,6.9,18.69,12.36,29.63,186.32,28.89,61.36,wheat,healthy,126.95
1,reddish,6.3,30.79,41.48,34.6,180.01,29.64,239.27,rice,healthy,100.95
2,dark brown,5.96,27.51,21.46,30.86,63.06,34.07,12.67,wheat,wilting,129.81
3,dark brown,5.66,20.61,19.42,33.1,205.37,25.73,261.76,wheat,healthy,133.87
4,light brown,7.05,21.51,24.78,32.73,144.68,30.69,33.72,rice,yellowing,118.63


#### Introduce missing values and outliers in the dummy data

In [41]:
# Blank out a small random sample of readings in the temperature and rainfall columns
missing_percentage = 0.003  # stored as a fraction of the rows, i.e. 0.3%

# The number of cells to blank is the same for every column, so compute it once
num_missing = int(len(df) * missing_percentage)

for col in ('temp', 'rainfall'):
    # Draw distinct row labels independently for each column
    missing_indices = np.random.choice(df.index, size=num_missing, replace=False)
    df.loc[missing_indices, col] = np.nan

In [42]:
# Plant extreme values in the soil nitrogen and soil phosphorus columns
outlier_percentage = 0.001  # stored as a fraction of the rows, i.e. 0.1%

# The number of outliers is the same for every column, so compute it once
num_outliers = int(len(df) * outlier_percentage)

for col in ('soil_n', 'soil_p'):
    # Draw distinct row labels independently for each column
    outlier_indices = np.random.choice(df.index, size=num_outliers, replace=False)

    # Each chosen row is overwritten with 100 times the column's current maximum
    df.loc[outlier_indices, col] = df[col].max() * 100

In [43]:
# Check that the missing values / outliers were introduced as intended, using
# summary statistics of the numeric columns: a `count` below the number of rows
# reveals missing values in that column, and an unusually large `max` reveals
# an injected outlier.
df.describe()

Unnamed: 0,soil_ph,soil_n,soil_p,temp,rainfall,forecast_temp,forecast_rainfall,optimal_fertilizer_amount
count,1000.0,1000.0,1000.0,997.0,997.0,1000.0,1000.0,1000.0
mean,6.49121,34.73181,35.1482,30.042818,151.611825,29.85155,146.12657,123.78989
std,0.57284,157.380131,157.297741,2.900943,88.198329,2.885652,85.434863,17.724344
min,5.5,10.02,10.14,25.0,0.94,25.01,0.68,83.85
25%,6.01,19.195,20.4725,27.54,74.18,27.26,73.295,110.07
50%,6.48,29.66,30.68,30.01,152.98,29.77,140.775,120.825
75%,7.0025,39.975,40.0525,32.6,226.16,32.41,220.125,136.8175
max,7.5,4993.0,4991.0,34.99,299.53,34.99,299.78,169.93


#### Save dummy data to CSV

In [44]:
# Write the final dataset (features, target, injected missing values and outliers)
# to a CSV file at a relative path; index=False leaves the row index out of the file.
df.to_csv('../data/dummy_data.csv', index=False)