<a href="https://colab.research.google.com/github/gnatnib/crop_yield_prediction/blob/main/crop_yield_predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Crop Yield Prediction - Regresi Interpolasi**
**Anggota Kelompok:**


*   Bintang Syafrian Rizal - 24060122120031
*   Awang Pratama Mulya    - 24060122120039
*   Irfan Mursyid



In [None]:
#import dependencies
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [None]:
# Load the crop-yield dataset; the relative path is resolved against the
# current working directory.
df = pd.read_csv('yield_df.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

# **Data Cleaning**

In [None]:
# Remove the 'Unnamed: 0' column.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: the in-place version raised KeyError on a second
# execution because the column had already been dropped.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Preview again to confirm the 'Unnamed: 0' column is gone.
df.head()

In [None]:
# Print column dtypes and non-null counts.
df.info()

In [None]:
# Missing-value count per column.
df.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Keep only the first occurrence of each duplicated row.
df = df.drop_duplicates()

In [None]:
# Re-check after de-duplication; this is expected to be 0.
df.duplicated().sum()

In [None]:
# (rows, columns) remaining after cleaning.
df.shape

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max).
df.describe()

In [None]:
# Pairwise correlation, restricted to the numeric-dtype columns.
numerical_df = df.select_dtypes(include=[np.number])
correlation_matrix = numerical_df.corr()
print(correlation_matrix)

# **Data Visualization**

In [None]:
# Number of distinct values in 'Area' (dropna=False mirrors unique(), which
# would also count a missing value as one entry).
df['Area'].nunique(dropna=False)

In [None]:
# Row count per area; hue only colours the bars, so the legend is suppressed.
fig, g1 = plt.subplots(figsize=(20, 4))
sns.countplot(data=df, x='Area', hue='Area', legend=False, ax=g1)
g1.set_title('Distribution of Area')

# Rotate the area names so they do not overlap on the x axis.
g1.set_xticklabels(g1.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Row count per crop item, drawn horizontally; legend suppressed because hue
# repeats the y-axis categories.
fig, g2 = plt.subplots(figsize=(8, 4))
sns.countplot(data=df, y='Item', hue='Item', legend=False, ax=g2)
g2.set_title('Distribution of Item')
plt.show()

In [None]:
# Side-by-side histograms (with KDE overlay) of three numeric columns.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 3))

# (column, extra histplot kwargs) for each panel, left to right.
panels = [
    ("average_rain_fall_mm_per_year", {}),
    ("pesticides_tonnes", {'label': 'Data'}),
    ("avg_temp", {'label': 'Data'}),
]

g1, g2, g3 = (
    sns.histplot(df[column], kde=True, ax=axis, **extra)
    for axis, (column, extra) in zip(axes, panels)
)

for axis, (column, extra) in zip((g1, g2, g3), panels):
    axis.set_title(f"Distribution of {column}")

plt.show()

In [None]:
# Histogram (with KDE overlay) of the target column.
fig, g = plt.subplots(figsize=(4, 4))
sns.histplot(df["hg/ha_yield"], kde=True, label='Data', ax=g)
g.set_title("Distribution of target column: hg/ha_yield")

plt.show()

In [None]:
# Correlation heatmap of every column except the categorical 'Area' and 'Item'.
corr_matrix = df.drop(columns=['Area', 'Item']).corr()
# Diverging palette centred on 0 so positive and negative correlations get distinct hues.
cmap = sns.diverging_palette(220, 10, as_cmap=True)
plt.figure(figsize=(8, 6))
heatmap_ax = sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap=cmap,
    center=0,
    annot_kws={"size": 12},
)
heatmap_ax.set_title('Correlation Matrix')

In [None]:
# How many areas have fewer than 400 rows.
df['Area'].value_counts().lt(400).sum()

In [None]:
# Sum of 'hg/ha_yield' for every area, in the same order as `country`.
country = df['Area'].unique()
yield_per_country = [
    df.loc[df['Area'] == state, 'hg/ha_yield'].sum()
    for state in country
]

In [None]:
# Grand total of the target column over all rows.
df['hg/ha_yield'].sum()

In [None]:
# Per-area totals; entries are in the same order as `country`.
yield_per_country

In [None]:
# Horizontal bar chart of the summed 'hg/ha_yield' per area.
# `hue` is the same array as `y`, so it only gives each bar its own colour;
# the automatic legend would merely repeat every y-axis label. legend=False
# suppresses it, consistent with the count plots earlier in the notebook.
fig, ax = plt.subplots(figsize=(10, 25))
sns.barplot(y=country, x=yield_per_country, hue=country, dodge=False, legend=False, ax=ax)
ax.set_title('Total hg/ha_yield per Area')
ax.set_xlabel('Sum of hg/ha_yield')
ax.set_ylabel('Area')
plt.show()

In [None]:
# Sum of 'hg/ha_yield' for every crop item, in the same order as `crops`.
crops = df['Item'].unique()
yield_per_crop = [
    df.loc[df['Item'] == crop, 'hg/ha_yield'].sum()
    for crop in crops
]

In [None]:
# Horizontal bar chart of the summed 'hg/ha_yield' per crop item.
# `hue` is the same array as `y`, so the automatic legend would only repeat
# the y-axis labels; legend=False suppresses it, consistent with the count
# plots earlier in the notebook.
fig, ax = plt.subplots(figsize=(10, 25))
sns.barplot(y=crops, x=yield_per_crop, hue=crops, dodge=False, legend=False, ax=ax)
ax.set_title('Total hg/ha_yield per Item')
ax.set_xlabel('Sum of hg/ha_yield')
ax.set_ylabel('Item')
plt.show()

# **Building the Model**

In [None]:
# Look at the cleaned frame once more before selecting modelling columns.
df.head()

In [None]:
# Column names currently present in the frame.
df.columns

In [None]:
# Column order used for modelling. The target 'hg/ha_yield' is kept last
# because a later cell takes the feature names as col[:-1].
col = ['Year', 'average_rain_fall_mm_per_year','pesticides_tonnes', 'avg_temp', 'Area', 'Item', 'hg/ha_yield']

In [None]:
# Keep only the modelling columns, in the order given by `col`.
df = df.loc[:, col]

In [None]:
# Confirm the column selection and ordering.
df.head()

In [None]:
# Separate the target series from the feature frame.
target_column = 'hg/ha_yield'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# (rows, feature columns) of the feature frame.
X.shape

In [None]:
# Length of the target series; should match the row count of X.
y.shape

In [None]:
# Reproducible 2000-row subset of X with the matching targets.
# NOTE(review): this subset is NOT what gets split below. The previous comment
# claimed "Split the sampled data", but the split has always been run on the
# full X / y, and no cell shown uses X_sampled / y_sampled. Confirm whether
# the subset is still needed.
X_sampled = X.sample(n=2000, random_state=1)
# .loc makes it explicit that the lookup is by index label, not by position.
y_sampled = y.loc[X_sampled.index]

# Hold out 20% of the FULL dataset for testing; random_state fixes the shuffle.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, shuffle=True)


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Dense one-hot encoder that drops the first level of each categorical column.
# `sparse=` was renamed to `sparse_output=` in scikit-learn 1.2 and removed in
# 1.4, so the old keyword raises TypeError on current releases.
ohe = OneHotEncoder(drop='first', sparse_output=False)
scale = StandardScaler()
numeric_features = ['Year', 'average_rain_fall_mm_per_year', 'pesticides_tonnes', 'avg_temp']
categorical_features = ['Area', 'Item']

# Standardise the numeric columns and one-hot encode the categorical ones.
# `scale` and `ohe` are passed in directly (previously they were created but
# never used, while identical throw-away instances were built inline).
# ColumnTransformer fits clones, so these two objects themselves stay unfitted.
preprocessor = ColumnTransformer(
    transformers=[
        ('StandardScale', scale, numeric_features),
        ('OneHotEncode', ohe, categorical_features)
    ],
    # Every column of X is already listed above; passthrough only matters if
    # extra columns are added later.
    remainder='passthrough'
)

In [None]:
# Fit the scaler/encoder on the training split only, then apply the
# already-fitted transform to the test split so that no test-set statistics
# influence the preprocessing.
X_train_dummy = preprocessor.fit_transform(X_train)
X_test_dummy = preprocessor.transform(X_test)

In [None]:
# Names of the transformed output columns. col[:-1] is every entry of `col`
# except the trailing target, i.e. the input feature names.
preprocessor.get_feature_names_out(col[:-1])

# **Training the Model**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score

In [None]:
# Candidate regressors with default hyper-parameters.
# The decision tree gets a fixed random_state: scikit-learn permutes features
# at every split, so without a seed its metrics change from run to run.
models = {
    'LinearRegression' : LinearRegression(),
    'Decision Tree' : DecisionTreeRegressor(random_state=0),
    'KNN' : KNeighborsRegressor()
}

# One dict per model; a later cell turns this list into a DataFrame.
scores = []
for name, md in models.items():
    md.fit(X_train_dummy, y_train)
    y_predict = md.predict(X_test_dummy)

    # Compute each metric once and reuse it for both the table and the log
    # line. A regressor's .score() is defined as R^2 of its own predictions,
    # so reusing `r2` for 'Scores' gives the same number without re-running
    # predict() (which is expensive for KNN).
    mae = mean_absolute_error(y_test, y_predict)
    r2 = r2_score(y_test, y_predict)

    scores.append({
        'Model': name,
        'Scores' : r2,
        'MAE': mae,
        'R2': r2
    })
    print(f"{name} - Scores: {r2} , MAE: {mae}, R2: {r2}")

In [None]:
# Tabular view of each model's Scores, MAE and R2.
# Note: this rebinds `scores` from the list of per-model dicts to a DataFrame.
scores = pd.DataFrame(scores, columns=['Model', 'Scores', 'MAE', 'R2'])

In [None]:
# Display the model comparison table.
scores

In [None]:
# Bar chart of the 'Scores' column per model, with the value printed on each bar.
fig, ax = plt.subplots(figsize=(6, 6))
sns.barplot(data=scores, x='Model', y='Scores', ax=ax)
ax.bar_label(ax.containers[0])
ax.set_title('Model Scores')

In [None]:
# Standalone decision tree, fitted on the preprocessed training data; this is
# the model the `prediction` helper calls.
# random_state is fixed because scikit-learn permutes features at every split,
# so an unseeded tree (and therefore every prediction) can differ between runs.
dtr = DecisionTreeRegressor(random_state=0)
dtr.fit(X_train_dummy, y_train)
# Show the predictions for the held-out test rows.
dtr.predict(X_test_dummy)

In [None]:
# Column names, as a reminder of the arguments `prediction` needs.
df.columns

In [None]:
# Example rows, useful for choosing realistic input values.
df.head()

# **Prediction**

In [None]:
def prediction(Year, average_rain_fall_mm_per_year, pesticides_tonnes, avg_temp, Area, Item):
    """Predict 'hg/ha_yield' for a single observation.

    The six arguments are placed into a one-row DataFrame whose column names
    match the training features, passed through the notebook-level
    `preprocessor`, and then fed to the notebook-level `dtr` model. Both
    objects must already be fitted when this function is called.

    Returns:
        The first (and only) element of the model's prediction array.
    """
    raw_values = {
        'Year': Year,
        'average_rain_fall_mm_per_year': average_rain_fall_mm_per_year,
        'pesticides_tonnes': pesticides_tonnes,
        'avg_temp': avg_temp,
        'Area': Area,
        'Item': Item,
    }
    # Wrap every scalar in a one-element list so pandas builds exactly one row.
    single_row = pd.DataFrame({column: [value] for column, value in raw_values.items()})

    # Apply the same scaling / encoding that was fitted on the training data.
    encoded_row = preprocessor.transform(single_row)

    return dtr.predict(encoded_row)[0]

In [None]:
# Example prediction; keyword arguments make each value's meaning explicit.
result = prediction(
    Year=2025,
    average_rain_fall_mm_per_year=2898.0,
    pesticides_tonnes=1597.0,
    avg_temp=27.57,
    Area='Indonesia',
    Item='Potatoes',
)

In [None]:
# Predicted 'hg/ha_yield' for the example input.
result