In [269]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import scipy
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import seaborn as sns

In [242]:
raw_data=pd.read_csv('../data/raw_data_upd2.csv')


In [243]:
columns=['dummy','area_cd','item_cd','year','pesticide_vol','avg_rain','avg_temp','pest2','temp2','rain2','pest_rain','pest_temp','rain_temp','pest_rain_temp']
raw_data['dummy']=1
X_data=raw_data[columns]
Y_data=raw_data['tot_yield']/10000

X_data=X_data.to_numpy()
Y_data=Y_data.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X_data,Y_data, test_size=0.3)

In [244]:
X_val, X_test, y_val, y_test=train_test_split(X_test, y_test, test_size=0.3) 

In [245]:
def hyp(X,theta,b):
    h=np.dot(X,theta)+b
    return h

In [246]:
def cost_function(X,theta,b,y):
    a=(y-hyp(X,theta,b))**2
    c=np.sum(a)
    J=c/(2*X.shape[0])
    return J

In [252]:
def compute_gradient(X,theta,b,y):
    m,n = X.shape
    dj_dw=np.zeros((n,))
    dj_db=0
    for i in range(m):
        err=(np.dot(X[i],theta)+b)-y[i]
        for j in range(n):
            dj_dw[j]=(dj_dw[j]+err) * X[i,j]
        #if i<20:
        #    print(i,err,dj_dw,X[i],y[i])
        dj_db=(dj_db+err)
    dj_dw=dj_dw/m
    dj_db=dj_db/m
    return dj_dw,dj_db


In [259]:
def gradient_descent(X,theta_in,b_in,y,alpha,epoch):
    plot_cost=[]
    theta=theta_in
    b=b_in
    for i in range(epoch):
        dj_dw,dj_db=compute_gradient(X,theta,b,y)
        theta=theta-alpha*dj_dw
        b=b-alpha*dj_db
        plot_cost.append(cost_function(X,theta,b,y))
        if i%(epoch/100)==0:
            print(f"iteration {i:4d}: Cost {plot_cost[-1]:8.2f} ")
    return theta,b,plot_cost

In [262]:
initial_w=np.zeros(X_train.shape[1])
initial_b=0
alpha = 5.0e-4
epoch=10000
w_final, b_final, plot_cost = gradient_descent(X_train, initial_w, initial_b, y_train,alpha,epoch)



iteration    0: Cost    62.82 
iteration  100: Cost    58.06 
iteration  200: Cost    54.16 
iteration  300: Cost    50.97 
iteration  400: Cost    48.35 
iteration  500: Cost    46.21 
iteration  600: Cost    44.46 
iteration  700: Cost    43.03 
iteration  800: Cost    41.86 
iteration  900: Cost    40.90 
iteration 1000: Cost    40.11 
iteration 1100: Cost    39.47 
iteration 1200: Cost    38.94 
iteration 1300: Cost    38.51 
iteration 1400: Cost    38.16 
iteration 1500: Cost    37.87 
iteration 1600: Cost    37.63 
iteration 1700: Cost    37.44 
iteration 1800: Cost    37.28 
iteration 1900: Cost    37.15 
iteration 2000: Cost    37.04 
iteration 2100: Cost    36.96 
iteration 2200: Cost    36.89 
iteration 2300: Cost    36.83 
iteration 2400: Cost    36.78 
iteration 2500: Cost    36.74 
iteration 2600: Cost    36.71 
iteration 2700: Cost    36.68 
iteration 2800: Cost    36.66 
iteration 2900: Cost    36.64 
iteration 3000: Cost    36.63 
iteration 3100: Cost    36.62 
iteratio

In [271]:
pred=np.dot(X_val,w_final)+b_final
r2_score(y_val,pred)

-0.0008163126473799132

-0.048458267460461446