In [None]:
import pandas as pd

# Read the combined dataset, then keep only the first row for any repeated index label
df = pd.read_csv("combined.csv")
first_seen = ~df.index.duplicated(keep="first")
df = df[first_seen]

In [None]:
# Preview the first 10 rows of the loaded data
df.head(10)

In [None]:
# Inspect the dtype pandas assigned to each column
df.dtypes

In [None]:
# Replace missing Number_Weeks_Used values with the column mean.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected Series: the in-place form mutates an
# intermediate object, which pandas warns about and which leaves df unchanged
# when copy-on-write is active.
weeks_used_mean = df['Number_Weeks_Used'].mean()
df['Number_Weeks_Used'] = df['Number_Weeks_Used'].fillna(weeks_used_mean)

In [None]:
#Function to return plots for the feature
import scipy.stats as stats
import pylab
import matplotlib.pyplot as plt 
import seaborn as sns


def normality(data, feature):
    """Show a density plot and a normal probability plot of one column.

    Draws two panels side by side in a 10x5 inch figure: a kernel density
    estimate of ``data[feature]`` on the left and a probability plot of the
    same values on the right, then displays the figure.

    Parameters
    ----------
    data : object indexable by column label (e.g. a pandas DataFrame)
        Container holding the values to plot.
    feature : label
        Key used to select the values, i.e. ``data[feature]``.

    Returns
    -------
    None
    """
    # Explicit Axes objects replace the pylab module and the implicit
    # "current axes" state, so each plot is bound to its own panel.
    fig, (ax_density, ax_qq) = plt.subplots(1, 2, figsize=(10, 5))

    sns.kdeplot(data[feature], ax=ax_density)
    ax_density.set_title(f"Density of {feature}")

    # probplot draws onto the Axes it is given and labels that panel itself.
    stats.probplot(data[feature], plot=ax_qq)

    fig.tight_layout()
    plt.show()
    
# Apply a Box-Cox transform to Estimated_Insects_Count and store the result in a
# new column, Estimated_Insects_Counts; `parameters` receives the second value
# returned by stats.boxcox.
boxcox_values, parameters = stats.boxcox(df['Estimated_Insects_Count'])
df['Estimated_Insects_Counts'] = boxcox_values

# Plot the transformed column to check the shape of its distribution
normality(df, 'Estimated_Insects_Counts')

In [None]:
import numpy as np
# Cap extreme values: every entry above its threshold is replaced by the mean of
# that column (the mean is taken over the whole column, extreme values included).
outlier_caps = {
    'Number_Weeks_Used': 55,
    'Number_Weeks_Quit': 40,
    'Number_Doses_Week': 80,
}
for column, upper_limit in outlier_caps.items():
    # Cast to float first: the replacement value is a float mean, and writing a
    # float into an integer column is an incompatible-dtype assignment.
    df[column] = df[column].astype(float)
    df.loc[df[column] > upper_limit, column] = df[column].mean()

# Drop the raw count column; the Box-Cox column Estimated_Insects_Counts remains.
# The raw column used to be capped (> 3500) immediately before this drop, which
# had no effect on anything kept, so that step is omitted.
df = df.drop(columns=["Estimated_Insects_Count"])

In [None]:
# Separate the target column from the remaining predictor columns
target_column = 'Crop_Damage'
y = df[target_column]
X = df.drop(columns=[target_column])

# Split into 65% train / 35% test with a fixed seed
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.65,
    random_state=0,
)

In [None]:
# One-hot encode the nominal columns of the training split, then min-max scale
# every resulting feature. The encoding must happen before the scaler is fitted:
# the same scaler is later applied to the one-hot encoded test split, so it has
# to be fitted on the encoded column layout rather than on the raw columns.
from sklearn.preprocessing import MinMaxScaler
X_train = pd.get_dummies(data = X_train, columns=["Season","Pesticide_Use_Category","Soil_Type","Crop_Type"])
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)

In [None]:
# Wrap the scaled training array in a DataFrame and look at its first 10 rows
from pandas import DataFrame
X_train_df = DataFrame(data=X_train)
X_train_df.head(n=10)

In [None]:
# One-hot encode the nominal columns of the test split, then apply the
# already-fitted scaler to the encoded frame
nominal_columns = ["Season", "Pesticide_Use_Category", "Soil_Type", "Crop_Type"]
X_test_encoded = pd.get_dummies(X_test, columns=nominal_columns)
X_test = scaler.transform(X_test_encoded)

In [None]:
# Wrap the scaled test array in a DataFrame and look at its first 10 rows
X_test_df = DataFrame(data=X_test)
X_test_df.head(n=10)

In [None]:
# Repeat the 65/35 split with the same seed, stored under separate *_net names
split_parts = train_test_split(X, y, train_size=0.65, random_state=0)
X_train_net, X_test_net, y_train_net, y_test_net = split_parts

In [None]:
# One-hot encode the nominal columns of the training split with get_dummies
nominal_columns = ["Season", "Pesticide_Use_Category", "Soil_Type", "Crop_Type"]
X_train_net = pd.get_dummies(X_train_net, columns=nominal_columns)
X_train_net.head(n=10)

In [None]:
# Fit a fresh min-max scaler on the encoded training features and rescale them.
# Note: this rebinds `scaler`, replacing whichever scaler the name held before.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train_net)
X_train_net = scaler.transform(X_train_net)

In [None]:
#Importing libraries for Neural Nets
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Input, Embedding, Flatten, Dense, Concatenate
from keras.models import Model
import tensorflow as tf

In [None]:
# Define the keras model: a stack of fully connected ReLU layers followed by a
# 3-unit softmax output layer.

input_dim = X_train_net.shape[1]  # number of feature columns in the training matrix

# (units, kernel_initializer) for each hidden layer, in order
hidden_layers = [
    (24, 'he_uniform'),
    (48, 'glorot_uniform'),
    (64, 'he_uniform'),
    (128, 'glorot_uniform'),
    (96, 'he_uniform'),
    (64, 'he_uniform'),
]

model = Sequential()
# Declare the input shape with an explicit Input object instead of passing the
# legacy `input_dim=` argument to the first Dense layer.
model.add(Input(shape=(input_dim,)))
for units, initializer in hidden_layers:
    model.add(Dense(units, activation='relu', kernel_initializer=initializer))
model.add(Dense(3, activation='softmax'))

In [None]:
# Print the model summary (layers, output shapes, parameter counts)
model.summary()

In [None]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss,
# and accuracy reported as the metric
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train on the scaled training features for 7 epochs in mini-batches of 16
model.fit(
    x=X_train_net,
    y=y_train_net,
    batch_size=16,
    epochs=7,
)

In [None]:
# Evaluate the trained model on the held-out test split.
# The loss is bound to a named variable instead of `_`: in a notebook, assigning
# to `_` shadows IPython's "last output" variable for the rest of the session.
test_loss, accuracy = model.evaluate(X_test, y_test)
print('Accuracy: %.2f' % (accuracy*100))

In [None]:
# Predict one row of class probabilities per test sample, then take the index of
# the largest probability in each row as the predicted class label.
# (Rounding only column 0 would report whether class 0 scored above one half,
# not which of the three softmax classes was predicted.)
predictions = model.predict(X_test)
rounded = predictions.argmax(axis=1).tolist()
print(rounded)

In [None]:
# Persist the trained ANN to an HDF5 (.h5) file
model_path = "Bayer_Crop_Science_Nov2021_DSInterview_ANN.h5"
model.save(model_path)
