In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib notebook

In [39]:
from warnings import simplefilter,filterwarnings
from sklearn.exceptions import DataConversionWarning
# Silence noisy warning categories for the rest of the session so that the
# model-fitting cells below produce readable output.
simplefilter('ignore', FutureWarning)
simplefilter('ignore', RuntimeWarning)
filterwarnings('ignore', category=DataConversionWarning)

## 0.0.1 Import Data

In [40]:
# Load the raw dataset; the path is relative to the notebook's working directory.
raw_data = pd.read_csv("corn_data.csv")
# Preview the first rows to sanity-check the column layout.
raw_data.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,1126,1127,1128,1129,1130,1131,1132,1133,1134,output
0,0,1,-1,1,1,1,0,1,-1,1,...,-1,1,-1,1,1,-1,0,0,-1,2.516485
1,1,1,-1,1,1,-1,-1,1,-1,1,...,-1,1,-1,0,1,-1,-1,1,0,1.640909
2,2,0,1,1,-1,-1,-1,1,-1,-1,...,-1,1,-1,1,1,-1,-1,1,0,0.416348
3,3,1,1,1,0,-1,1,1,-1,1,...,-1,1,-1,-1,1,-1,-1,1,0,1.338558
4,4,1,1,1,0,-1,-1,1,0,1,...,-1,1,-1,0,-1,-1,1,1,0,1.729517


In [41]:
# Features: every column except the first and the last.
# NOTE(review): the first CSV column looks like a saved row index - confirm.
X = raw_data.iloc[:,1:-1]
# Target: the last column, kept as a single-column DataFrame (note the `-1:` slice).
y = raw_data.iloc[:,-1:]
print("X shape:",X.shape)
print("y shape:",y.shape)
# Reduce over the underlying array instead of using the builtin min/max, which
# iterate row by row and only work because y has exactly one column.
print("Min y:",y.values.min())
print("Max y:",y.values.max())

X shape: (264, 1135)
y shape: (264, 1)
Min y: 0.0117532464005208
Max y: 4.9270454514617015


## 0.0.2 Visualize our data

We would like to visualize how the values of the output are distributed, using a histogram. 

Since our output values range from 0.01 to 4.93, it makes sense to display our output in five unit-wide bins, from [0-1] to [4-5].

In [42]:
# Count the output values in five unit-wide bins covering [0, 5].
hist, bin_edges = np.histogram(y, bins = 5, range = (0,5))
fig, ax = plt.subplots()
# Bars start at each bin's left edge; a width just under 1 leaves a small gap.
ax.bar(bin_edges[:-1],hist,width=.98,align='edge')
ax.set_title("Output Distribution")
# Axis labels so the figure can be read on its own.
ax.set_xlabel("Output value")
ax.set_ylabel("Number of samples")
plt.show()

<IPython.core.display.Javascript object>

Remark: Most of our values fall within [1-3]. The highest yields [4-5], unsurprisingly, are the fewest.

## 0.0.3 Construct a Scoring system

### i- Scorer class
First construct a scoring class. This will keep track of all our scores.

- v1: 

In [43]:
class ScoreSheet:
    """A titled table of scores.

    Each row is one set of scores (e.g. "Train" or "Test") and each column is
    one predictor. The table is stored as a pandas DataFrame in ``self.sheet``.
    """

    def __init__(self,title):
        """Create an empty score sheet with the given title."""
        self.title=title
        self.sheet = pd.DataFrame({})

    def add_score(self,series,values=None):
        """Append one row of scores to the sheet.

        Two call forms are supported:

        * ``add_score(series)`` - ``series`` is a named ``pd.Series`` (or a
          DataFrame of rows); its name becomes the row label and its index the
          column labels.
        * ``add_score(name, values)`` - ``series`` is the row label and
          ``values`` the scores, aligned to the sheet's existing columns
          when it has any.

        New columns are added in first-seen order.

        Raises:
            TypeError: if a single argument is given that is not a named
                Series or a DataFrame.
        """
        if values is None:
            if isinstance(series, pd.DataFrame):
                rows = series
            elif isinstance(series, pd.Series):
                if series.name is None:
                    # Without a name there is no meaningful row label.
                    raise TypeError("add_score needs a named Series; set series.name first")
                rows = series.to_frame().T
            else:
                raise TypeError("add_score expects a pandas Series, or a (name, values) pair")
        else:
            # (name, values) form: reuse the existing column labels when available.
            columns = self.sheet.columns if len(self.sheet.columns) else None
            rows = pd.Series(values, index=columns, name=series).to_frame().T

        if self.sheet.shape == (0, 0):
            # Adopt the first row directly; concatenating with a completely
            # empty frame can disturb dtypes and triggers pandas warnings.
            self.sheet = rows
        else:
            # DataFrame.append was removed in pandas 2.0; concat is the replacement.
            self.sheet = pd.concat([self.sheet, rows])

    def get_pred_labels(self):
        """Return the column labels (predictor names) as a list."""
        return list(self.sheet.columns)

    def get_test_labels(self):
        """Return the row labels (score-set names) as a list."""
        return list(self.sheet.index)

    def get_test(self,name):
        """Return the row labelled ``name``."""
        return self.sheet.loc[name]

    def get_predictor(self,name):
        """Return the column ``name`` as a list, one value per row."""
        return list(self.sheet[name])

    def get_sheet(self):
        """Return the underlying DataFrame."""
        return self.sheet
    
    

### ii - Drawer class

This class will help with the plotting of graphs in the Scorer

- v1.0

In [78]:
class Draw:
    """Plotting helpers for score sheets; called on the class, e.g. ``Draw.heatmap(ss)``."""

    @staticmethod
    def heatmap(ss):
        """Draw an annotated heatmap of a score sheet, titled with the sheet's title.

        ``ss`` must provide ``get_sheet()`` and a ``title`` attribute.
        """
        # Colour scale is fixed to [-1, 1] so separate heatmaps are comparable.
        sns.heatmap(ss.get_sheet(),annot=True, fmt=".3f",cmap="coolwarm", center = -0.1 ,
                    square = True,linewidths=.1,vmin=-1,vmax=1,cbar=False)
        plt.title(ss.title)

    @staticmethod
    def lineplot(df,cols):
        """Plot one line per predictor in ``cols`` across the sheet's rows.

        ``df`` must provide ``get_test_labels()`` (x values) and
        ``get_predictor(name)`` (y values). A name that is not a column of the
        sheet propagates the lookup error from ``get_predictor``.
        """
        ax = plt.subplot()
        # The x labels are the same for every line, so fetch them once.
        x = df.get_test_labels()
        for name in cols:
            # Label each line; otherwise the legend below has nothing to show.
            ax.plot(x,df.get_predictor(name),label=name)
        ax.set_ylim([0.0,1.0])
        ax.legend()
        plt.show()
        

### iii - Test class

In [79]:
from random import random
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split  
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import AdaBoostRegressor

class Test:
    """Fit a set of named regressors on (X, y) and report their R2 scores."""

    def __init__(self,X,y,random_state=None):
        """Store the data to evaluate on.

        Args:
            X: feature matrix.
            y: target values; a single-column 2-D target is flattened before fitting.
            random_state: optional integer seed. When given, the sequence of
                train/test splits is reproducible. Randomness inside the
                models themselves is not controlled by this seed.
        """
        self.X = X
        self.y = y
        # One shared generator, so successive splits differ from each other
        # while the whole sequence stays reproducible.
        self._rng = None if random_state is None else np.random.RandomState(random_state)

    def y_stratified(self):
        """Return the target cast to integers (fractional parts are truncated)."""
        return np.array(self.y,dtype="int")

    def predictor(self,name):
        """Return a new, default-configured regressor for ``name``.

        Supported names: "SVR", "RFR", "XGB", "ADA".

        Raises:
            KeyError: if ``name`` is not a supported predictor.
        """
        # Factories instead of instances: only the requested model is built.
        factories = {"SVR":lambda: SVR(),
                     "RFR":lambda: RandomForestRegressor(),
                     "XGB":lambda: XGBRegressor(),
                     "ADA":lambda: AdaBoostRegressor()}
        if name not in factories:
            raise KeyError("Unknown predictor {!r}; expected one of {}".format(
                name, sorted(factories)))
        return factories[name]()

    # Determine the R2 Score for a list of predictors
    def R2(self,predictors):
        """Fit each named predictor on one random train/test split and score it.

        All predictors share the same split, so their scores are directly
        comparable.

        Returns:
            (train, test): two Series indexed by predictor name, named
            "Train" and "Test".
        """
        y = np.asarray(self.y)
        if y.ndim == 2 and y.shape[1] == 1:
            # Estimators expect a 1-D target; a column vector triggers a
            # DataConversionWarning on fit.
            y = y.ravel()
        X_train, X_test, y_train, y_test = train_test_split(
            self.X, y, random_state=self._rng)

        train_r2 = []
        test_r2 = []
        for name in predictors:
            clf = self.predictor(name)
            clf.fit(X_train,y_train)
            train_r2.append(r2_score(y_train,clf.predict(X_train)))
            test_r2.append(r2_score(y_test,clf.predict(X_test)))
        return (pd.Series(train_r2,index=predictors,name="Train"),
                pd.Series(test_r2,index=predictors,name="Test"))

    def R2_avg(self,predictors,iterations):
        """Average the R2 scores over ``iterations`` independent calls to ``R2``.

        Returns:
            (train, test): two Series of mean scores indexed by predictor
            name, named "Train" and "Test".

        Raises:
            ValueError: if ``iterations`` is less than 1.
        """
        if iterations < 1:
            raise ValueError("iterations must be at least 1")
        all_train_r2 = []
        all_test_r2 = []
        for _ in range(iterations):
            train_r2,test_r2 = self.R2(predictors)
            all_train_r2.append(np.asarray(train_r2, dtype=float))
            all_test_r2.append(np.asarray(test_r2, dtype=float))
        # Rows are runs and columns are predictors; average over the runs.
        mean_train_r2 = np.mean(all_train_r2, axis=0)
        mean_test_r2 = np.mean(all_test_r2, axis=0)
        return (pd.Series(mean_train_r2,index=predictors,name="Train"),
                pd.Series(mean_test_r2,index=predictors,name="Test"))

In [80]:
# Evaluation harness over the full feature matrix X and target y.
test1 = Test(X,y)

In [81]:
# Score the four predictors once and collect the train/test R2 values in a sheet.
ss = ScoreSheet("Training")
train,test = test1.R2(["ADA","RFR","SVR","XGB"])
ss.add_score(train)
ss.add_score(test)
# Display the resulting table (rows: Train/Test, columns: predictors).
ss.get_sheet()

Unnamed: 0,ADA,RFR,SVR,XGB
Train,0.771018,0.842936,0.754266,1.0
Test,0.138885,0.111737,0.139858,-0.05343


In [84]:
# Average the R2 scores over 5 runs.
# Note: `test` is rebound here to a Test instance; an earlier cell used the
# same name for the test-score Series.
test = Test(X,y)
ss = ScoreSheet("Average Raw R2")
train_avg,test_avg = test.R2_avg(["ADA","RFR","SVR","XGB"],5)
ss.add_score(train_avg)
ss.add_score(test_avg)
# Display the averaged table.
ss.get_sheet()

Unnamed: 0,ADA,RFR,SVR,XGB
Train,0.762144,0.829818,0.73962,1.0
Test,0.034405,0.074942,0.162547,-0.05317


In [85]:
# Heatmap of whichever score sheet `ss` currently refers to.
Draw.heatmap(ss)

<IPython.core.display.Javascript object>

In [None]:
# Only plot predictors that were actually scored: the sheet has no "GB"
# column, so requesting it would raise a KeyError.
Draw.lineplot(ss,["SVR","XGB","RFR","ADA"])