In [3]:
from openpyxl import Workbook
from openpyxl import load_workbook
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.linear_model import Ridge
from sklearn import metrics
from sklearn import linear_model
import statsmodels.api as sm
from sklearn import metrics
from random import shuffle

In [None]:
data = pd.read_csv("data.csv")
data = data.sample(frac=1).reset_index(drop=True)
pd.options.mode.chained_assignment = None


XLabels = ['TotalT','Temp','LSR','CA','Size', 'Moisture', 'IsoT', 'HeatT', 'Ramp','F_X', 'Ro', 'logRo', 'P']
X = data[XLabels]
#Scaling X
sc = StandardScaler()
X = sc.fit_transform(X)
data[XLabels] = X

papers = data['ID'].unique()
shuffle(papers)

#Remvoing papers from test list if they have less  than 10 points
papersWithLessThanXPoints = []
for paper in papers:
    dataFromPaper = data[data['ID'] == paper]
    if len(dataFromPaper.index) < 10:
        papersWithLessThanXPoints.append(paper)
papers = [x for x in papers if x not in papersWithLessThanXPoints]
        
# print(papersWithLessThanXPoints)
numPapers = len(papers)
papersPerGroup = 2
numBins = 10

error_Frame = pd.DataFrame(columns=['ID', 'Linear', 'Ridge', 'SVR'])

for paper in papers:
    print(paper)
    #Simple Linear Regr
    train_Frame = data[data['ID'] != paper]
    test_Frame = data[data['ID'] == paper]
    
    papers = train_Frame['ID'].unique()
    
    
    numPapers = len(papers)
    #combos = [papers[x:x + papersPerGroup] for x in range(0, len(papers), papersPerGroup)] 
    combos = []
    for x in range(0, len(papers), papersPerGroup):
        if x + papersPerGroup < len(papers):
            combos.append(papers[x:x + papersPerGroup])
        else:
            combos.append(papers[x:])


    lenTrain = len(train_Frame.index)
    train_Frame, valid_Frame, train_valid_Frame = train_Frame.iloc[:int(lenTrain * 0.8), :], train_Frame.iloc[int(lenTrain * 0.8):, :], train_Frame

    #Calculating Sample Weight
    bins = train_Frame['Yield'].value_counts(bins=numBins)
    for i in train_Frame.index:
        for j in bins.index:
            if int(train_Frame.at[i, 'Yield']) in j:
                train_Frame.at[i, 'Sample_Weight'] = 100/bins[j].item()

            
    bins = train_valid_Frame['Yield'].value_counts(bins=numBins)

    for i in train_valid_Frame.index:
        for j in bins.index:
            if int(train_valid_Frame.at[i, 'Yield']) in j:
                train_valid_Frame.at[i, 'Sample_Weight'] = 100/bins[j].item()
    
    y_train, y_valid, y_test, y_train_valid = train_Frame['Yield'], valid_Frame['Yield'], test_Frame['Yield'], train_valid_Frame['Yield']
    X_train, X_valid, X_test, X_train_valid = train_Frame[XLabels], valid_Frame[XLabels], test_Frame[XLabels], train_valid_Frame[XLabels]

    train_weights = train_Frame['Sample_Weight']
    train_valid_weights = train_valid_Frame['Sample_Weight']
    
    #Simple Linear Regression
    regr = linear_model.LinearRegression()
    regr.fit(X_train_valid, y_train_valid, sample_weight=train_valid_weights)
    y_pred = regr.predict(X_test)
    mseSLR = metrics.mean_absolute_error(y_test, y_pred)
    
    #Ridge ------
    
    alphas = [0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 100]
    
    errors = []
    
    for a in alphas:
        sumErrors = 0
        for c in combos:
            train_Frame = train_valid_Frame[~train_valid_Frame['ID'].isin(c)]
            valid_Frame = train_valid_Frame[train_valid_Frame['ID'].isin(c)]

            ##Calculating Sample Weight again just for train frame, because it's different
            bins = train_Frame['Yield'].value_counts(bins=numBins)

            for i in train_Frame.index:
                for j in bins.index:
                    if int(train_Frame.at[i, 'Yield']) in j:
                        train_Frame.at[i, 'Sample_Weight'] = 100/bins[j].item()

            y_train, y_valid, y_test, y_train_valid = train_Frame['Yield'], valid_Frame['Yield'], test_Frame['Yield'], train_valid_Frame['Yield']
            X_train, X_valid, X_test, X_train_valid = train_Frame[XLabels], valid_Frame[XLabels], test_Frame[XLabels], train_valid_Frame[XLabels]

            train_weights = train_Frame['Sample_Weight']
            

            ridgeModel = Ridge(alpha=a)
            ridgeModel.fit(X_train, y_train, sample_weight=train_weights) 
            y_pred = ridgeModel.predict(X_valid)
            error =  metrics.mean_absolute_error(y_valid, y_pred)
            sumErrors = sumErrors + error
        
        errors.append(sumErrors)

    
    best_alpha = alphas[np.argmin(errors)]
    print("Best alpha is: ", best_alpha)
    ridgeModel = Ridge(alpha=best_alpha)
    ridgeModel.fit(X_train_valid, y_train_valid, sample_weight=train_valid_weights) 
    y_pred = ridgeModel.predict(X_test)
    mseRidge =  metrics.mean_absolute_error(y_test, y_pred)
    
    
    #SVR------------------------------
    ##Parameter Fitting
    kernels =  ['poly', 'rbf', 'linear']
    epsilons = [0.1,5,10, 20]
    Cs = [0.1,1,10, 20]
    gammas = ['scale', 'auto']
    errors = []
    for kern in kernels:
        for ep in epsilons:
            for C_ in Cs:
                for gam in gammas:
                    sumErrors = 0
                    for c in combos:
                        train_Frame = train_valid_Frame[~train_valid_Frame['ID'].isin(c)]
                        valid_Frame = train_valid_Frame[train_valid_Frame['ID'].isin(c)]


                        ##Calculating Sample Weight again just for train frame, because it's different
                        bins = train_Frame['Yield'].value_counts(bins=numBins)
                        for i in train_Frame.index:
                            for j in bins.index:
                                if int(train_Frame.at[i, 'Yield']) in j:
                                    train_Frame.at[i, 'Sample_Weight'] = 100/bins[j].item()

                        y_train, y_valid, y_test, y_train_valid = train_Frame['Yield'], valid_Frame['Yield'], test_Frame['Yield'], train_valid_Frame['Yield']
                        X_train, X_valid, X_test, X_train_valid = train_Frame[XLabels], valid_Frame[XLabels], test_Frame[XLabels], train_valid_Frame[XLabels]

                        train_weights = train_Frame['Sample_Weight']


                        svrModel = SVR(kernel=kern, gamma='scale', epsilon=ep, cache_size=2000, C=C_)
                        svrModel.fit(X_train, y_train, sample_weight=train_weights) 
                        y_pred = svrModel.predict(X_valid)
                        error =  metrics.mean_absolute_error(y_valid, y_pred)
                        sumErrors = sumErrors + error
                    errors.append(sumErrors)
                
    index_of_lowest_error = np.argmin(errors)

    best_kernel = kernels[int(index_of_lowest_error / (len(epsilons) * len(Cs) * len(gammas)))] #Good
    best_ep = epsilons[int((index_of_lowest_error % (len(epsilons) *len(Cs) * len(gammas)))/(len(Cs) *len(gammas)))] #Good
    best_C = Cs[int((index_of_lowest_error % (len(Cs) * len(gammas)))/len(gammas))] #Good 
    best_gamma = gammas[i % len(gammas)]
    
    print("Best kernel is: ", best_kernel)
    print("Best Epsilon is: ", best_ep)
    print("Best C is: ", best_C)
    print("Best Gamma is: ", best_gamma)
    print("Error in valid:", errors[np.argmin(errors)])
    
    best_model = SVR(kernel=best_kernel, gamma='scale', epsilon=best_ep, cache_size=2000, C=best_C)
    best_model.fit(X_train_valid, y_train_valid, sample_weight=train_valid_weights)
    y_pred = best_model.predict(X_test)
    mseSVR = metrics.mean_absolute_error(y_test, y_pred)


    row = [[paper, mseSLR, mseRidge, mseSVR]]
    tempDf = pd.DataFrame(row, columns=['ID', 'Linear', 'Ridge', 'SVR'])
    error_Frame = pd.concat([error_Frame, tempDf], ignore_index=True)
    error_Frame.index = error_Frame['ID'].values
    error_Frame = error_Frame.sort_index()
    error_Frame.to_csv("CrossValidLinearRidgeSVRErrors.csv")

    
error_Frame.reset_index()
error_Frame.index = error_Frame['ID'].values
error_Frame = error_Frame.sort_index()


error_Frame.plot(kind='bar', stacked=False)
error_Frame.to_csv("CrossValidLinearRidgeSVRErrors.csv")

plt.ylabel('Mean Absolute Error (Yield %)')
plt.title("Accuracy of Models")
plt.savefig("CrossValidLinearRidgeSVR.png", dpi=600, bbox_inches='tight')


O


  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


Best alpha is:  100
Best kernel is:  linear
Best Epsilon is:  5
Best C is:  1
Best Gamma is:  auto
Error in valid: 146.87233707069586
I


  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


Best alpha is:  100
Best kernel is:  linear
Best Epsilon is:  0.1
Best C is:  0.1
Best Gamma is:  auto
Error in valid: 149.38327378322853
G


  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


Best alpha is:  100
Best kernel is:  linear
Best Epsilon is:  0.1
Best C is:  0.1
Best Gamma is:  auto
Error in valid: 142.94348493272207
E


  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


Best alpha is:  100
Best kernel is:  rbf
Best Epsilon is:  0.1
Best C is:  20
Best Gamma is:  auto
Error in valid: 132.69662578220914
K


  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


Best alpha is:  100
Best kernel is:  linear
Best Epsilon is:  5
Best C is:  0.1
Best Gamma is:  auto
Error in valid: 147.81055194034874
A


  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T
  overwrite_a=True).T


Best alpha is:  100
