In [0]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error,r2_score,explained_variance_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
#%matplotlib inline

In [0]:
class RegressionModels():
  """Cross-validates a collection of scikit-learn regressors on tabular datasets.

  Each ``*Regression`` method builds one estimator, evaluates it with
  ``modelTraining`` on the supplied K-fold splitter and prints the resulting
  scores. ``wine_quality`` and ``communities`` load a CSV file, prepare the
  feature matrix / target vector and run every model through
  ``train_all__models``.
  """

  def preprocessing(self, n_splits=5, random_state=0):
    """Create the shuffled K-fold splitter used for every model.

    Args:
      n_splits: number of folds (defaults to the 5 used originally).
      random_state: seed for the shuffle so repeated runs produce the same
        folds; pass ``None`` for an unseeded shuffle.

    Returns:
      A ``KFold`` instance with ``shuffle=True``.
    """
    return KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

  def modelTraining(self,clf,X,y,kfolds):
    """Fit and evaluate ``clf`` on every fold produced by ``kfolds``.

    For each train/test split the estimator is refitted on the training rows
    and scored on the held-out rows. The mean explained-variance score and the
    mean R2 score are printed.

    Args:
      clf: estimator exposing ``fit(X, y)`` and ``predict(X)``.
      X: feature matrix; a pandas DataFrame (indexed with ``.iloc``) or any
        object supporting positional row indexing.
      y: target values, one per row of ``X``.
      kfolds: splitter exposing ``split(X)`` that yields
        ``(train_indices, test_indices)`` pairs.

    Returns:
      The mean over all folds of the root-mean-squared error.
    """
    # Positional indexing below must not be interpreted as label lookup,
    # which is what a pandas Series would do with an integer array.
    y = np.asarray(y)
    rmse_scores = []
    variance_scores = []
    r2_scores = []
    for train, test in kfolds.split(X):
      if hasattr(X, 'iloc'):
        X_train, X_test = X.iloc[train], X.iloc[test]
      else:
        X_train, X_test = X[train], X[test]
      y_train, y_test = y[train], y[test]
      clf.fit(X_train, y_train)
      y_pred = clf.predict(X_test)
      rmse_scores.append(math.sqrt(mean_squared_error(y_test, y_pred)))
      variance_scores.append(explained_variance_score(y_test, y_pred))
      r2_scores.append(r2_score(y_test, y_pred))
    print("Mean variance Score " ,np.mean(variance_scores))
    print("Mean R2 Score ", np.mean(r2_scores))
    return np.mean(rmse_scores)

  def svmRegression(self, kfolds, X, y, params):
    """Cross-validate a support vector regressor and print its mean RMSE.

    ``params`` is accepted for a uniform signature but is not read.
    """
    svr_model = SVR(gamma='scale', C=2.0, epsilon=0.3)
    mean_rmse = self.modelTraining(svr_model, X, y, kfolds)
    print("Mean RMSE for SVM Regression :", mean_rmse)

  def decisionTreeRegression(self, kfolds, X, y, params):
    """Cross-validate a decision tree regressor and print its mean RMSE.

    ``params`` is accepted for a uniform signature but is not read.
    """
    tree_model = DecisionTreeRegressor(random_state=0)
    mean_rmse = self.modelTraining(tree_model, X, y, kfolds)
    print("Mean RMSE for Decision Tree :", mean_rmse)

  def randomForestRegression(self, kfolds, X, y, params):
    """Cross-validate a random forest regressor and print its mean RMSE.

    Also prints the feature importances of the model fitted on the last fold.
    ``params`` is accepted for a uniform signature but is not read.
    """
    forest_model = RandomForestRegressor(max_depth=2, random_state=0, n_estimators=100)
    # The splitter (not the params dict) must be handed to modelTraining.
    mean_rmse = self.modelTraining(forest_model, X, y, kfolds)
    print(forest_model.feature_importances_)
    print("Mean RMSE for Random Forest :", mean_rmse)

  def adaBoostRegression(self, kfolds, X, y, params):
    """Cross-validate an AdaBoost regressor and print its mean RMSE.

    Also prints the feature importances and the ``score`` on the full data of
    the model left over from the last fold (so that score includes rows the
    model was trained on).
    ``params`` is accepted for a uniform signature but is not read.
    """
    ada_model = AdaBoostRegressor(random_state=0, n_estimators=100)
    mean_rmse = self.modelTraining(ada_model, X, y, kfolds)
    print(ada_model.feature_importances_)
    print("Score value : ", ada_model.score(X, y))
    print("Mean RMSE for AdaBoost :", mean_rmse)

  def gaussianProcessRegression(self, kfolds, X, y, params):
    """Cross-validate a Gaussian process regressor and print its mean RMSE.

    Uses a ``DotProduct() + WhiteKernel()`` kernel. Also prints the ``score``
    on the full data of the model left over from the last fold.
    ``params`` is accepted for a uniform signature but is not read.
    """
    kernel = DotProduct() + WhiteKernel()
    # GaussianProcessRegressor has no n_estimators option; passing one raises TypeError.
    gpr_model = GaussianProcessRegressor(kernel=kernel, random_state=0)
    mean_rmse = self.modelTraining(gpr_model, X, y, kfolds)
    print("Score value : ", gpr_model.score(X, y))
    print("Mean RMSE for Gaussian Process :", mean_rmse)

  def LinearRegression(self, kfolds, X, y, params):
    """Cross-validate an ordinary least squares model and print its mean RMSE.

    Also prints the ``score`` on the full data of the model left over from the
    last fold. ``params`` is accepted for a uniform signature but is not read.
    """
    # Inside a method body the bare name resolves to the module-level
    # LinearRegression estimator, not to this method.
    linear_model = LinearRegression()
    mean_rmse = self.modelTraining(linear_model, X, y, kfolds)
    print("Score value : ", linear_model.score(X, y))
    print("Mean RMSE for Linear Regression :", mean_rmse)

  def mlpRegression(self, kfolds, X, y, params):
    """Cross-validate a multi-layer perceptron regressor and print its mean RMSE.

    ``params`` is accepted for a uniform signature but is not read.
    """
    # The local must not be called MLPRegressor: assigning to that name makes
    # it local to the function and the constructor call raises UnboundLocalError.
    mlp_model = MLPRegressor(hidden_layer_sizes=(50,100), activation='relu', random_state=0)
    mean_rmse = self.modelTraining(mlp_model, X, y, kfolds)
    # MLPRegressor exposes no feature_importances_, so none are printed.
    print("Mean RMSE for MLP :", mean_rmse)

  def train_all__models(self, kfolds, X, y):
    """Run every regressor in this class on ``X`` / ``y`` using ``kfolds``.

    Each model method receives an empty parameter dict; none of them reads it.
    """
    svm_regression_params = {}
    dt_params = {}
    rd_params = {}
    self.svmRegression(kfolds, X, y, svm_regression_params)
    self.decisionTreeRegression(kfolds, X, y, dt_params)
    self.randomForestRegression(kfolds, X, y, rd_params)
    self.adaBoostRegression(kfolds, X, y, rd_params)
    self.gaussianProcessRegression(kfolds, X, y, rd_params)
    self.LinearRegression(kfolds, X, y, rd_params)
    self.mlpRegression(kfolds, X, y, rd_params)

  def wine_quality(self):
    """Load 'winequality-red.csv' and evaluate every model on it.

    Rows containing missing values are dropped, the correlation of each
    feature with ``quality`` is printed, the first 11 columns are used as
    features and the 12th column as the target.
    """
    wine = pd.read_csv('winequality-red.csv',delimiter=';').dropna(axis=0)
    print(wine.corr()['quality'].drop('quality'))
    X = wine[wine.columns[0:11]]
    y = wine[wine.columns[11:12]]
    kfolds = self.preprocessing()
    self.train_all__models(kfolds, X, y.values.ravel())

  def communities(self):
    """Load 'communities.data' and evaluate every model on it.

    The file has no header row, so column names are supplied here. '?' marks
    a missing value. The five identifier columns are dropped, the missing
    entries of 'OtherPerCap' are filled with that column's median, every other
    column that still contains missing values is dropped, and
    'ViolentCrimesPerPop' is used as the target.
    """
    columns_data = ['state','county','community','communityname','fold','population','householdsize','racepctblack','racePctWhite','racePctAsian','racePctHisp','agePct12t21',
                    'agePct12t29','agePct16t24','agePct65up','numbUrban','pctUrban','medIncome','pctWWage','pctWFarmSelf','pctWInvInc','pctWSocSec','pctWPubAsst','pctWRetire','medFamInc',
                    'perCapInc','whitePerCap','blackPerCap','indianPerCap','AsianPerCap','OtherPerCap','HispPerCap','NumUnderPov','PctPopUnderPov','PctLess9thGrade',
                    'PctNotHSGrad','PctBSorMore','PctUnemployed','PctEmploy','PctEmplManu','PctEmplProfServ','PctOccupManu','PctOccupMgmtProf','MalePctDivorce','MalePctNevMarr',
                    'FemalePctDiv','TotalPctDiv','PersPerFam','PctFam2Par','PctKids2Par','PctYoungKids2Par','PctTeen2Par','PctWorkMomYoungKids','PctWorkMom','NumIlleg','PctIlleg',
                    'NumImmig','PctImmigRecent','PctImmigRec5','PctImmigRec8','PctImmigRec10','PctRecentImmig','PctRecImmig5','PctRecImmig8','PctRecImmig10','PctSpeakEnglOnly','PctNotSpeakEnglWell'
                    ,'PctLargHouseFam','PctLargHouseOccup','PersPerOccupHous','PersPerOwnOccHous','PersPerRentOccHous','PctPersOwnOccup','PctPersDenseHous','PctHousLess3BR',
                    'MedNumBR','HousVacant','PctHousOccup','PctHousOwnOcc','PctVacantBoarded','PctVacMore6Mos','MedYrHousBuilt','PctHousNoPhone','PctWOFullPlumb','OwnOccLowQuart','OwnOccMedVal',
                    'OwnOccHiQuart','RentLowQ','RentMedian','RentHighQ','MedRent','MedRentPctHousInc','MedOwnCostPctInc','MedOwnCostPctIncNoMtg','NumInShelters','NumStreet','PctForeignBorn',
                    'PctBornSameState','PctSameHouse85','PctSameCity85','PctSameState85','LemasSwornFT','LemasSwFTPerPop','LemasSwFTFieldOps','LemasSwFTFieldPerPop','LemasTotalReq','LemasTotReqPerPop',
                    'PolicReqPerOffic','PolicPerPop','RacialMatchCommPol','PctPolicWhite','PctPolicBlack','PctPolicHisp','PctPolicAsian','PctPolicMinor','OfficAssgnDrugUnits','NumKindsDrugsSeiz',
                    'PolicAveOTWorked','LandArea','PopDens','PctUsePubTrans','PolicCars','PolicOperBudg','LemasPctPolicOnPatr','LemasGangUnitDeploy','LemasPctOfficDrugUn','PolicBudgPerPop',
                    'ViolentCrimesPerPop'
                    ]
    non_predictive_columns = ['fold','community','state','communityname','county']
    imputed_column = 'OtherPerCap'
    target_column = 'ViolentCrimesPerPop'

    # Parsing '?' as NaN at read time keeps the affected columns numeric;
    # replacing it afterwards leaves them as object (string) columns.
    df = pd.read_csv('communities.data',delimiter=',',names=columns_data,na_values='?')
    print("Before Droping data Shape " + str(df.shape))
    # According to the dataset description these 5 features are non-predictive.
    df = df.drop(non_predictive_columns,axis=1)

    print("Checking the Columns Containing the null Values")

    # Fill the gaps in this one column with its median so it survives the
    # column-wise dropna below (previously done through a hard-coded cell
    # position, row 130 / column 25).
    median_value = df[imputed_column].median(skipna=True)
    df[imputed_column] = df[imputed_column].fillna(median_value)

    df = df.dropna(axis=1)
    print(df.columns)
    print("After Droping data Shape " + str(df.shape))

    print("Number of Missing Values in column is " + str(df[imputed_column].isna().sum()))
    print(df.shape)
    # Select features/target by name instead of by fixed column positions.
    X = df.drop(columns=[target_column])
    y = df[target_column]
    print(X.describe())
    kfolds = self.preprocessing()
    self.train_all__models(kfolds, X, y.values.ravel())


In [44]:
# Build the model runner and evaluate every regressor on the data read from
# 'communities.data' (the path is relative to the working directory).
regressionModels = RegressionModels()
# Uncomment the next line to run the same evaluation on 'winequality-red.csv'.
#regressionModels.wine_quality()
regressionModels.communities()

Before Droping data Shape (1994, 128)
Checking the Columns Containing the null Values
Index(['population', 'householdsize', 'racepctblack', 'racePctWhite',
       'racePctAsian', 'racePctHisp', 'agePct12t21', 'agePct12t29',
       'agePct16t24', 'agePct65up',
       ...
       'PctForeignBorn', 'PctBornSameState', 'PctSameHouse85', 'PctSameCity85',
       'PctSameState85', 'LandArea', 'PopDens', 'PctUsePubTrans',
       'LemasPctOfficDrugUn', 'ViolentCrimesPerPop'],
      dtype='object', length=101)
After Droping data Shape (1994, 101)
Number of Missing Values in column is 0
(1994, 101)
        population  householdsize  ...  PctUsePubTrans  LemasPctOfficDrugUn
count  1994.000000    1994.000000  ...     1994.000000          1994.000000
mean      0.057593       0.463395  ...        0.161685             0.094052
std       0.126906       0.163717  ...        0.229055             0.240328
min       0.000000       0.000000  ...        0.000000             0.000000
25%       0.010000       0