In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from abc import ABCMeta, abstractmethod

In [None]:
class UtilsForMachineLearning(metaclass=ABCMeta):
  """Abstract base for ensembles assembled from several named estimators.

  Keeps a registry of ``(name, estimator)`` pairs next to the train and test
  splits. How the estimators' predictions are collected and combined is left
  to subclasses (``createDfPrediction`` and ``predictTestingData``).
  """

  def __init__(self):
    # Nothing is known yet: both splits are unset and the registry is empty.
    self.__xTest, self.__yTest = None, None
    self.__xTrain, self.__yTrain = None, None
    self.__models = []

  # ----- accessors ----------------------------------------------------------

  def getModels(self):
    """Return the live list of registered ``(name, estimator)`` pairs."""
    return self.__models

  def getXTrain(self):
    """Return the training features (``None`` until set)."""
    return self.__xTrain

  def getYTrain(self):
    """Return the training target (``None`` until set)."""
    return self.__yTrain

  def getXTest(self):
    """Return the test features (``None`` until set)."""
    return self.__xTest

  def getYTest(self):
    """Return the test target (``None`` until set)."""
    return self.__yTest

  def setTrainDataFrame(self, xTrain, yTrain):
    """Store the training features and target."""
    self.__xTrain, self.__yTrain = xTrain, yTrain

  def setTestDataFrame(self, xTest, yTest):
    """Store the test features and target."""
    self.__xTest, self.__yTest = xTest, yTest

  # ----- estimator registry -------------------------------------------------

  def createModel(self, modelClass, modelName, **kwargs):
    """Instantiate ``modelClass(**kwargs)`` and register it under ``modelName``."""
    entry = (modelName, modelClass(**kwargs))
    self.getModels().append(entry)

  def fitAllModels(self):
    """Fit every registered estimator on the training split, in order.

    Returns:
      The success message string when all estimators were fitted. If a
      ``fit`` call raises ``ValueError``, that exception object is returned
      (not raised) and the remaining estimators are left unfitted.
    """
    features, target = self.getXTrain(), self.getYTrain()
    try:
      for name, estimator in self.getModels():
        print("Training...", name)
        estimator.fit(features, target)
    except ValueError as failure:
      return failure
    return "Models had trained succesfully"

  # ----- scoring ------------------------------------------------------------

  @staticmethod
  def calc_cohen_cappa(dataFrame, predictColName="prediction", answerColName="answer", weights="quadratic"):
    """Return Cohen's kappa between two columns of ``dataFrame``.

    Args:
      dataFrame: frame holding both columns.
      predictColName: name of the column with predicted labels.
      answerColName: name of the column with the true labels.
      weights: weighting scheme forwarded to ``cohen_kappa_score``.
    """
    predicted = dataFrame[predictColName]
    expected = dataFrame[answerColName]
    return cohen_kappa_score(predicted, expected, weights=weights)

  # ----- hooks for subclasses -----------------------------------------------

  @abstractmethod
  def createDfPrediction(self, varType=None):
    """Collect the registered estimators' predictions into a DataFrame."""
    pass

  @abstractmethod
  def predictTestingData(self, dataFrame):
    """Turn the collected predictions into a final prediction and a score."""
    pass
  
class VoterModel(UtilsForMachineLearning):
  """Ensemble that merges the base models' predictions by weighted voting.

  Every registered model votes for the label it predicts; the vote counts for
  as much as that model's quadratic-weighted Cohen's kappa.
  """

  def createDfPrediction(self, varType=None):
    """Predict the test split with every registered model.

    Args:
      varType: dtype the predictions are cast to. When ``None`` the dtype of
        the test target is used.

    Returns:
      DataFrame that starts from the test target and gains, per model, a
      ``target_<name>`` column (its predictions) and a ``weights_<name>``
      column (its quadratic-weighted kappa, repeated on every row).
    """
    xTest = self.getXTest()
    yTest = self.getYTest()
    # Resolved once, and by identity rather than truthiness, so that an
    # explicitly passed dtype is never mistaken for "not given".
    if varType is None:
      varType = yTest.dtype
    castsToInteger = pd.api.types.is_integer_dtype(varType)

    finalDf = pd.DataFrame(yTest)
    for modelName, model in self.getModels():
      print("Predict...", modelName)
      targetName = "target_" + modelName

      predicted = model.predict(xTest)
      if castsToInteger:
        # Regressors return fractional values and a bare integer cast would
        # truncate them (7.9 -> 7); round to the nearest label first.
        predicted = np.round(predicted)
      finalDf[targetName] = predicted
      finalDf[targetName] = finalDf[targetName].astype(varType)

      # NOTE(review): the weight is computed from the test target and is later
      # used to vote on that same test split, so the final score is optimistic.
      # Consider deriving the weights from a separate validation split.
      finalDf["weights_" + modelName] = cohen_kappa_score(finalDf[targetName], yTest, weights="quadratic")

    return finalDf

  def predictTestingData(self, dataFrame):
    """Pick, for every row, the label with the largest summed model weight.

    Args:
      dataFrame: frame shaped like the output of ``createDfPrediction``:
        columns starting with ``target`` hold predictions and each has a
        sibling column where that leading ``target`` is ``weights``.

    Returns:
      dict with ``"dataFrame"`` (columns ``prediction`` and ``answer``) and
      ``"cohhen_cappa_score"`` (quadratic-weighted kappa of the two columns).

    Raises:
      ValueError: if ``dataFrame`` has no column starting with ``target``.
    """
    # Pair every prediction column with its weight column once, up front.
    # Only the leading "target" is swapped, so a model name that itself
    # contains "target" still resolves to the right weight column.
    columnPairs = [
      (col, col.replace("target", "weights", 1))
      for col in dataFrame.columns
      if col.startswith("target")
    ]
    if not columnPairs:
      raise ValueError("dataFrame has no 'target' columns to vote with")

    final_responses = []
    for _, row in dataFrame.iterrows():
      votes = {}
      for targetCol, weightCol in columnPairs:
        label = row[targetCol]
        votes[label] = votes.get(label, 0) + row[weightCol]

      # Exactly one response per row: on a tie the label that was voted for
      # first (leftmost model column) wins. Appending every tied label would
      # make the response list longer than the answer column.
      final_responses.append(int(max(votes, key=votes.get)))

    modelAnswersDf = pd.DataFrame({"prediction" : final_responses, "answer" : self.getYTest()})
    return {"dataFrame" : modelAnswersDf, "cohhen_cappa_score" : VoterModel.calc_cohen_cappa(modelAnswersDf)}

class LGBPredictionModel(UtilsForMachineLearning):
  """Stacked ensemble: a LightGBM regressor learns from the base models' output.

  The base models' predictions on the training split become the features of a
  ``lgb.LGBMRegressor``; its rounded output on the test split is the final
  prediction.
  """

  def createDfPrediction(self, varType=None):
    """Predict the training split with every registered model.

    Args:
      varType: dtype the predictions are cast to. When ``None`` the dtype of
        the training target is used.

    Returns:
      DataFrame that starts from the training target and gains, per model, a
      ``target_<name>`` column (its predictions) and a ``weights_<name>``
      column (its quadratic-weighted kappa, repeated on every row).
    """
    xTrain = self.getXTrain()
    yTrain = self.getYTrain()
    # Resolved once, and by identity rather than truthiness, so that an
    # explicitly passed dtype is never mistaken for "not given".
    if varType is None:
      varType = yTrain.dtype

    finalDf = pd.DataFrame(yTrain)
    for modelName, model in self.getModels():
      print("Predict...", modelName)
      targetName = "target_" + modelName

      finalDf[targetName] = model.predict(xTrain)
      finalDf[targetName] = finalDf[targetName].astype(varType)
      finalDf["weights_" + modelName] = cohen_kappa_score(finalDf[targetName], yTrain, weights="quadratic")

    return finalDf

  def predictTestingData(self, dataFrame):
    """Fit the meta-model on ``dataFrame`` and score it on the test split.

    Args:
      dataFrame: output of ``createDfPrediction`` (training target plus the
        ``target_<name>`` / ``weights_<name>`` columns of every model).

    Returns:
      dict with ``"dataFrame"`` (columns ``prediction`` and ``answer``) and
      ``"cohhen_cappa_score"`` (quadratic-weighted kappa of the two columns).
    """
    yTrain = self.getYTrain()
    # Everything except the target column(s) is a feature for the meta-model.
    fitDataFrame = dataFrame.drop(columns=pd.DataFrame(yTrain).columns)

    metaModel = lgb.LGBMRegressor()
    metaModel.fit(fitDataFrame, yTrain)

    # Rebuild the same feature columns, in the same order, for the test split.
    xTest = self.getXTest()
    testColumns = {}
    for modelName, model in self.getModels():
      targetName = "target_" + modelName
      weightName = "weights_" + modelName

      # Cast to the dtype the meta-model was trained on, so train and test
      # features agree even when createDfPrediction was given a custom varType.
      testColumns[targetName] = pd.Series(model.predict(xTest)).astype(dataFrame[targetName].dtype)
      # createDfPrediction writes one kappa per model onto every row, so the
      # value is read once and broadcast. Copying the column itself would align
      # the training index labels against this frame's fresh 0..n-1 index and
      # leave NaN wherever the labels do not match.
      testColumns[weightName] = dataFrame[weightName].iloc[0]

    finalDf = pd.DataFrame(testColumns)
    prediction = np.round(metaModel.predict(finalDf))

    modelAnswersDf = pd.DataFrame({"prediction" : prediction, "answer" : self.getYTest()})

    return {"dataFrame" : modelAnswersDf, "cohhen_cappa_score" : LGBPredictionModel.calc_cohen_cappa(modelAnswersDf)}



In [None]:
# Load the training data (needs network access).
train = pd.read_csv("http://video.ittensive.com/machine-learning/prudential/train.csv.gz")

# Split Product_Info_2 into its first character (kept as a category) and its
# second character (parsed as a number).
train["Product_Info_2_1"] = train["Product_Info_2"].str.slice(0, 1)
train["Product_Info_2_2"] = pd.to_numeric(train["Product_Info_2"].str.slice(1, 2))

# One int8 indicator column per distinct first character. Iterating over the
# distinct values (in order of first appearance) builds the same columns as
# iterating over every row, without redoing the work once per row.
for letter in train["Product_Info_2_1"].unique():
  train["Product_Info_2_1" + letter] = train["Product_Info_2_1"].isin([letter]).astype("int8")

train = train.drop(columns=["Product_Info_2", "Product_Info_2_1"])

# Columns whose missing values are replaced by 0.
NAN_TO_ZERO_COLUMNS = [
  "Employment_Info_1",
  "Employment_Info_4",
  "Employment_Info_6",
  "Insurance_History_5",
  "Family_Hist_2",
  "Family_Hist_3",
  "Family_Hist_4",
  "Family_Hist_5",
  "Medical_History_1",
  "Medical_History_10",
  "Medical_History_15",
  "Medical_History_24",
  "Medical_History_32",
]
# Assign the filled columns back: `train[col].fillna(..., inplace=True)` acts
# on a column selection and is not guaranteed to update `train` itself.
train[NAN_TO_ZERO_COLUMNS] = train[NAN_TO_ZERO_COLUMNS].fillna(0)

In [None]:
# Fixed seed so the train/test split is the same on every run of the notebook.
RANDOM_STATE = 42

# Feature set: four standalone columns plus every column whose name starts
# with one of the group prefixes.
column_groups = ["Insurance_History", "InsuredInfo", "Medical_Keyword", "Family_Hist", "Medical_History", "Product_Info"]
columns = ["Wt", "Ht", "Ins_Age", "BMI"]

for cg in column_groups:
  columns.extend(train.columns[train.columns.str.startswith(cg)])

data_train, data_test = train_test_split(train, test_size=0.2, random_state=RANDOM_STATE)

# Fit the scaler on the training rows only and apply it to both splits.
# Previously it was fitted on the full data and never used, so every model
# received unscaled features.
scaler = preprocessing.StandardScaler()
scaler.fit(data_train[columns])

# Keep the original row index so features and targets stay aligned.
x_train = pd.DataFrame(scaler.transform(data_train[columns]), columns=columns, index=data_train.index)
y_train = data_train["Response"]

x_test = pd.DataFrame(scaler.transform(data_test[columns]), columns=columns, index=data_test.index)
y_test = pd.DataFrame(data_test["Response"])

In [None]:
# Weighted-voting ensemble: register the base models, fit them, then vote.
voterModel = VoterModel()

voterModel.setTrainDataFrame(x_train, y_train)
voterModel.setTestDataFrame(x_test, data_test["Response"])

voterModel.createModel(lgb.LGBMRegressor, "lgb", max_iter=1000)
voterModel.createModel(KNeighborsClassifier, "knn_10", n_neighbors=10)
voterModel.createModel(LogisticRegression, "log_regr", class_weight="balanced", multi_class="multinomial")
voterModel.createModel(LinearSVC, "svc", max_iter=1000)
voterModel.createModel(SGDClassifier, "sgd", max_iter=10000)
voterModel.createModel(DecisionTreeClassifier, "tree")
voterModel.createModel(RandomForestClassifier, "random_forest")
voterModel.createModel(XGBClassifier, "xgb")

# fitAllModels returns (rather than raises) a ValueError from any model's fit.
# Stop here in that case instead of predicting with unfitted models.
voterFitStatus = voterModel.fitAllModels()
print(voterFitStatus)
if isinstance(voterFitStatus, Exception):
  raise voterFitStatus

finalDataFrame = voterModel.createDfPrediction()
voterAnswers = voterModel.predictTestingData(finalDataFrame)

In [None]:
# voterAnswers is the dict returned by VoterModel.predictTestingData: the
# prediction/answer frame under "dataFrame" and the quadratic-weighted kappa
# under "cohhen_cappa_score".
print(voterAnswers)

In [None]:
# Stacking ensemble: register the same base models and fit them.
lgbModel = LGBPredictionModel()

lgbModel.setTrainDataFrame(x_train, y_train)
lgbModel.setTestDataFrame(x_test, data_test["Response"])

lgbModel.createModel(lgb.LGBMRegressor, "lgb", max_iter=1000)
lgbModel.createModel(KNeighborsClassifier, "knn_10", n_neighbors=10)
lgbModel.createModel(LogisticRegression, "log_regr", class_weight="balanced", multi_class="multinomial")
lgbModel.createModel(LinearSVC, "svc", max_iter=1000)
lgbModel.createModel(SGDClassifier, "sgd", max_iter=10000)
lgbModel.createModel(DecisionTreeClassifier, "tree")
lgbModel.createModel(RandomForestClassifier, "random_forest")
lgbModel.createModel(XGBClassifier, "xgb")

# fitAllModels returns (rather than raises) a ValueError from any model's fit.
# Stop here in that case so later cells never predict with unfitted models.
lgbFitStatus = lgbModel.fitAllModels()
print(lgbFitStatus)
if isinstance(lgbFitStatus, Exception):
  raise lgbFitStatus

In [None]:
# Build the stacking features from the base models' predictions on the training
# split, then fit the LightGBM meta-model on them and score it on the test split.
# NOTE: this rebinds finalDataFrame, the name the voting-ensemble cell also assigns.
finalDataFrame = lgbModel.createDfPrediction()
lgbAnswers = lgbModel.predictTestingData(finalDataFrame)

In [None]:
# lgbAnswers is the dict returned by LGBPredictionModel.predictTestingData: the
# prediction/answer frame under "dataFrame" and the quadratic-weighted kappa
# under "cohhen_cappa_score".
print(lgbAnswers)