In [None]:
import matplotlib as plt
import sklearn as sklearn
import scipy as scipy
import pandas as pd
import numpy as np

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import FeatureUnion
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [None]:
# Path to the training CSV. It is blank here, so pd.read_csv will fail until a
# real path is filled in.
training_data_path = ""

# Parse the file once. `target_data` is an independent copy of the same table
# (it is only used to look up the "TARGET_deathRate" column later), so there is
# no need to read and parse the CSV a second time.
data = pd.read_csv(training_data_path)
target_data = data.copy()

# Data Processing


In [None]:
class DataFrameSelector(BaseEstimator):
    """Pipeline step that picks a subset of columns and returns their raw values.

    Parameters
    ----------
    attribute_names
        Column label(s) used to index the incoming object in ``transform``.
    """

    def __init__(self, attribute_names):
        # Stored under the constructor argument's own name, unchanged.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the instance itself."""
        return self

    def transform(self, X):
        """Index ``X`` by the configured column labels and return ``.values``."""
        selected = X[self.attribute_names]
        return selected.values


# Feature columns fed to the pipeline: every column of `data` except the
# regression target. `data` is read from the same CSV as `target_data`, so it
# also carries "TARGET_deathRate"; keeping that column in the selector list
# would leak the label into the features and make the selector raise KeyError
# once the target column has been dropped from the train/test frames.
# NOTE(review): assumes all remaining columns are numeric (mean imputation and
# standard scaling require it) — confirm against the dataset.
data_num = [column for column in data.columns if column != "TARGET_deathRate"]

# Possible alternative to mean imputation: IterativeImputer(max_iter=30,
# random_state=0), which needs
# `from sklearn.experimental import enable_iterative_imputer` before the import.

# Preprocessing chain: select the feature columns, fill missing values with the
# column mean, then standardize each column.
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(data_num)),
    ('imputer', SimpleImputer(strategy="mean")),
    ('stand_scalar', StandardScaler()),
])

# Single-branch union, kept so further feature branches can be added later
# without renaming `full_pipeline`.
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline)
])

# Fit the preprocessing on the full table and keep the transformed matrix.
data_prepared = full_pipeline.fit_transform(data)

In [None]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
# Standalone copy of the target column.
target_data_copy = target_data["TARGET_deathRate"].copy()

# Attach the target to a copy of the feature table so that features and labels
# are shuffled and split together.
data_copy = data.copy()
data_copy["TARGET_deathRate"] = target_data["TARGET_deathRate"]

# 80/20 split with a fixed seed for reproducibility.
train_set, test_set = train_test_split(data_copy, test_size=0.2, random_state=42)

# Separate labels from features. Non-mutating drops are used because the split
# frames are slices of `data_copy`; dropping in place on them can trigger
# SettingWithCopyWarning and makes the cell non-idempotent on re-run.
train_set_target = train_set["TARGET_deathRate"].copy()
train_set = train_set.drop(columns="TARGET_deathRate")

test_set_target = test_set["TARGET_deathRate"].copy()
test_set = test_set.drop(columns="TARGET_deathRate")

# Learn the imputation means and scaling statistics from the training split
# only, then apply that same fitted transform to the test split. Re-fitting on
# the test split would leak test statistics into preprocessing and put the two
# splits on different scales.
train_prepared = full_pipeline.fit_transform(train_set)
test_prepared = full_pipeline.transform(test_set)

print(test_prepared)
print(test_set_target)

# OLS

In [None]:
# Fit ordinary least squares on the prepared training features.
lin_reg = linear_model.LinearRegression()
lin_reg.fit(train_prepared, train_set_target)

# Score on the held-out split. mean_squared_error takes (y_true, y_pred); the
# value is the same either way because squared error is symmetric, but the
# documented order is used here for consistency with the other model cells.
data_predictions = lin_reg.predict(test_prepared)
data_mse = mean_squared_error(test_set_target, data_predictions)
data_rmse = np.sqrt(data_mse)
print("OLS rmse:", data_rmse)

# Lasso and Ridge Regression

In [None]:
from sklearn.model_selection import GridSearchCV

# Search the L1 penalty strength over a 0.01-step grid with 5-fold
# cross-validation on the training split.
model_cv = linear_model.LassoCV(alphas=list(np.arange(0.01, 20, 0.01)), cv=5).fit(train_prepared, train_set_target)
print("Lasso best alpha:", model_cv.alpha_)

# Use the alpha chosen by the search directly instead of a hand-copied
# constant, so the fitted model always matches the search result above.
las_reg_a = model_cv.alpha_

lasso_reg = linear_model.Lasso(alpha=las_reg_a)
lasso_reg.fit(train_prepared, train_set_target)

# Held-out RMSE for the Lasso model.
lasso_data_predictions = lasso_reg.predict(test_prepared)
lasso_data_mse = mean_squared_error(test_set_target, lasso_data_predictions)
lasso_data_rmse = np.sqrt(lasso_data_mse)
print("Lasso rmse:", lasso_data_rmse)

In [None]:
# Search the L2 penalty strength over a 0.1-step grid with 5-fold
# cross-validation on the training split. The result must be bound to
# `model_cv`: the lines below read `model_cv.alpha_`, and with any other name
# they would silently report the alpha from a search run in an earlier cell.
model_cv = linear_model.RidgeCV(alphas=list(np.arange(0.1, 100, 0.1)), cv=5).fit(train_prepared, train_set_target)
print("Ridge best alpha:", model_cv.alpha_)

# Use the alpha chosen by the search directly instead of a hand-copied
# constant, so the fitted model always matches the search result above.
ridge_reg_a = model_cv.alpha_

ridge_reg = linear_model.Ridge(alpha=ridge_reg_a)
ridge_reg.fit(train_prepared, train_set_target)

# Held-out RMSE for the Ridge model.
ridge_data_predictions = ridge_reg.predict(test_prepared)

ridge_data_mse = mean_squared_error(test_set_target, ridge_data_predictions)
ridge_data_rmse = np.sqrt(ridge_data_mse)
print("Ridge rmse:", ridge_data_rmse)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# NOTE(review): LogisticRegression is a classifier and rejects continuous
# targets at fit time. "TARGET_deathRate" is modelled as a regression target by
# the other cells, so it is presumably continuous and would first need to be
# converted to class labels (e.g. thresholded) — confirm the intended classes.
logreg = LogisticRegression()

logreg.fit(train_prepared, train_set_target)

logistic_data_predictions = logreg.predict(test_prepared)

# score(X, y) expects the feature matrix and the true labels; it predicts
# internally and returns mean accuracy. Passing the labels as X and the
# predictions as y (as before) does not compute accuracy at all.
accuracy = logreg.score(test_prepared, test_set_target)

# Random Forest

In [None]:

# Seed the forest so the grid-search results are reproducible across runs
# (same seed as the train/test split).
forest_reg = RandomForestRegressor(random_state=42)

# Copy of the target table; not used by the search below.
target_data_copy = target_data.copy()

# Hyper-parameter candidates: first grid varies the number of trees and the
# number of features considered per split (bootstrap left at its default);
# second grid is a single small forest trained without bootstrapping.
param_grid = [
    {'n_estimators': [470, 430], 'max_features': [1, 23, 27]},
    {'bootstrap': [False], 'n_estimators': [30], 'max_features': [8]},
]

# 5-fold cross-validated search scored by negative MSE (higher is better).
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)
grid_search.fit(train_prepared, train_set_target)