# ENVIRONMENT

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.formula.api import ols
from statsmodels.api import qqplot
from statsmodels.tools import add_constant
import statsmodels.api as sm

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LassoCV
from sklearn import preprocessing

# Pandas display configuration.
# Render floats in fixed-point notation (two decimals) instead of scientific
# notation, and allow up to 100 rows/columns before output is truncated.
pd.options.display.float_format = "{:.2f}".format
pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 100)
# Use the fully qualified option key. The bare "precision" pattern is ambiguous
# on newer pandas releases (it also matches "styler.format.precision") and
# raises OptionError, which would break this cell on a fresh kernel.
pd.set_option("display.precision", 3)

# Matplotlib defaults: every figure is 12x10 inches at 150 dpi.
plt.rcParams.update({"figure.figsize": [12, 10], "figure.dpi": 150})

# Seaborn theming: start from seaborn's defaults, then use the "notebook"
# context with thicker lines and the "whitegrid" style.
sns.set()
sns.set_context(context="notebook", rc={"lines.linewidth": 2.5})
sns.set_style(style="whitegrid")

import warnings

# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices from the libraries imported above.
warnings.filterwarnings(action="ignore")


In [2]:
# Load the cleaned medical data set from the sibling task directory; the path
# is relative to the notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer="../TASK 1 - MULTIPLE REGRESSION FOR PREDICTIVE MODELING/src/medical_clean.csv"
)


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape


(10000, 50)

In [4]:
# Per-column dtype and non-null count, to check types and spot missing values.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 50 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   CaseOrder           10000 non-null  int64  
 1   Customer_id         10000 non-null  object 
 2   Interaction         10000 non-null  object 
 3   UID                 10000 non-null  object 
 4   City                10000 non-null  object 
 5   State               10000 non-null  object 
 6   County              10000 non-null  object 
 7   Zip                 10000 non-null  int64  
 8   Lat                 10000 non-null  float64
 9   Lng                 10000 non-null  float64
 10  Population          10000 non-null  int64  
 11  Area                10000 non-null  object 
 12  TimeZone            10000 non-null  object 
 13  Job                 10000 non-null  object 
 14  Children            10000 non-null  int64  
 15  Age                 10000 non-null  int64  
 16  Incom

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()


Unnamed: 0,CaseOrder,Zip,Lat,Lng,Population,Children,Age,Income,VitD_levels,Doc_visits,Full_meals_eaten,vitD_supp,Initial_days,TotalCharge,Additional_charges,Item1,Item2,Item3,Item4,Item5,Item6,Item7,Item8
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,50159.32,38.75,-91.24,9965.25,2.1,53.51,40490.5,17.96,5.01,1.0,0.4,34.46,5312.17,12934.53,3.52,3.51,3.51,3.52,3.5,3.52,3.49,3.51
std,2886.9,27469.59,5.4,15.21,14824.76,2.16,20.64,28521.15,2.02,1.05,1.01,0.63,26.31,2180.39,6542.6,1.03,1.03,1.03,1.04,1.03,1.03,1.02,1.04
min,1.0,610.0,17.97,-174.21,0.0,0.0,18.0,154.08,9.81,1.0,0.0,0.0,1.0,1938.31,3125.7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,2500.75,27592.0,35.26,-97.35,694.75,0.0,36.0,19598.78,16.63,4.0,0.0,0.0,7.9,3179.37,7986.49,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0
50%,5000.5,50207.0,39.42,-88.4,2769.0,1.0,53.0,33768.42,17.95,5.0,1.0,0.0,35.84,5213.95,11573.98,4.0,3.0,4.0,4.0,3.0,4.0,3.0,3.0
75%,7500.25,72411.75,42.04,-80.44,13945.0,3.0,71.0,54296.4,19.35,6.0,2.0,1.0,61.16,7459.7,15626.49,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
max,10000.0,99929.0,70.56,-65.29,122814.0,10.0,89.0,207249.1,26.39,9.0,7.0,5.0,71.98,9180.73,30566.07,8.0,7.0,8.0,7.0,7.0,7.0,7.0,7.0


In [6]:
# Preview the first five rows to see the raw values in every column.
df.head()


Unnamed: 0,CaseOrder,Customer_id,Interaction,UID,City,State,County,Zip,Lat,Lng,Population,Area,TimeZone,Job,Children,Age,Income,Marital,Gender,ReAdmis,VitD_levels,Doc_visits,Full_meals_eaten,vitD_supp,Soft_drink,Initial_admin,HighBlood,Stroke,Complication_risk,Overweight,Arthritis,Diabetes,Hyperlipidemia,BackPain,Anxiety,Allergic_rhinitis,Reflux_esophagitis,Asthma,Services,Initial_days,TotalCharge,Additional_charges,Item1,Item2,Item3,Item4,Item5,Item6,Item7,Item8
0,1,C412403,8cd49b13-f45a-4b47-a2bd-173ffa932c2f,3a83ddb66e2ae73798bdf1d705dc0932,Eva,AL,Morgan,35621,34.35,-86.73,2951,Suburban,America/Chicago,"Psychologist, sport and exercise",1,53,86575.93,Divorced,Male,No,19.14,6,0,0,No,Emergency Admission,Yes,No,Medium,No,Yes,Yes,No,Yes,Yes,Yes,No,Yes,Blood Work,10.59,3726.7,17939.4,3,3,2,2,4,3,3,4
1,2,Z919181,d2450b70-0337-4406-bdbb-bc1037f1734c,176354c5eef714957d486009feabf195,Marianna,FL,Jackson,32446,30.85,-85.23,11303,Urban,America/Chicago,Community development worker,3,51,46805.99,Married,Female,No,18.94,4,2,1,No,Emergency Admission,Yes,No,High,Yes,No,No,No,No,No,No,Yes,No,Intravenous,15.13,4193.19,17613.0,3,4,3,4,4,4,3,3
2,3,F995323,a2057123-abf5-4a2c-abad-8ffe33512562,e19a0fa00aeda885b8a436757e889bc9,Sioux Falls,SD,Minnehaha,57110,43.54,-96.64,17125,Suburban,America/Chicago,Chief Executive Officer,3,53,14370.14,Widowed,Female,No,18.06,4,1,0,No,Elective Admission,Yes,No,Medium,Yes,No,Yes,No,No,No,No,No,No,Blood Work,4.77,2434.23,17505.19,2,4,4,4,3,4,3,3
3,4,A879973,1dec528d-eb34-4079-adce-0d7a40e82205,cd17d7b6d152cb6f23957346d11c3f07,New Richland,MN,Waseca,56072,43.9,-93.51,2162,Suburban,America/Chicago,Early years teacher,0,78,39741.49,Married,Male,No,16.58,4,1,0,No,Elective Admission,No,Yes,Medium,No,Yes,No,No,No,No,No,Yes,Yes,Blood Work,1.71,2127.83,12993.44,3,5,5,3,4,5,5,5
4,5,C544523,5885f56b-d6da-43a3-8760-83583af94266,d2f0425877b10ed6bb381f3e2579424a,West Point,VA,King William,23181,37.6,-76.89,5287,Rural,America/New_York,Health promotion specialist,1,22,1209.56,Widowed,Female,No,17.44,5,0,2,Yes,Elective Admission,No,No,Low,No,No,No,Yes,No,No,Yes,No,No,CT Scan,1.25,2113.07,3716.53,2,1,3,3,5,3,4,3


# Part I: Research Question 
---

## A.  Describe the purpose of this data mining report by doing the following:



### 1. Propose one question relevant to a real-world organizational situation that you will answer using one of the following classification methods: k-nearest neighbor (KNN) or Naive Bayes.
*The submission proposes 1 question that is relevant to a real-world organizational situation, and the proposal includes 1 of the given classification methods.*

### 2. Define one goal of the data analysis. Ensure that your goal is reasonable within the scope of the scenario and is represented in the available data.
*The submission defines 1 reasonable goal for data analysis that is within the scope of the scenario and is represented in the available data.*

# Part II: Method Justification

---

## B.  Explain the reasons for your chosen classification method from part A1 by doing the following:



### 1.  Explain how the classification method you chose analyzes the selected data set. Include expected outcomes.
*The submission logically explains how the chosen classification method analyzes the selected data set and includes accurate expected outcomes.*

### 2.  Summarize one assumption of the chosen classification method.
*The submission adequately summarizes 1 assumption of the chosen classification method.*

### 3.  List the packages or libraries you have chosen for Python or R, and justify how each item on the list supports the analysis.
*The submission lists the packages or libraries chosen for Python or R and justifies how each item on the list supports the analysis.*

# Part III: Data Preparation


---

## C.  Perform data preparation for the chosen data set by doing the following:

### 1.  Describe one data preprocessing goal relevant to the classification method from part A1.
*The submission describes 1 data preprocessing goal that is relevant to the classification method from part A1.*


### 2.  Identify the initial data set variables that you will use to perform the analysis for the classification question from part A1, and classify each variable as continuous or categorical.
*The submission identifies the data set variables used to perform the analysis for the classification question from part A1, and the submission accurately classifies each variable as continuous or categorical.*


### 3.  Explain each of the steps used to prepare the data for the analysis. Identify the code segment for each step.
*The submission accurately explains each step used to prepare the data for analysis, and the submission identifies an accurate code segment for each step.*


### 4.  Provide a copy of the cleaned data set.
*The submission includes an accurate copy of the cleaned data set.*

# Part IV: Analysis


---

## D.  Perform the data analysis and report on the results by doing the following:



### 1.  Split the data into training and test data sets and provide the file(s).
*The submission provides reasonably proportioned training and test data sets.*

### 2.  Describe the analysis technique you used to appropriately analyze the data. Include screenshots of the intermediate calculations you performed.
*The submission accurately describes the analysis technique used to appropriately analyze the data, and the submission includes accurate screenshots of the intermediate calculations performed.*


### 3.  Provide the code used to perform the classification analysis from part D2.
*The submission provides the code used to perform the classification analysis from part D2 and the code executes without errors.*


# Part V: Data Summary and Implications
---

## E.  Summarize your data analysis by doing the following:


### 1.  Explain the accuracy and the area under the curve (AUC) of your classification model.
*The submission logically explains both the accuracy and the AUC of the classification model.*

### 2.  Discuss the results and implications of your classification analysis.
*The submission adequately discusses both the results and implications of the classification analysis.*

### 3.  Discuss one limitation of your data analysis.

*The submission logically discusses 1 limitation of the data analysis with adequate detail.*

### 4.  Recommend a course of action for the real-world organizational situation from part A1 based on your results and implications discussed in part E2.
*The submission recommends a reasonable course of action for the real-world organizational situation from part A1 based on the results and implications discussed in part E2.*

# Part VI: Demonstration
---

## F.  Provide a Panopto video recording that includes a demonstration of the functionality of the code used for the analysis and a summary of the programming environment.
*The submission provides a Panopto video recording that includes a demonstration of the functionality of the code used for the analysis and a summary of the programming environment.*

## G.  Web Sources

## H. References