In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# for data manipulation
import pandas as pd 
import numpy as np 

# for visualization 
import matplotlib.pyplot as plt 
import seaborn as sns 

# for preprocessing 
from sklearn.preprocessing import StandardScaler , LabelEncoder 
from sklearn.preprocessing import LabelEncoder , OneHotEncoder ,MinMaxScaler
from sklearn.impute import SimpleImputer

# to split data
from sklearn.model_selection import train_test_split

# for evaluation 
from sklearn.metrics import accuracy_score,precision_score,recall_score
from sklearn.metrics import confusion_matrix,classification_report ,f1_score

# models 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [None]:
# Location of the healthcare CSV inside the Kaggle input directory.
DATA_PATH='/kaggle/input/healthcare-dataset/healthcare_dataset.csv'

# Load the raw data and preview the first rows.
df=pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Print the column dtypes, non-null counts and memory usage of the frame.
df.info()

# check for NaN (missing) values

In [None]:
# Number of missing values in each column.
df.isna().sum()

# check duplicated rows

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

# Drop duplicates

In [None]:
# Remove exact duplicate rows (the first occurrence is kept).
# The index is reset so it stays a gap-free 0..n-1 range: the feature matrix is
# rebuilt with a fresh RangeIndex further down, and a gapped index here would
# leave the features and the target with different row labels.
df=df.drop_duplicates().reset_index(drop=True)

In [None]:
# Preview the first rows after duplicate removal.
df.head()

In [None]:
# Ten most frequent values of the Name column, highest count first.
name_frequencies=df['Name'].value_counts()
names_count=name_frequencies.sort_values(ascending=False).head(10)
names_count

In [None]:
# Bar chart of the most frequent names; labels are rotated so they do not overlap.
top_names=names_count.index
top_counts=names_count.values
plt.bar(x=top_names,height=top_counts)
plt.xticks(rotation=90)

plt.show()

# change type of date columns

In [None]:
# Convert both date columns to datetime so they can be subtracted.
for date_col in ('Date of Admission','Discharge Date'):
    df[date_col]=pd.to_datetime(df[date_col])

# Length of stay in whole days (discharge minus admission).
stay_length=df['Discharge Date']-df['Date of Admission']
df['duration']=stay_length.dt.days


In [None]:
# Drop the name column and the two raw date columns (the dates were already
# used to derive `duration`).
df.drop(['Name','Date of Admission','Discharge Date'],axis=1,inplace=True)

# how many patients of each blood type fall in each test-result class

In [None]:
# Row counts per blood type, with one bar per test-result class.
plt.figure(figsize=(10,8))
sns.countplot(x='Blood Type',hue='Test Results',data=df)

plt.show()

# relationship between medical condition and age

In [None]:
# Age distribution for each medical condition, one panel per condition.
conditions=df['Medical Condition'].unique()
n_cols=3
# Ceiling division so every condition gets a panel, however many there are.
n_rows=-(-len(conditions)//n_cols)

plt.figure(figsize=(15,10))
for index,i in enumerate(conditions):
    plt.subplot(n_rows,n_cols,index+1)
    # sns.distplot is deprecated; a density-scaled histplot with a KDE overlay
    # is its documented replacement.
    sns.histplot(df.loc[df['Medical Condition']==i,'Age'],kde=True,stat='density')
    plt.title(i)

plt.show()

# which hospitals receive the most patients

In [None]:
# The 15 hospitals with the most rows, with the count written on each bar.
hospital_counts=df['Hospital'].value_counts(ascending=False).head(15)
ax=hospital_counts.plot(kind='bar')
bars=ax.containers[0]
ax.bar_label(bars,fontsize=13)
plt.show()


In [None]:
# Preview the first rows of the current frame.
df.head()

# which Insurance Providers have more patients

In [None]:
# Row count per insurance provider, with the count written on each bar.
provider_counts=df['Insurance Provider'].value_counts(ascending=False)
ax=provider_counts.plot(kind='bar')
bars=ax.containers[0]
ax.bar_label(bars,fontsize=13)
plt.show()

# how many males and females in each Medical Condition 

In [None]:
# Gender counts for each medical condition, one panel per condition (2x3 grid).
plt.figure(figsize=(15,10))
for slot,condition in enumerate(df['Medical Condition'].unique(),start=1):
    plt.subplot(2,3,slot)
    condition_rows=df[df['Medical Condition']==condition]
    ax=sns.countplot(data=condition_rows,x='Gender')
    # Write the count on top of each bar.
    ax.bar_label(ax.containers[0],fontsize=13)
    plt.title(condition)

plt.show()

# How much each medical condition costs (billing amount)

In [None]:
# Billing amount aggregated per medical condition, with the value written on each bar.
ax=sns.barplot(data=df,y='Billing Amount',x='Medical Condition')
bars=ax.containers[0]
ax.bar_label(bars,fontsize=13)
plt.show()

# check outlier

In [None]:
# Boxen plot of the billing amounts to eyeball the spread and any outliers.
billing=df['Billing Amount']
plt.figure(figsize=(10,9))
sns.boxenplot(data=billing)
plt.show()

# encoding data

In [None]:
# Replace every object-dtype column with integer codes.
# A separate encoder is fitted per column and kept in `label_encoders`, so the
# codes can later be mapped back with label_encoders[col].inverse_transform(...).
label_encoders={}
for col in df.select_dtypes('object'):
    lb_encoder=LabelEncoder()
    # LabelEncoder expects a 1-D input; the Series df[col] is passed rather than
    # the one-column frame df[[col]], which triggers a DataConversionWarning.
    df[col]=lb_encoder.fit_transform(df[col])
    label_encoders[col]=lb_encoder
df.head()

In [None]:
# Pairwise correlations between the numeric columns, shown as an annotated heatmap.
correlations=df.corr(numeric_only=True)
plt.figure(figsize=(20,10))
sns.heatmap(correlations,annot=True,cmap='coolwarm')
plt.show()

In [None]:
# Remove the room number, hospital and doctor columns before modelling.
df.drop(['Room Number','Hospital','Doctor'],axis=1,inplace=True)

# split data to x,y


In [None]:
# Target is the 'Test Results' column; every other column is a feature.
target_col='Test Results'
y=df[target_col]
x=df.drop(columns=target_col)

# scaling x 

In [None]:
# Standardise every feature column with StandardScaler.
# The column names and row index of x are carried over to the scaled frame so
# that it keeps the same row labels as y and the features stay identifiable.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the scaling; fit it on the training split
# only if a leak-free evaluation is needed.
scaler=StandardScaler()
x=pd.DataFrame(scaler.fit_transform(x),columns=x.columns,index=x.index)


# split x,y to train and test

In [None]:
# Stratified 80/20 split; random_state is fixed so the same rows land in the
# train and test sets on every run.
x_train,x_test,y_train,y_test=train_test_split(
    x,y,
    test_size=0.2,
    shuffle=True,
    stratify=y,
    random_state=42,
)

# Train and compare classifiers: Decision Tree, Logistic Regression, Naive Bayes, Random Forest

In [None]:
# Candidate classifiers keyed by a short display name.
# The tree-based models are randomised, so random_state is fixed to make the
# reported accuracies reproducible between runs.
model_dic={"DT":DecisionTreeClassifier(criterion='entropy',random_state=42),
          "Logistic_R":LogisticRegression(),
          "NB":GaussianNB(),
          "Random_F":RandomForestClassifier(n_estimators=100,random_state=42)
          }

# Fit each model on the training split and print its accuracy on the test split.
for key,model in model_dic.items():
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)
    print(key,accuracy_score(y_test,y_pred))

# 