# import libraries 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# keeps the plots in one place. calls image as static pngs
%matplotlib inline 
import matplotlib.pyplot as plt # side-stepping mpl backend
import matplotlib.gridspec as gridspec # subplots
import seaborn as sns

# Read data

In [None]:
test=pd.read_csv("/kaggle/input/diabetes-readmission-prediction-i43/test.csv")

In [None]:
df = pd.read_csv("/kaggle/input/diabetes-readmission-prediction-i43/train.csv",header = 0)
df

In [None]:
df.info()

In [None]:
# Print the frequency table of every column in the raw training frame.
for name, series in df.items():
    print(f"{name} :\n", series.value_counts())
    print("==============")

In [None]:
val=pd.DataFrame(df["diag_3"].value_counts())
val

In [None]:
# Keep only the diag_3 codes that occur more than 500 times.  The single
# count column is addressed by position because its label depends on the
# pandas version ("diag_3" before 2.0, "count" from 2.0 on).
val[val.iloc[:, 0] > 500]

In [None]:
train=df.drop("readmitted", axis=1)


In [None]:
train

In [None]:
test

In [None]:
# Stack train on top of test so both receive identical preprocessing.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without
# it the row labels of `train` and `test` are repeated, and any later
# label-aligned assignment can silently pair the wrong rows.
df_new = pd.concat([train, test], ignore_index=True)

In [None]:
df_new

In [None]:
# Print the frequency table of every column in the combined train+test frame.
for name, series in df_new.items():
    print(f"{name} :\n", series.value_counts())
    print("==============")

# clean data

In [None]:
# Drop the encounter_id column from the combined frame.
df_new.drop("encounter_id",axis=1,inplace=True)

In [None]:
# Drop the weight column because more than 80% of its values are null.
df_new.drop("weight",axis=1,inplace=True)

In [None]:
# Cast payer_code to a categorical dtype.  The values are taken from df_new
# itself rather than from the train-only `df`, so the test rows keep their
# own payer codes instead of receiving index-aligned values from the train
# frame.
df_new["payer_code"] = df_new["payer_code"].astype("category")

In [None]:
# Mark the '?' placeholder as missing.  The result is assigned back
# explicitly: calling replace(..., inplace=True) on a column selection acts
# on a temporary object and does not update df_new under pandas
# Copy-on-Write.  np.nan is used because the np.NaN alias no longer exists
# in NumPy 2.0.
df_new["payer_code"] = df_new["payer_code"].replace("?", np.nan)

In [None]:
df_new['payer_code']

In [None]:
df_new["payer_code"].isna().sum()

In [None]:
# drop payer_code column
df_new.drop("payer_code",axis=1,inplace=True)


In [None]:
df_new.head()

# extra features

In [None]:
# Create number_changes: for every row, the number of medication columns
# whose status indicates a change.
meds = ['metformin',
        'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
        'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
        'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
        'tolazamide', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
        'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone']

# A medication counts as changed when its status is anything other than
# 'No' or 'Steady'.  Computing the flags for all columns at once avoids
# creating and deleting one temporary '<med>del' column per medication, and
# removes the counter initialisation that used to sit inside the first loop.
df_new['number_changes'] = (~df_new[meds].isin(['No', 'Steady'])).sum(axis=1)
df_new['number_changes'].value_counts()

In [None]:
# Binarise each medication column: 'No' becomes 0, while 'Steady', 'Up' and
# 'Down' all become 1.
med_flags = {'No': 0, 'Steady': 1, 'Up': 1, 'Down': 1}
for name in meds:
    df_new[name] = df_new[name].replace(med_flags)

# number_meds is the row-wise total of those 0/1 flags.
df_new['number_meds'] = sum(df_new[name] for name in meds)
df_new['number_meds'].value_counts()

In [None]:
# drop columns that has no changes 
df_new.drop([ "examide", "citoglipton"],axis=1,inplace=True)

# label encoding

In [None]:
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

# Medication columns to be integer-encoded.
col = [
    'metformin', "repaglinide", "nateglinide", "chlorpropamide",
    "glimepiride", "acetohexamide", "glipizide", "glyburide",
    "tolbutamide", "pioglitazone", "rosiglitazone", "acarbose",
    "miglitol", "troglitazone", "tolazamide", "insulin",
    "glyburide-metformin", "glipizide-metformin",
    "glimepiride-pioglitazone", "metformin-rosiglitazone",
    "metformin-pioglitazone",
]

# Each column is encoded independently: the encoder is fitted on that
# column's own values and the column is replaced by the resulting codes.
for name in col:
    df_new[name] = LabelEncoder().fit_transform(df_new[name])



In [None]:
df_new.info()

In [None]:
df_new

In [None]:
# Turn three two-valued columns into 0/1 indicators: 1 when the cell equals
# the listed value, 0 for anything else.
for column, positive in (('change', 'Ch'), ('gender', 'Female'), ('diabetesMed', 'Yes')):
    df_new[column] = (df_new[column] == positive).astype('int64')


In [None]:
df_new.info()

In [None]:
#change type of age column
df_new["age"].value_counts()

In [None]:
# Replace every 10-year age bracket with its midpoint ("[0-10)" -> 5, ...,
# "[90-100)" -> 95) so that age becomes a numeric feature.
age_midpoints = {f"[{low}-{low + 10})": low + 5 for low in range(0, 100, 10)}

# One mapping + explicit assignment instead of ten chained
# replace(..., inplace=True) calls: those act on a temporary column object
# and leave df_new untouched under pandas Copy-on-Write.  infer_objects()
# converts the column to an integer dtype once only numbers remain, which
# newer pandas versions no longer do implicitly inside replace().
df_new["age"] = df_new["age"].replace(age_midpoints).infer_objects()

In [None]:
df_new.info()

Fill missing values in the "race", "diag_1", "diag_2", "diag_3", and "medical_specialty" columns

In [None]:
df_new["race"].unique()

In [None]:
df_new = df_new.replace('?',np.nan)

In [None]:
df_new["race"]=df_new["race"].fillna(df_new["race"].mode()[0])

In [None]:
df_new["race"].isna().sum()

In [None]:
df_new["medical_specialty"].isna().sum()

In [None]:
df_new["medical_specialty"].value_counts()

In [None]:
df_new["medical_specialty"]=df_new["medical_specialty"].fillna("unknown")

In [None]:
df_new["medical_specialty"].value_counts()

In [None]:
# Specialties that keep their own label in med_spec.
top_10 = [
    'unknown', 'InternalMedicine', 'Emergency/Trauma',
    'Family/GeneralPractice', 'Cardiology', 'Surgery-General',
    'Nephrology', 'Orthopedics',
    'Orthopedics-Reconstructive', 'Radiologist',
]

# med_spec copies medical_specialty for the listed values and collapses
# every other value into the single bucket 'Other'.
keep_specialty = df_new['medical_specialty'].isin(top_10)
df_new['med_spec'] = df_new['medical_specialty'].where(keep_specialty, 'Other')

In [None]:
df_new['medical_specialty'].unique()

In [None]:
df_new.drop("medical_specialty",axis=1,inplace=True)

In [None]:
# Disabled experiment: the whole cell is a single string literal, so none of
# the statements below are executed.  They would have filled missing
# diagnosis codes from the neighbouring diag_* columns and then with 0.
'''
df_new["diag_3"].fillna(df_new["diag_2"],inplace=True)
df_new["diag_2"].fillna(df_new["diag_1"],inplace=True)
df_new["diag_1"].fillna(df_new["diag_2"],inplace=True)

df_new["diag_3"].fillna(df_new["diag_2"],inplace=True)
df_new["diag_2"].fillna(df_new["diag_1"],inplace=True)
df_new["diag_1"].fillna(df_new["diag_2"],inplace=True)

df_new["diag_3"].fillna(0,inplace=True)
df_new["diag_2"].fillna(0,inplace=True)
df_new["diag_1"].fillna(0,inplace=True)
'''


In [None]:
# Disabled experiment: the whole cell is a single string literal, so
# diag_code is never defined and nothing below runs.  The text maps a
# diagnosis code to a numeric group (0 for codes containing 'V' or 'E' and
# for values outside the listed ranges) and would replace diag_1..3 with
# diag_code1..3.
# NOTE(review): the 580-629 range returns 8, the same group as 460-519,
# while the surrounding ranges use consecutive numbers -- looks like a typo;
# confirm before re-enabling.
'''
def diag_code (x) :
    if 'V' in str(x) or 'E' in str(x):
        return 0
    x = float(x) 
    if (x >= 1) & (x <= 139) :
        return 1
    elif (x >= 140) & (x <= 239):
        return 2
    elif (x >= 240) & (x <= 279) :
        return 3
    elif (x >= 280) & (x <= 289):
        return 4
    elif (x >= 290) & (x <= 319):
        return 5
    elif (x >= 320) & (x <= 389):
        return 6
    elif (x >= 390) & (x <= 459) :
        return 7
    elif (x >= 460) & (x <= 519):
        return 8
    elif (x >= 520) & (x <= 579):
        return 9
    elif (x >= 580) & (x <= 629):
        return 8
    elif (x >= 630) & (x <= 679):
        return 10
    elif (x >= 680) & (x <= 709):
        return 11
    elif (x >= 710) & (x <= 739):
        return 12
    elif (x >= 740) & (x <= 759):
        return 13
    elif (x >= 760) & (x <= 779):
        return 14
    elif (x >= 780) & (x <= 799):
        return 15
    elif (x >= 800) & (x <= 999):
        return 16
    else:
        return 0
        

df_new['diag_code1'] = df_new['diag_1'].apply(lambda x: diag_code(x))
df_new['diag_code2'] = df_new['diag_2'].apply(lambda x: diag_code(x))
df_new['diag_code3'] = df_new['diag_3'].apply(lambda x: diag_code(x))
df_new.drop([ 'diag_1','diag_2','diag_3'],axis=1,inplace=True)

df_new[['diag_code1','diag_code2','diag_code3']]
        

     
'''

In [None]:
# Mark '?' placeholders in the three diagnosis columns as missing.
# The result is assigned back explicitly (a chained replace(..., inplace=True)
# acts on a temporary object under pandas Copy-on-Write), and np.nan is used
# because the np.NaN alias no longer exists in NumPy 2.0.
diag_cols = ["diag_1", "diag_2", "diag_3"]
df_new[diag_cols] = df_new[diag_cols].replace("?", np.nan)

In [None]:
df_new["diag_1"].isna().sum()

In [None]:
df_new["diag_2"].isna().sum()

In [None]:
# Impute missing diagnosis codes with the most frequent code of the same
# column.  The filled column is assigned back explicitly; a chained
# fillna(..., inplace=True) acts on a temporary object and leaves df_new
# unchanged under pandas Copy-on-Write.
for diag in ("diag_3", "diag_2", "diag_1"):
    df_new[diag] = df_new[diag].fillna(df_new[diag].mode()[0])

In [None]:
# diag_1 codes that keep their own label in dig_1.
top = [
    "428", "414", "786", "410", "486", "427", "491", "715",
    "682", "780", "434", "996", "276", "38", "250.8", "599",
    "584", "V57", "250.6", "518", "820", "493", "577", "435",
    "574", "562", "560", "296", "250.7", "440", "998", "250.13",
]

# dig_1 copies diag_1 for the listed codes; every other value becomes 'Other'.
df_new['dig_1'] = df_new['diag_1'].where(df_new['diag_1'].isin(top), 'Other')

In [None]:
# diag_2 codes that keep their own label in dig_2.
top_2 = [
    "428", "276", "250", "427", "401", "599", "496", "403",
    "414", "411", "250.02", "707", "585", "584", "491", "250.01",
    "285", "780", "425", "682", "486", "518", "424", "413",
    "493", "250.6", "305",
]

# dig_2 copies diag_2 for the listed codes; every other value becomes 'Other'.
df_new['dig_2'] = df_new['diag_2'].where(df_new['diag_2'].isin(top_2), 'Other')

In [None]:
# diag_3 codes that keep their own label in dig_3.
top_3 = [
    "250", "401", "276", "428", "427", "414", "496", "403",
    "585", "272", "599", "250.02", "V45", "707", "780", "285",
    "425", "250.6", "424", "584", "305", "250.01", "682", "518",
    "41",
]

# dig_3 copies diag_3 for the listed codes; every other value becomes 'Other'.
df_new['dig_3'] = df_new['diag_3'].where(df_new['diag_3'].isin(top_3), 'Other')

In [None]:
# The raw diagnosis columns are superseded by dig_1 / dig_2 / dig_3.
df_new.drop(columns=["diag_3", "diag_2", "diag_1"], inplace=True)




In [None]:
df_new['max_glu_serum'].value_counts()

In [None]:
# Encode the glucose serum result as an integer code.
glu_codes = {"None": 0, "Norm": 1, ">200": 2, ">300": 3}

# - The result is assigned back explicitly; a chained replace(..., inplace=True)
#   acts on a temporary object under pandas Copy-on-Write.
# - fillna(0): pandas >= 2.0 treats the literal string "None" as a missing
#   value when reading a CSV, so those rows arrive here as NaN rather than
#   "None"; they get the same code 0.
# - astype("int64") makes the numeric dtype explicit and fails loudly if a
#   label outside glu_codes is still present.
df_new["max_glu_serum"] = (
    df_new["max_glu_serum"].replace(glu_codes).fillna(0).astype("int64")
)




In [None]:
df_new['A1Cresult'].value_counts()

In [None]:
# Encode the A1C result as an integer code.
# NOTE(review): "Norm" is coded 2 and ">7" is coded 1, so the codes are not
# ordered by severity -- kept as in the original; confirm this is intended.
a1c_codes = {"None": 0, "Norm": 2, ">7": 1, ">8": 3}

# - The result is assigned back explicitly; a chained replace(..., inplace=True)
#   acts on a temporary object under pandas Copy-on-Write.
# - fillna(0): pandas >= 2.0 treats the literal string "None" as a missing
#   value when reading a CSV, so those rows arrive here as NaN rather than
#   "None"; they get the same code 0.
# - astype("int64") makes the numeric dtype explicit and fails loudly if a
#   label outside a1c_codes is still present.
df_new["A1Cresult"] = (
    df_new["A1Cresult"].replace(a1c_codes).fillna(0).astype("int64")
)

In [None]:
df_new.info()

In [None]:
# These ID columns hold integers but are stored as strings here, so that
# they are treated as object (categorical) columns rather than numbers.
cols_cat_num = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id']

for id_col in cols_cat_num:
    df_new[id_col] = df_new[id_col].astype('str')

# more extra features
1- count the number of times each patient visited the hospital
2- calculate the total number of visits for each patient ("inpatient_no", "outpatient_no", and "emergency_no")
3- count the number of days each patient spent in the hospital
4- calculate the number of diagnoses for each patient
5- calculate the number of procedures for each patient

In [None]:
# Number of rows each patient_nbr has in the combined frame.
rows_per_patient = df_new['patient_nbr'].value_counts()
df_new['Count_Column'] = df_new['patient_nbr'].map(rows_per_patient)

# Row-level total of the three visit counters.
df_new['num_encounters'] = (
    df_new['number_outpatient']
    + df_new['number_emergency']
    + df_new['number_inpatient']
)

# Per-patient sums: every row receives the total of the source column over
# all rows that share its patient_nbr.
patient_sums = [
    ('is_diabetic', 'change'),
    ('count_number_meds', 'number_meds'),
    ('count_number_changes', 'number_changes'),
    ('count_number_outpatient', 'number_outpatient'),
    ('count_number_inpatient', 'number_inpatient'),
    ('count_number_emergency', 'number_emergency'),
]
for new_col, source_col in patient_sums:
    df_new[new_col] = df_new.groupby('patient_nbr')[source_col].transform("sum")

# Per-patient total of the three summed visit counters.
df_new['total_num_encounters'] = (
    df_new['count_number_outpatient']
    + df_new['count_number_emergency']
    + df_new['count_number_inpatient']
)

# Per-patient sums of the stay / procedure / diagnosis counters.
stay_sums = [
    ('test_1', 'time_in_hospital'),
    ('test_2', 'num_lab_procedures'),
    ('test_3', 'num_procedures'),
    ('test_4', 'number_diagnoses'),
]
for new_col, source_col in stay_sums:
    df_new[new_col] = df_new.groupby('patient_nbr')[source_col].transform("sum")

In [None]:
df_new.reset_index(drop=True, inplace=True)

# one hot encoding

In [None]:
categorical_cols_features = list(df_new.select_dtypes(include="object").columns)
categorical_cols_features

In [None]:
df_new.dtypes

In [None]:
one_hot_encoded_data = pd.get_dummies(df_new, columns = categorical_cols_features)
one_hot_encoded_data


In [None]:
df_new['Count_Column'].max()

In [None]:
df_new

In [None]:
df_new.info()

# split data

In [None]:
# `train` was placed first when the combined frame was built, so its rows
# are the first len(train) rows of the encoded frame.  Slicing by that
# length keeps the split correct if the input file changes, instead of
# relying on a hard-coded row count.
x_train = one_hot_encoded_data.iloc[:len(train), :]

In [None]:
# Everything after the first len(train) rows of the encoded frame belongs to
# the test set (`test` was concatenated after `train`).  Using len(train)
# avoids a hard-coded row count.
x_test = one_hot_encoded_data.iloc[len(train):, :]

In [None]:
x_test

In [None]:
# Target as a one-column DataFrame.  The column is selected by name instead
# of by position (so it no longer depends on "readmitted" being the last
# column), and .copy() detaches it from `df` so that later edits to `y` are
# neither lost nor written through to the raw frame.
y = df[["readmitted"]].copy()

In [None]:
y

In [None]:
# Encode the target labels as integers: "NO" -> 0, "<30" -> 1, ">30" -> 2.
# A new frame is built with assign() instead of calling
# replace(..., inplace=True) on a column selection, which acts on a
# temporary object under pandas Copy-on-Write and would leave the labels as
# strings.  astype("int64") fails loudly if an unexpected label is present.
readmitted_codes = {"NO": 0, ">30": 2, "<30": 1}
y = y.assign(readmitted=y["readmitted"].replace(readmitted_codes).astype("int64"))


In [None]:
x_train

In [None]:
y

In [None]:
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE



In [None]:
# NOTE(review): `random_state` comes from a private pandas module and is not
# referenced anywhere in the visible notebook (the split below only uses the
# keyword argument of the same name) -- presumably an accidental auto-import;
# confirm and delete.
from pandas.core.common import random_state
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    x_train,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
#X_train

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler



In [None]:

from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)


In [None]:
X_train

In [None]:
from sklearn.metrics import f1_score

# train model

In [None]:

from xgboost import XGBClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score

# Hold-out evaluation: fit on the training split, score the validation split
# with micro-averaged F1.
XGB = XGBClassifier().fit(X_train, y_train)
y_pred = XGB.predict(X_test)
f1 = f1_score(y_test, y_pred, average='micro')

# 9-fold cross-validation on the training split, same metric.
k_folds = KFold(n_splits=9)
scores = cross_val_score(XGB, X_train, y_train, cv=k_folds, scoring="f1_micro")

print("xgb acc = ", f1)

print("Cross Validation Scores: ", scores)
print("Average CV Score: ", scores.mean())
print("Number of CV Scores used in Average: ", len(scores))


In [None]:
x_train = sc.transform(x_train)

In [None]:
XGB=XGBClassifier().fit(x_train, y)

In [None]:
y.shape

# prediction

In [None]:
x_test = sc.transform(x_test)

In [None]:
y_submit_tree = XGB.predict(x_test)

In [None]:

test["readmitted"]=y_submit_tree

In [None]:
# Translate the predicted class codes back to the original labels
# (0 -> "NO", 1 -> "<30", 2 -> ">30").  The result is assigned back
# explicitly; a chained replace(..., inplace=True) acts on a temporary
# object under pandas Copy-on-Write and would leave the integer codes in the
# submission.
readmitted_labels = {0: "NO", 2: ">30", 1: "<30"}
test["readmitted"] = test["readmitted"].replace(readmitted_labels)

In [None]:
test["readmitted"].value_counts()

In [None]:
sub=test[["encounter_id","readmitted"]]

In [None]:
sub.to_csv("/kaggle/working/submission.csv", index=False)