In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the Kaggle input directory
# and print each one as it is discovered.
input_paths = (
    os.path.join(root, fname)
    for root, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)



<div style='background-color: #956789; border: 2px solid '>
    <font size="+2" color="white" ><b>   ✨Complete Machine Learning Project  </b></font>
</div>

<div style='background-color: #956789; border: 2px solid '>
    <font size="+2" color="white" ><b>   📜 Data Description  </b></font>
</div>


| Attribute           | Description                                                  |
|---------------------|--------------------------------------------------------------|
| **id**              | Unique identifier                                           |
| **gender**          | "Male", "Female" or "Other"                                 |
| **age**             | Age of the patient                                          |
| **hypertension**    | 0 if the patient doesn't have hypertension, 1 if they do    |
| **heart_disease**   | 0 if the patient doesn't have any heart diseases, 1 if they do |
| **ever_married**    | "No" or "Yes"                                               |
| **work_type**       | "Children", "Govt_job", "Never_worked", "Private" or "Self-employed" |
| **Residence_type**  | "Rural" or "Urban"                                          |
| **avg_glucose_level** | Average glucose level in blood                               |
| **bmi**             | Body mass index                                             |
| **smoking_status**  | "Formerly smoked", "Never smoked", "Smokes" or "Unknown"     |
| **stroke**          | 1 if the patient had a stroke, 0 if not                      |


<div style='background-color: #956789; border: 2px solid '>
    <font size="+2" color="white" ><b>🤔What steps should a machine learning engineer follow for a classification problem? </b></font>
</div>

<div style="border-radius:10px; padding: 15px; background-color: #e79a73; font-size:130%; text-align:left">

<h2 align="left"><font color=#005500>Machine Learning Problem Solving Workflow:</font></h2>
1. **Problem Understanding:**
    - Understand the nature of the problem.
    - Have knowledge about the attributes of the data.

2. **Data Inspection:**
    - Determine the size of the dataset.
    - Examine sample records to understand what the data represents.

3. **Data Types and Quality Check:**
    - Verify the correctness of data types.
    - Check for any missing values in the dataset.
    - Identify and handle duplicate entries.

4. **Exploratory Data Analysis (EDA):**
    - Inspect data to ensure the correctness of types and distributions.
    - Examine occurrence frequencies for each column.

5. **Correlation Analysis:**
    - Evaluate the correlation between input features and the output variable.
    - Explore correlations among input variables.

6. **Outlier Detection:**
    - Identify and handle outliers in the dataset.

7. **Feature Engineering:**
    - Identify and drop unnecessary columns based on domain knowledge or techniques.
    - Encode categorical columns using techniques like Label Encoding, Ordinal Encoding, or One-Hot Encoding.

8. **Normalization (if necessary):**
    - Normalize the data if required by the chosen machine learning model.

9. **Model Selection:**
    - Implement various machine learning models to evaluate performance.
    - Select the model that best fits the problem and dataset.

10. **Model Deployment:**
    - Deploy the selected model for future predictions.

</div>

<div style="border-radius:10px; padding: 15px; background-color: #e79a73; font-size:130%; text-align:left">

<h2 align="left"><font color=#005500> Problem Statement: Predicting Stroke Risk in Patients</font></h2>


**Objective:**
Develop a predictive model to assess the likelihood of a patient experiencing a stroke based on demographic information, health conditions, lifestyle factors, residence details, and health metrics.

**Data Attributes:**
- **Demographics:** Gender, age.
- **Health Conditions:** Hypertension, heart disease.
- **Lifestyle:** Marital status, work type.
- **Residence Details:** Rural or urban.
- **Health Metrics:** Average glucose level, BMI.
- **Smoking Habits:** Smoking status.

**Target Variable:**
The target variable is **stroke** (1 if the patient had a stroke, 0 if not).

**Significance:**
The model aims to provide healthcare professionals with insights for early intervention and personalized patient care.

---

</div>


<div style='background-color: #956789; border: 2px solid '>
    <font size="+2" color="white" ><b>   ✨Let's Get Started  </b></font>
</div>

<div style='background-color: #956789; border: 2px solid '>
    <font size="+2" color="white" ><b> 📒 Import all Libraries  </b></font>
</div>


In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
from matplotlib import colors
from matplotlib.colors import ListedColormap
from plotly.subplots import make_subplots

warnings.filterwarnings("ignore")

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

<div style='background-color: #956789; border: 2px solid '>
    <font size="+2" color="white" ><b>  🔔 Load DataSet </b></font>
</div>


In [None]:
# Read the stroke dataset CSV into a DataFrame and preview its first rows.
DATA_PATH = "/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(DATA_PATH)
df.head()

<div style='background-color: #956789; border: 2px solid '>
    <font size="+2" color="white" ><b>  🕵️‍♀️Basic Data Explorations </b></font>
</div>



In [None]:
# Display summary statistics of the freshly loaded frame (describe() with its default arguments).
df.describe()

In [None]:
# Structural overview of the frame: column dtypes, shape, missing values per
# column and the number of fully duplicated rows.
# df.info() writes its report itself and returns None, so it is called directly;
# wrapping it in print() would emit an extra "None" line after the report.
df.info()
print("__________________________________________")
print("Data Shape is",df.shape)
print("__________________________________________")
print("**Missing Values**")
print(df.isnull().sum())
print("__________________________________________")
print("There is",df.duplicated().sum(),"duplicates")



<div style="border-radius: 10px; padding: 15px; background-color: #e79a73; font-size: 130%; text-align: left">

<h2 align="left"><font color="#005500">Conclusion</font></h2>

In this dataset containing 5110 rows and 12 columns, we observed that crucial columns such as **gender**, **ever_married**, **work_type**, **Residence_type**, and **smoking_status** have appropriate data types. However, to ensure better compatibility during the model training process, we plan to encode them numerically (one-hot encoding). Additionally, the **age** column, currently in float format, will be converted to an integer for more efficient analysis.

Columns related to health indicators (**hypertension**, **heart_disease**, and **stroke**) are presently in integer format. To facilitate a more detailed analysis, we will convert these columns to object type.

On the positive side, columns **avg_glucose_level** and **BMI** already have suitable data types and do not require any changes.

In terms of missing values, only the **BMI** column contains 201 missing values. Fortunately, there are no duplicate entries in the dataset.

This data preprocessing will enhance the dataset's compatibility with machine learning models, ensuring a more effective and accurate analysis.

</div>


<div style='background-color: #956789; border: 2px solid '>
    <font size="+2" color="white" ><b>  Missing Values Imputation </b></font>
</div>


In [None]:
# Fill the missing BMI values with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on df['bmi']: that form is chained assignment through an
# intermediate Series, which pandas flags as deprecated and which does not
# update df once Copy-on-Write is enabled.
df['bmi'] = df['bmi'].fillna(df['bmi'].median())

# Confirm that no missing values remain in any column.
df.isnull().sum()

<div style='background-color: #956789; border: 2px solid '>
    <font size="+2" color="white" ><b>  Change Data types </b></font>
</div>


In [None]:
# Re-type several columns in a single pass:
#   - age: float -> int (astype(int) drops the fractional part, so any age
#     below 1 becomes 0)
#   - hypertension / heart_disease / stroke: int -> object, so they are handled
#     as categories in the exploratory plots
df = df.astype({
    'age': int,
    'hypertension': object,
    'heart_disease': object,
    'stroke': object,
})


<div style='background-color: #956789; border: 2px solid '>
    <font size="+2" color="white" ><b>📊Univariate analysis of categorical columns🤩</b></font>
</div>


In [None]:
categorical_columns = ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status', 'stroke']

# 2 x 4 grid of 'domain' cells: one donut chart per categorical column
fig = make_subplots(rows=2, cols=4, specs=[[{'type':'domain'}]*4]*2, vertical_spacing=0.1, horizontal_spacing=0.01)

# Purple-to-white gradient shared by every chart
cmap = colors.LinearSegmentedColormap.from_list("Purple", ["Purple", "white"])

for position, feature in enumerate(categorical_columns):
    counts = df[feature].value_counts()
    n_categories = len(counts)

    # One shade per category, sampled from the gradient
    norm = colors.Normalize(vmin=0, vmax=n_categories)
    color_list = [colors.rgb2hex(cmap(norm(k))) for k in range(n_categories)]

    # Translate the flat position into 1-based (row, col) grid coordinates
    grid_row, grid_col = divmod(position, 4)

    fig.add_trace(
        go.Pie(
            labels=counts.index.tolist(),
            values=counts.values.tolist(),
            hole=0.6,
            marker=dict(colors=color_list, line=dict(color='white', width=3)),
            textposition='inside',
            textinfo='percent+label',
            title=feature,  # the feature name is shown in the donut hole
            title_font=dict(size=25, color='black', family='Calibri'),
        ),
        row=grid_row + 1,
        col=grid_col + 1,
    )

# Figure-level title and size
figure_title = {
    'text': "Distribution of Categorical Variables",
    'y': 0.92,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
    'font': {'size': 28, 'color': 'black', 'family': 'Calibri'},
}
fig.update_layout(showlegend=False, height=600, width=990, title=figure_title)

fig.show()


<div style='background-color: #956789; border: 2px solid '>
    <font size="+2" color="white" ><b>📊Univariate analysis of Numeric columns🤩</b></font>
</div>


In [None]:
numerical_columns = ['age', 'avg_glucose_level', 'bmi']

# One histogram panel per numeric column, laid out side by side
fig = make_subplots(rows=1, cols=3, subplot_titles=numerical_columns)

# One shade per column, sampled from a purple-to-white gradient
cmap = colors.LinearSegmentedColormap.from_list("Purple", ["Purple", "white"])
norm = colors.Normalize(vmin=0, vmax=len(numerical_columns))
color_list = [colors.rgb2hex(cmap(norm(k))) for k in range(len(numerical_columns))]

for panel, (column, shade) in enumerate(zip(numerical_columns, color_list), start=1):
    # Every histogram uses a fixed bin width of 5
    histogram = go.Histogram(
        x=df[column],
        xbins=dict(size=5),
        name=column,
        marker=dict(color=shade),
    )
    fig.add_trace(histogram, row=1, col=panel)

# Figure-level title and size
figure_title = {
    'text': "Distribution of Numerical Variables",
    'y': 0.90,
    'x': 0.5,
    'xanchor': 'center',
    'yanchor': 'top',
    'font': {'size': 28, 'color': 'black', 'family': 'Calibri'},
}
fig.update_layout(showlegend=False, height=400, width=990, title=figure_title)

fig.show()


In [None]:
# sns.pairplot(df)

<a id="libraries"></a>
# <b><span style='color:#0bbeb3'> Correlations Understanding</span></b>

![](https://www.mathsisfun.com/data/images/correlation-examples.svg)

<div style='background-color: #956789; border: 2px solid '>
    <font size="+2" color="white" ><b>📊Categorical Columns vs Target Column</b></font>
</div>


In [None]:
x_cols = ['gender', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
y_col = 'stroke'

# Grid of stacked bar charts: one panel per categorical feature
n_rows, n_cols = 2, 4
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(15, 10))

# Two-colour map (one colour per target class); identical for every panel
cmp = ListedColormap(['#346c68', '#5bbeb7'])

for i, col in enumerate(x_cols):
    ax = axes[i // n_cols, i % n_cols]

    # Counts of the target per category, and the same table normalised per row
    cross_tab = pd.crosstab(index=df[col], columns=df[y_col])
    cross_tab_prop = cross_tab.div(cross_tab.sum(axis=1).astype(float), axis=0)

    # Stacked bars of the per-category target proportions
    cross_tab_prop.plot(kind='bar', stacked=True, width=0.8, colormap=cmp,
                        legend=False, ylabel='Proportion', ax=ax)

    # Annotate every bar segment with its raw count and percentage; the text is
    # placed near the vertical centre of the segment
    for idx, val in enumerate(cross_tab_prop.index.values):
        for proportion, count, y_location in zip(cross_tab_prop.loc[val], cross_tab.loc[val], cross_tab_prop.loc[val].cumsum()):
            ax.text(x=idx - 0.3, y=(y_location - proportion) + (proportion / 2) - 0.03,
                    s=f'    {count}\n({np.round(proportion * 100, 1)}%)',
                    color="black", fontsize=9, fontweight="bold")

    ax.legend(title=y_col.capitalize(), loc=(0.7, 0.9), fontsize=8, ncol=2)

    # Head-room above the bars so the legend does not overlap them
    ax.set_ylim([0, 1.12])

# The grid has more cells than there are features; hide the unused axes so no
# empty frame is drawn.
for unused_ax in axes.ravel()[len(x_cols):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()


<div style='background-color: #956789; border: 2px solid '>
    <font size="+2" color="white" ><b> 🕵️‍♀️ Detect Outliers</b></font>
</div>


In [None]:
numeric_variables = ['age', 'bmi', 'avg_glucose_level']

# Explicit figure handle; each box plot goes into its own slot of a 2 x 3 grid
fig = plt.figure(figsize=(12, 6))

for slot, numeric_var in enumerate(numeric_variables, start=1):
    ax = fig.add_subplot(2, 3, slot)
    sns.boxplot(y=df[numeric_var], color='purple', ax=ax)
    ax.set_title(f'Box Plot of {numeric_var}')
    ax.set_ylabel(numeric_var)

fig.tight_layout()
plt.show()


In [None]:
columns_to_check = ['bmi', 'avg_glucose_level']

# Drop rows outside the 1.5 * IQR fences, one column at a time. The frame is
# filtered inside the loop, so the fences of each later column are computed on
# the rows that survived the earlier columns.
for column in columns_to_check:
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    whisker = 1.5 * (q3 - q1)

    # True where the value lies below the lower fence or above the upper fence
    is_outlier = (values < q1 - whisker) | (values > q3 + whisker)

    df = df.loc[~is_outlier]

# Report how many rows remain
print("Shape after removing outliers:", df.shape)
numeric_variables = ['age', 'bmi', 'avg_glucose_level']

# Redraw the box plots on the filtered frame, one slot of a 2 x 3 grid each
fig = plt.figure(figsize=(12, 6))

for slot, numeric_var in enumerate(numeric_variables, start=1):
    ax = fig.add_subplot(2, 3, slot)
    sns.boxplot(y=df[numeric_var], color='purple', ax=ax)
    ax.set_title(f'Box Plot of {numeric_var}')
    ax.set_ylabel(numeric_var)

fig.tight_layout()
plt.show()


#### There is still a problem here: the minimum value for age is 0 (likely because the earlier integer cast truncated ages below 1). We will drop every row where age is 0.

In [None]:
# Re-check the summary statistics after the outlier rows have been removed.
df.describe()

In [None]:
# Keep only the rows whose age is non-zero, then report the resulting shape.
has_nonzero_age = df['age'].ne(0.000000)
df = df.loc[has_nonzero_age]
print(df.shape)

In [None]:
# Summary statistics once more, now that rows with age 0 have been dropped.
df.describe()

In [None]:
# Remove the identifier column; it is not used as a model feature.
df = df.drop("id", axis=1)

In [None]:
# Cast the three 0/1 columns (turned into object dtype earlier for plotting)
# back to integers before modelling.
binary_columns = ['hypertension', 'heart_disease', 'stroke']
df = df.astype({name: int for name in binary_columns})


<div style='background-color: #956789; border: 2px solid '>
    <font size="+2" color="white" ><b>OneHotEncoding</b></font>
</div>


In [None]:
# One-hot encode the text-valued columns; drop_first=True omits the first level
# of each column so the dummy columns are not redundant.
nominal_columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
df = pd.get_dummies(df, columns=nominal_columns, drop_first=True)
print(df.shape)


In [None]:
# Step 1 -> train/test/split
# Hold out 20% of the rows for evaluation. The split is stratified on the
# target so that the train and test sets keep the same proportion of stroke /
# no-stroke rows; an unstratified split of a binary target can leave the test
# set with very few positive cases (assumes stroke cases are the minority
# class here -- TODO confirm with df['stroke'].value_counts()).
features = df.drop(columns=['stroke'])
target = df['stroke']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
    stratify=target,
)

In [None]:
# Candidate classifiers to compare.
# SVC (sigmoid kernel) and k-nearest-neighbours operate on inner products and
# distances between feature vectors, so columns with large magnitudes would
# dominate them. Each of the two is therefore wrapped in a pipeline that
# applies MinMaxScaler first; the scaler is fitted only on the data passed to
# fit(), so no test-set information is used. The pipelines expose the same
# fit()/predict() interface as the bare estimators.
svc = make_pipeline(MinMaxScaler(), SVC(kernel='sigmoid', gamma=1.0))
knc = make_pipeline(MinMaxScaler(), KNeighborsClassifier())
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
# Ensemble models: 50 estimators each, seeded for reproducibility
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)
xgb = XGBClassifier(n_estimators=50,random_state=2)

In [None]:
# Short display name -> estimator, in the order the models are evaluated.
model_names = ['SVC', 'KN', 'NB', 'DT', 'LR', 'RF', 'AdaBoost', 'BgC', 'ETC', 'GBDT', 'xgb']
model_objects = [svc, knc, mnb, dtc, lrc, rfc, abc, bc, etc, gbdt, xgb]
clfs = dict(zip(model_names, model_objects))

In [None]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    """Fit a classifier and score it on the held-out data.

    Parameters
    ----------
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``.
        It is fitted in place.
    X_train, y_train : training features and labels passed to ``clf.fit``.
    X_test, y_test : evaluation features and the labels the predictions are
        compared against.

    Returns
    -------
    tuple
        ``(accuracy, precision)`` computed on the test data.
    """
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test,y_pred)
    # When the model predicts no positive sample at all, precision is undefined.
    # zero_division=0 makes the score an explicit 0.0 in that case instead of
    # relying on a warning that the notebook's global filter hides.
    precision = precision_score(y_test,y_pred,zero_division=0)

    return accuracy,precision

In [None]:
# Show the columns and dtypes of the training features before fitting the models.
X_train.info()

In [None]:
# Train every model in clfs, print its scores, and collect them in the same
# order as the dictionary keys.
accuracy_scores, precision_scores = [], []

for name in clfs:
    acc, prec = train_classifier(clfs[name], X_train, y_train, X_test, y_test)

    for label, value in (("For ", name), ("Accuracy - ", acc), ("Precision - ", prec)):
        print(label, value)

    accuracy_scores.append(acc)
    precision_scores.append(prec)

In [None]:
# Tabulate the collected scores per model, highest precision first.
performance_df = pd.DataFrame(
    {
        'Algorithm': list(clfs),
        'Accuracy': accuracy_scores,
        'Precision': precision_scores,
    }
).sort_values(by='Precision', ascending=False)
performance_df