## 1. Load the dataset and display the first 5 rows.

In [None]:
import pandas as pd

# Read the CSV into the working DataFrame and preview its first five rows.
dataset_path = 'health_dataset.csv'
df = pd.read_csv(dataset_path)
df.head(5)

## 2. Display the summary statistics of the dataset.

In [None]:
# Display descriptive statistics for the DataFrame (describe() with its defaults).
df.describe()

## 3. Check for missing values in the dataset.

In [None]:
# Count the missing entries in every column (isna is the same check as isnull).
df.isna().sum()

## 4. Drop rows with any missing values.

In [None]:
# Remove, in place, every row that contains at least one missing value.
# NOTE(review): once this has run no NaN is left in df, so the fill steps
# further down have nothing to fill unless this cell is skipped.
df.dropna(axis=0, how='any', inplace=True)
df.head(5)

## 5. Fill missing values with the mean for numerical columns.

In [None]:
# Impute missing numeric values with the mean of their column.
# numeric_only=True keeps text/categorical columns out of the mean; without it
# pandas 2.x raises a TypeError as soon as the frame holds a non-numeric column.
# Columns that are absent from the resulting Series are simply left unfilled.
df.fillna(df.mean(numeric_only=True), inplace=True)
df.head()

## 6. Fill missing values with the mode for categorical columns.

In [None]:
# Impute missing values in the non-numeric (categorical/text) columns with the
# most frequent value of each column. Numeric columns are deliberately left
# alone so they are not overwritten with their mode.
categorical_cols = df.select_dtypes(exclude='number').columns
column_modes = df[categorical_cols].mode()
# mode() returns an empty frame when there are no rows or no such columns;
# taking .iloc[0] unconditionally would raise an IndexError in that case.
if not column_modes.empty:
    df.fillna(column_modes.iloc[0], inplace=True)
df.head()

## 7. Create a new column 'Age_Group' based on 'Age' using bins.

In [None]:
# Bucket patients into named age bands.
# The last edge is open-ended so ages above 100 still fall into 'Elder', and
# include_lowest=True keeps an age of exactly 0 inside the first band; with the
# default right-closed intervals both cases would otherwise become NaN.
bins = [0, 18, 35, 50, 65, float('inf')]
labels = ['Child', 'Young_Adult', 'Adult', 'Senior_Adult', 'Elder']
df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels, include_lowest=True)
df.head()

## 8. Calculate the correlation matrix for numerical columns.

In [None]:
# Correlation matrix over the numeric (and boolean) columns only.
# Calling corr() on the full frame raises on pandas 2.x because text columns
# can no longer be silently dropped; selecting the columns explicitly works on
# every pandas version.
df.select_dtypes(include=['number', 'bool']).corr()

## 9. Plot a histogram for the 'Blood_Pressure' column.

In [None]:
# Histogram of the Blood_Pressure column, split into 20 bins.
df['Blood_Pressure'].hist(bins=20)

## 10. Plot a bar chart for the 'Diagnosis' column.

In [None]:
# Count how often each diagnosis occurs, then draw the counts as a bar chart.
diagnosis_counts = df['Diagnosis'].value_counts()
diagnosis_counts.plot.bar()

## 11. Plot a scatter plot between 'Blood_Pressure' and 'Cholesterol_Level'.

In [None]:
# Scatter plot with Blood_Pressure on the x-axis and Cholesterol_Level on the y-axis.
df.plot(kind='scatter', x='Blood_Pressure', y='Cholesterol_Level')

## 12. Encode the 'Treatment' column using one-hot encoding.

In [None]:
# Replace the Treatment column with one indicator column per treatment value.
encoded_columns = ['Treatment']
df = pd.get_dummies(data=df, columns=encoded_columns)
df.head(5)

## 13. Normalize the 'Cholesterol_Level' column using Min-Max scaling.

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Min-Max scale Cholesterol_Level and write the result back over the column.
# NOTE(review): this overwrites the raw readings, so the later threshold
# checks (> 200 in step 17, 150/250 in step 32) run on rescaled values --
# consider storing the scaled values in a separate column instead.
scaler = MinMaxScaler()
cholesterol_frame = df[['Cholesterol_Level']]
df['Cholesterol_Level'] = scaler.fit_transform(cholesterol_frame)
df.head(5)

## 14. Standardize the 'Blood_Pressure' column.

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise Blood_Pressure with StandardScaler and overwrite the column.
# NOTE(review): every later step that reads Blood_Pressure (group means,
# z-scores, the health score) therefore sees standardised values, not the
# raw readings.
scaler = StandardScaler()
blood_pressure_frame = df[['Blood_Pressure']]
df['Blood_Pressure'] = scaler.fit_transform(blood_pressure_frame)
df.head(5)

## 15. Create a pivot table showing the average 'Heart_Rate' for each 'Diagnosis'.

In [None]:
# Mean Heart_Rate per Diagnosis, laid out as a pivot table.
pd.pivot_table(df, values='Heart_Rate', index='Diagnosis', aggfunc='mean')

## 16. Group the data by 'Medication' and calculate the mean 'Blood_Pressure'.

In [None]:
# Group the Blood_Pressure values by Medication, then average each group.
bp_by_medication = df.groupby('Medication')['Blood_Pressure']
bp_by_medication.mean()

## 17. Filter the dataset for patients with 'Cholesterol_Level' greater than 200.

In [None]:
# Keep only the rows whose Cholesterol_Level exceeds 200.
# NOTE(review): Cholesterol_Level was Min-Max scaled in step 13, so confirm
# that 200 is still a reachable threshold at this point.
high_cholesterol_mask = df['Cholesterol_Level'] > 200
df_filtered = df[high_cholesterol_mask]
df_filtered.head(5)

## 18. Sort the dataset by 'Visit_Date' in descending order.

In [None]:
# Show the most recent visits first.
# Visit_Date comes straight from read_csv and is only converted to datetime in
# a later step, so sort on the parsed dates (key=pd.to_datetime) instead of
# comparing the raw text, which is only correct for ISO-formatted dates.
df.sort_values(by='Visit_Date', ascending=False, key=pd.to_datetime).head()

## 19. Create a new column 'Risk_Factor' based on 'Blood_Pressure' and 'Cholesterol_Level'.

In [None]:
# Risk_Factor is the element-wise sum of Blood_Pressure and Cholesterol_Level.
df['Risk_Factor'] = df['Blood_Pressure'].add(df['Cholesterol_Level'])
df.head(5)

## 20. Replace all instances of 'No' in the 'Follow_Up' column with 'Pending'.

In [None]:
# Relabel 'No' as 'Pending' in the Follow_Up column.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on df['Follow_Up'] is chained assignment, which may operate on a temporary
# copy (and always does under pandas Copy-on-Write), leaving df unchanged.
df['Follow_Up'] = df['Follow_Up'].replace('No', 'Pending')
df.head()

## 21. Rename the column 'Cholesterol_Level' to 'Cholesterol'.

In [None]:
# Rename Cholesterol_Level to Cholesterol in place.
column_renames = {'Cholesterol_Level': 'Cholesterol'}
df.rename(columns=column_renames, inplace=True)
df.head(5)

## 22. Drop the 'Follow_Up' column from the dataset.

In [None]:
# Remove the Follow_Up column from the DataFrame in place.
df.drop('Follow_Up', axis=1, inplace=True)
df.head(5)

## 23. Check for duplicate rows in the dataset.

In [None]:
# Flag rows that repeat an earlier row, then count the flags.
duplicate_flags = df.duplicated()
duplicate_flags.sum()

## 24. Drop any duplicate rows found.

In [None]:
# Drop repeated rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep='first', inplace=True)
df.head(5)

## 25. Create a new DataFrame containing only 'Patient_ID', 'Age', and 'Diagnosis'.

In [None]:
# Build a narrower DataFrame holding only the identifier, age and diagnosis.
subset_columns = ['Patient_ID', 'Age', 'Diagnosis']
df_subset = df[subset_columns]
df_subset.head(5)

## 26. Merge the new DataFrame with the original dataset on 'Patient_ID'.

In [None]:
# Inner-join the subset back onto the full frame by Patient_ID.
# NOTE(review): Age and Diagnosis exist on both sides, so the result carries
# them twice with the _x/_y suffixes spelled out below.
df_merged = pd.merge(
    left=df_subset,
    right=df,
    how='inner',
    on='Patient_ID',
    suffixes=('_x', '_y'),
)
df_merged.head(5)

## 27. Create a cross-tabulation of 'Diagnosis' and 'Medication'.

In [None]:
# Frequency table: one row per Diagnosis, one column per Medication.
pd.crosstab(index=df['Diagnosis'], columns=df['Medication'])

## 28. Create a pivot table showing the count of patients by 'Diagnosis' and 'Hospital_Name'.

In [None]:
# Number of rows for every Diagnosis / Hospital_Name pair; empty pairs show 0.
pd.pivot_table(
    df,
    index='Diagnosis',
    columns='Hospital_Name',
    aggfunc='size',
    fill_value=0,
)

## 29. Replace missing values in 'Heart_Rate' with the median.

In [None]:
# Impute missing Heart_Rate values with the column median.
# The filled Series is assigned back to the column: fillna(..., inplace=True)
# on df['Heart_Rate'] is chained assignment, which may act on a temporary copy
# (and always does under pandas Copy-on-Write), leaving df unchanged.
df['Heart_Rate'] = df['Heart_Rate'].fillna(df['Heart_Rate'].median())
df.head()

## 30. Calculate the Z-score for the 'Blood_Pressure' column.

In [None]:
from scipy.stats import zscore
# Store the z-score of every Blood_Pressure value in a new column.
blood_pressure = df['Blood_Pressure']
df['Blood_Pressure_Zscore'] = zscore(blood_pressure)
df.head(5)

## 31. Filter out outliers in the 'Blood_Pressure' column based on Z-score.

In [None]:
# Keep the rows whose Blood_Pressure z-score lies strictly between -3 and 3
# (|z| < 3); rows with a missing z-score are excluded as well.
within_three_sigma = df['Blood_Pressure_Zscore'].abs() < 3
df_no_outliers = df[within_three_sigma]
df_no_outliers.head(5)

## 32. Apply a lambda function to the 'Cholesterol' column to categorize into 'Low', 'Medium', 'High'.

In [None]:
# Label each Cholesterol value: below 150 -> 'Low', below 250 -> 'Medium',
# anything else -> 'High'.
# NOTE(review): a value that fails both comparisons (NaN included) falls
# through to 'High', and the column was rescaled in step 13 -- confirm the
# 150/250 cut-offs still apply.
df['Cholesterol_Level_Category'] = df['Cholesterol'].apply(
    lambda level: 'Low' if level < 150 else ('Medium' if level < 250 else 'High')
)
df.head(5)

## 33. Create a new column 'Overall_Health_Score' by summing 'Blood_Pressure', 'Heart_Rate', and 'Cholesterol'.

In [None]:
# Overall_Health_Score is the row-wise sum of the three measurements.
df['Overall_Health_Score'] = (
    df['Blood_Pressure']
    .add(df['Heart_Rate'])
    .add(df['Cholesterol'])
)
df.head(5)

## 34. Extract the month from 'Visit_Date' and create a new column 'Visit_Month'.

In [None]:
# Parse Visit_Date into real datetimes, then pull out the calendar month.
# pd.to_datetime replaces astype('datetime64'): casting to a unit-less
# datetime64 dtype is rejected by pandas 2.x.
df['Visit_Date'] = pd.to_datetime(df['Visit_Date'])
df['Visit_Month'] = df['Visit_Date'].dt.month
df.head()

## 35. Convert the 'Diagnosis' column to a categorical data type.

In [None]:
# Convert Diagnosis to the pandas 'category' dtype and list the column dtypes.
diagnosis_as_category = df['Diagnosis'].astype('category')
df['Diagnosis'] = diagnosis_as_category
df.dtypes

## 36. Create a box plot for 'Cholesterol' (renamed from 'Cholesterol_Level' in step 21) across different 'Diagnosis'.

In [None]:
# Box plot of the Cholesterol column, with one box per Diagnosis value.
df.boxplot(column='Cholesterol', by='Diagnosis')

## 37. Create a line plot showing the trend of 'Blood_Pressure' over 'Visit_Date'.

In [None]:
# Average Blood_Pressure per Visit_Date, drawn with the default Series plot.
mean_bp_by_date = df.groupby('Visit_Date')['Blood_Pressure'].mean()
mean_bp_by_date.plot()

## 38. Create a heatmap for the correlation matrix.

In [None]:
import seaborn as sns
# Annotated heatmap of the correlation matrix.
# Only numeric/boolean columns are correlated: corr() on the full frame raises
# on pandas 2.x once text, categorical or datetime columns are present.
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr(), annot=True)

## 39. Filter the dataset to include only patients with 'Diagnosis' as 'Diabetes'.

In [None]:
# Select the rows whose Diagnosis is exactly 'Diabetes'.
is_diabetes = df['Diagnosis'] == 'Diabetes'
df_diabetes = df[is_diabetes]
df_diabetes.head(5)

## 40. Replace outliers in 'Cholesterol' with the median value.

In [None]:
# Overwrite Cholesterol values above the 99th percentile with the column
# median; only the upper tail is treated as outliers here.
median_value = df['Cholesterol'].median()
upper_cutoff = df['Cholesterol'].quantile(0.99)
is_outlier = df['Cholesterol'] > upper_cutoff
df.loc[is_outlier, 'Cholesterol'] = median_value
df.head(5)

## 41. Calculate the percentage of missing values in each column.

In [None]:
# Share of missing entries per column, expressed as a percentage.
missing_fraction = df.isna().mean()
missing_percent = missing_fraction * 100
missing_percent

## 42. Reorder the columns so that 'Patient_ID' is the first column.

In [None]:
# Move Patient_ID to the front; every other column keeps its relative order.
other_columns = [name for name in df.columns if name != 'Patient_ID']
df = df[['Patient_ID', *other_columns]]
df.head(5)

## 43. Create a column 'High_Risk_Patient' which is True if 'Overall_Health_Score' > 300, else False.

In [None]:
# Boolean flag: True where Overall_Health_Score is strictly greater than 300.
# NOTE(review): the score was summed from Blood_Pressure and Cholesterol after
# they were rescaled in steps 13-14 -- confirm 300 is still a meaningful cut-off.
df['High_Risk_Patient'] = df['Overall_Health_Score'].gt(300)
df.head(5)

## 44. Split the dataset into training (80%) and testing (20%) sets based on 'Patient_ID'.

In [None]:
from sklearn.model_selection import train_test_split
# Split 80/20 on the unique Patient_ID values rather than on individual rows,
# so that all rows belonging to one patient end up on the same side of the
# split and no patient appears in both the training and the test set.
patient_ids = df['Patient_ID'].unique()
train_ids, test_ids = train_test_split(patient_ids, test_size=0.2, random_state=42)
train = df[df['Patient_ID'].isin(train_ids)]
test = df[df['Patient_ID'].isin(test_ids)]
train.shape, test.shape

## 45. Create a pipeline to preprocess the 'Blood_Pressure' and 'Cholesterol' columns.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# One-step pipeline (StandardScaler) fitted on, and applied to, the
# Blood_Pressure and Cholesterol columns; the result overwrites both columns.
# NOTE(review): the scaler is fitted on the whole of df, after the train/test
# split above -- confirm that is intended.
preprocess_columns = ['Blood_Pressure', 'Cholesterol']
pipeline = Pipeline(steps=[('scaler', StandardScaler())])
df[preprocess_columns] = pipeline.fit_transform(df[preprocess_columns])
df.head(5)

## 46. Export the cleaned dataset to a new CSV file.

In [None]:
# Write the DataFrame to cleaned_health_dataset.csv, leaving out the index column.
df.to_csv('cleaned_health_dataset.csv', index=False)

## 47. Save the dataset in Excel format with multiple sheets based on 'Diagnosis'.

In [None]:
import re

# Write one worksheet per diagnosis into a single workbook.
with pd.ExcelWriter('health_by_diagnosis.xlsx') as writer:
    # groupby hands over each diagnosis' rows in one pass instead of
    # re-filtering the whole frame per value; sort=False keeps the sheets in
    # order of first appearance and rows with a missing Diagnosis are skipped.
    for diagnosis, group in df.groupby('Diagnosis', sort=False, observed=True):
        # Excel rejects sheet names that are longer than 31 characters or that
        # contain any of [ ] : * ? / \ -- sanitise before writing.
        sheet_name = re.sub(r'[\[\]:*?/\\]', '_', str(diagnosis))[:31]
        group.to_excel(writer, sheet_name=sheet_name, index=False)

## 48. Create a summary report of the dataset including key statistics and visualizations.

In [None]:
# Descriptive statistics for every column, not just the numeric ones (include='all').
df.describe(include='all')

## 49. Write a function to automate the data cleaning process for this dataset.

In [None]:
def clean_data(df):
    """Drop incomplete and duplicate rows from ``df`` in place and return it.

    Args:
        df: DataFrame to clean. It is modified in place, so pass a copy to
            keep the original intact.

    Returns:
        The same DataFrame object, with every row that contained a missing
        value removed and repeated rows reduced to their first occurrence.
    """
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    # No mean imputation here: dropna() has already removed every missing
    # value, so a fillna would be a no-op, and computing df.mean() over text
    # columns raises a TypeError on pandas 2.x.
    return df

# Run the cleaner on a copy so the working DataFrame itself is left untouched.
working_copy = df.copy()
cleaned_df = clean_data(working_copy)
cleaned_df.head(5)

## 50. Create a new column 'Treatment_Effectiveness_Score' using a simple formula on 'Medication' and 'Overall_Health_Score'.

In [None]:
# Treatment_Effectiveness_Score = medication factor * integer health score.
# Medication is used as a grouping key elsewhere in this file, so it may hold
# text labels; multiplying text by an integer would repeat the string instead
# of producing a number. Non-numeric medications are therefore mapped to
# integer codes first (1, 2, ... in order of first appearance; a missing
# medication maps to 0). A numeric Medication column is used as it is.
medication_factor = df['Medication']
if not pd.api.types.is_numeric_dtype(medication_factor):
    medication_codes = pd.factorize(medication_factor)[0] + 1
    medication_factor = pd.Series(medication_codes, index=df.index)
df['Treatment_Effectiveness_Score'] = medication_factor * df['Overall_Health_Score'].astype(int)
df.head()