In [None]:
############################################################
# Setup Code (Run this cell before doing the task below)   #
############################################################
from pandas.testing import assert_frame_equal, assert_series_equal
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from IPython.display import display
import warnings
# Silence library warnings so they do not clutter the task outputs.
# The original line only referenced the function without calling it, so it had
# no effect; the call below actually installs the "ignore" filter.
warnings.filterwarnings("ignore")

# A DataFrame with the original data, read from a semicolon-separated CSV file.
# NOTE(review): the path is absolute and machine-specific, so this cell only runs
# where that file exists — consider making the data directory configurable.
DATA_0 = pd.read_csv("/mnt/work/workbench/dipendrp/data/TilNTNU.csv", sep=";")

def get_renamed_data() -> pd.DataFrame:
    """Return a deep copy of ``DATA_0`` with its columns renamed to English.

    Only the column labels listed in the mapping below are changed; any other
    column keeps its original name. ``DATA_0`` itself is left untouched.

    Returns:
        A new DataFrame holding the same values as ``DATA_0`` under the
        English column names.
    """
    # Original column label -> English snake_case label.
    norwegian_to_english = {
        "Løpenummer": "id",
        "Kjønn": "gender",
        "Diagnosekode": "diagnosis_code",
        "Diagnosetekst": "diagnosis_text",
        "Alder_førstekontakt_dager": "age_first_contact_days",
        "Alder_sistekontakt_dager": "age_last_contact_days",
        "Alder_diagnose_dager": "age_diagnosis_days",
        "Alder_død_dager": "age_dead_days",
        "Alder_diagnose_aar": "age_diagnosis_year",
        "Alder_aar_førstekontakt": "age_first_contact_year",
        "Alder_aar_sistekontakt": "age_last_contact_year",
        "Alder_aar_død": "age_dead_year",
    }
    return DATA_0.copy(deep=True).rename(columns=norwegian_to_english)

# Stage 1: the original data with English column names.
DATA_1 = get_renamed_data()


def is_empty_str(text: str) -> bool:
    """Tell whether ``text`` is a string made up only of whitespace (or nothing).

    Args:
        text: The value to inspect. Non-string values are accepted and are
            never considered empty.

    Returns:
        True if ``text`` is a ``str`` that is empty after stripping
        whitespace, False otherwise (including for every non-string value).
    """
    return isinstance(text, str) and text.strip() == ""


def get_na_converted_data() -> pd.DataFrame:
    """Return a deep copy of ``DATA_1`` with blank strings replaced by NaN.

    A cell counts as blank when ``is_empty_str`` returns True for it, i.e. it
    is a string containing nothing but whitespace. ``DATA_1`` is not modified.

    Returns:
        A new DataFrame in which every blank-string cell is ``np.nan``.
    """
    cleaned = DATA_1.copy(deep=True)
    for column_name in DATA_1.columns:
        # Boolean mask of the rows whose value in this column is a blank string.
        blank_mask = DATA_1.loc[:, column_name].apply(is_empty_str)
        if blank_mask.any():
            cleaned.loc[blank_mask, column_name] = np.nan
    return cleaned

# Stage 2: blank (whitespace-only) strings converted to NaN.
DATA_2 = get_na_converted_data()


def get_formatted_data() -> pd.DataFrame:
    """Return a deep copy of ``DATA_2`` with explicit dtypes on its columns.

    * ``gender``: the codes ``"M"`` / ``"K"`` become ``"male"`` / ``"female"``
      and the column is stored as a pandas ``category``.
    * ``diagnosis_code`` and ``diagnosis_text``: stored with the pandas
      ``string`` dtype.
    * Every column from positional index 4 onward: cast to the nullable
      ``Int32`` dtype, going through ``Float32`` first.

    Returns:
        A new DataFrame; ``DATA_2`` is not modified.
    """
    typed = DATA_2.copy(deep=True)

    # Gender: spell out the single-letter codes, then make the column categorical.
    for code, label in (("M", "male"), ("K", "female")):
        typed["gender"] = typed["gender"].replace(code, label)
    typed["gender"] = typed["gender"].astype("category")

    # Diagnosis code and diagnosis text are free text -> nullable string dtype.
    typed = typed.astype({"diagnosis_code": "string", "diagnosis_text": "string"})

    # The columns from positional index 4 to the end all contain integers.
    for column_name in typed.columns[4:]:
        typed[column_name] = typed[column_name].astype("Float32").astype("Int32")

    return typed

# Stage 3: explicit dtypes applied to every formatted column.
DATA_3 = get_formatted_data()
# Stage 4: only the rows of DATA_3 that have no missing value in any column.
DATA_4 = DATA_3.dropna(axis=0)

# Small hand-made frame (two text columns, one integer column) used by the
# encoding tasks below.
dummpy_DATA = pd.DataFrame({
    "happiness_level": ["Happy", "Happy", "Neutral", "Unhappy", "Unhappy"],
    "occupation": ["Engineer", "Engineer", "Student", "Nurse", "Student"],
    "number_of_childrens": [2, 0, 1, 3, 4]
})

# Visible confirmation that the setup cell ran to the end.
print("Setup complete.")

<hr>
<hr>

**Note: Run the setup cell above before doing the tasks below.**

<hr>
<hr>

### Task 1 Solution **Basic Python**

Write a script to print the number of missing values in the `diagnosis_code` column of `DATA_1`.


In [None]:
# Count the missing (NaN) entries in the `diagnosis_code` column of `DATA_1`
# and print the count as a single number, as the task asks.
# Selecting the column with single brackets yields a Series, so `.sum()`
# returns a scalar instead of a one-element Series.
missing_diagnosis_codes = DATA_1["diagnosis_code"].isna().sum()
print(missing_diagnosis_codes)

### Task 2 Solution **Understanding the DataFrame and Dataset**

Question: Describe the DataFrame by performing the following tasks:
1. View the first 5 rows of the dataframe `DATA_4`
2. Find the number of rows, the number of columns, and the names of the columns in the dataframe `DATA_4`
3. Find out which columns in the dataframe `DATA_4` are categorical and which are continuous
4. Compute the mean, median and mode of the continuous column `age_last_contact_year` in the dataframe `DATA_4`
5. Compute the frequency distribution of the `gender` column in the dataframe `DATA_4`

In [None]:
# Task 2: describe the `DATA_4` dataframe.

# 1. View the first 5 rows of the dataframe `DATA_4`
#    (the previous version passed 4 and therefore showed one row too few).
display(DATA_4.head(5))

# 2. Number of rows, number of columns and the column names of `DATA_4`
n_rows, n_cols = DATA_4.shape
print(f"\n Rows:{n_rows}, column:{n_cols}, Column names:{DATA_4.columns.tolist()}")

# 3. The dtype of each column tells categorical and continuous columns apart
print(DATA_4.dtypes)

# 4. Mean, median and mode of the continuous column `age_last_contact_year`.
#    Single-bracket selection gives a Series, so mean/median are scalars;
#    the mode can have several values, hence the list.
age_last_contact = DATA_4["age_last_contact_year"]
print(f"\n Mean:{age_last_contact.mean()}, Median: {age_last_contact.median()}, Mode: {age_last_contact.mode().tolist()}")

# 5. Frequency distribution of the `gender` column
gender_counts = DATA_4["gender"].value_counts()
print(f"\n {gender_counts}")

### Task 3 Solution **Understanding the DataFrame**

Question: Find the total count of unique ICD codes from the `diagnosis_code` column in `DATA_4` dataframe.

In [None]:
# Collect the distinct ICD codes found in `diagnosis_code`, then print how many there are.
unique_icd_codes = DATA_4["diagnosis_code"].unique()
print(len(unique_icd_codes))

### Task 4 Solution **Encoding DataFrame**

Question: Apply one-hot encoding to the `dummpy_DATA` dataframe and display the result. The original `dummpy_DATA` is shown below.
Also analyze for yourself how the dataframe differs before and after one-hot encoding.

In [None]:
# TODO: Your code goes here.

class OneHotEncoding:
    """Display a DataFrame before and after one-hot encoding.

    The frame handed to the constructor is the one that gets displayed and
    encoded; it is never modified.
    """

    def __init__(self, dummpy_DATA):
        """Store the frame to work on.

        Args:
            dummpy_DATA: The DataFrame to display and one-hot encode.
        """
        self.dummpy_DATA = dummpy_DATA
        # Legacy attribute name, kept so existing code reading it keeps working.
        self.DATA_4_Gender_DiagnosisCode = dummpy_DATA

    def encode(self):
        """Return a one-hot encoded copy of the stored frame.

        Returns:
            A new DataFrame produced by ``pd.get_dummies`` with integer
            (0/1) indicator columns.
        """
        return pd.get_dummies(self.dummpy_DATA, dtype=int)

    def displayOriginal(self):
        """Display the stored frame as it was passed in."""
        # Use the instance's own frame rather than a same-named global.
        display(self.dummpy_DATA)

    def displayOnehotencoded(self):
        """Display the one-hot encoded version of the stored frame."""
        display(self.encode())
    
# Show the frame before and after one-hot encoding.
obj_OneHotEncoding = OneHotEncoding(dummpy_DATA)
print("Before One Hot Encoding:")
obj_OneHotEncoding.displayOriginal()
# Label typo fixed: "Aftre" -> "After".
print("\n After One Hot Encoding:")
obj_OneHotEncoding.displayOnehotencoded()

In [None]:
# Task 4 answer: the expected result of one-hot encoding `dummpy_DATA`.
# The numeric column comes first, followed by one 0/1 indicator column per
# category of each text column.
# The previous version listed the key 'occupation_Nurse' twice, so the Student
# indicator was missing and the Nurse values were overwritten by Student's.
original_task3_df = pd.DataFrame({'number_of_childrens':[2,0,1,3,4],
                                  'happiness_level_Happy':[1,1,0,0,0],
                                  'happiness_level_Neutral':[0,0,1,0,0],
                                  'happiness_level_Unhappy':[0,0,0,1,1],
                                  'occupation_Engineer':[1,1,0,0,0],
                                  'occupation_Nurse':[0,0,0,1,0],
                                  'occupation_Student':[0,0,1,0,1]
                                 })

original_task3_df

### Task 5 Solution **Encoding DataFrame**


Question: Apply label encoding to the `dummpy_DATA` dataframe and display the result. The original `dummpy_DATA` is shown below.
Also analyze for yourself how the dataframe differs before and after label encoding.

In [None]:
# TODO: Your code goes here.

class LabelEncoding:
    """Display a DataFrame before and after label encoding.

    The frame handed to the constructor is never modified: encoding works on a
    copy, so the cell can be re-run and other cells that use the same frame
    still see the original values.
    """

    def __init__(self, dummpy_DATA):
        """Store the frame to work on.

        Args:
            dummpy_DATA: The DataFrame to display and label encode.
        """
        self.dummpy_DATA = dummpy_DATA

    def encode(self):
        """Return a label-encoded copy of the stored frame.

        Every non-numeric column is replaced by the integer labels produced by
        scikit-learn's ``LabelEncoder``; numeric columns are left as they are.

        Returns:
            A new DataFrame with the encoded columns.
        """
        encoded = self.dummpy_DATA.copy(deep=True)
        # Loop through the categorical (non-numeric) columns only, so that
        # numeric values are not replaced by labels.
        for col in encoded.select_dtypes(exclude="number").columns:
            # A fresh encoder per column keeps each column's label mapping separate.
            encoded[col] = LabelEncoder().fit_transform(encoded[col])
        return encoded

    def displayOriginal(self):
        """Display the stored frame as it was passed in."""
        display(self.dummpy_DATA)

    def displayLabelencoded(self):
        """Display the label-encoded version of the stored frame."""
        display(self.encode())
    
# Show the frame first as it is, then after label encoding.
obj_LabelEncoding = LabelEncoding(dummpy_DATA)
obj_LabelEncoding.displayOriginal()
obj_LabelEncoding.displayLabelencoded()


### Task 6 Solution **Understanding the DataFrame**
Find the total number of rows with the `diagnosis_code` `C06` which is `Ondartet svulst i andre og uspesifiserte deler av munn (neoplasma malignum partis alterius et non specificatae oris)` in `DATA_4` DataFrame.

In [None]:
# Mark the rows whose diagnosis code begins with "C06", then print how many there are.
c06_mask = DATA_4["diagnosis_code"].str.startswith("C06")
print(len(DATA_4.loc[c06_mask]))

### Task 7 Solution **Understanding the DataFrame**
List all the unique patient `id` values with the `diagnosis_code` `F067`, which is the ICD code for the disease `Lett organisk kognitiv lidelse`, in the `DATA_4` DataFrame, and display them.

In [None]:
# Rows of `DATA_4` whose diagnosis code begins with "F067".
# (The frame was previously called `C06_df`, a leftover from the C06 task
# that did not describe its contents.)
f067_df = DATA_4[DATA_4['diagnosis_code'].str.startswith('F067')]
# Print each patient id once, even if the patient has several matching rows.
print(f067_df['id'].unique())

### Task 8 Solution **Plot and visualization**
Fill in the missing code to create a scatter plot with matplotlib showing `age_first_contact_year` vs `age_last_contact_year` for the rows of the `DATA_4` DataFrame whose ICD code (`diagnosis_code`) is `F102` or `F011`. In this plot, each point is colored according to its ICD code (i.e., `F102` is `red` and `F011` is `green`).

Tip: Read the comments in the code as well, for reference.

In [None]:
# One plotting color per ICD code of interest.
colors = {'F102': 'red', 'F011': 'green'}

# Keep only the rows diagnosed with one of the two ICD codes.
selected_df = DATA_4.loc[DATA_4["diagnosis_code"].isin(["F102", "F011"])]

# Draw one scatter series per ICD code so that each code gets its own color
# and its own legend entry.
fig, ax = plt.subplots()
for icd_code, color in colors.items():
    # df_icd holds the rows of the selection that carry the current ICD code.
    df_icd = selected_df.loc[selected_df["diagnosis_code"] == icd_code]
    ax.scatter(
        df_icd["age_first_contact_year"],
        df_icd["age_last_contact_year"],
        c=color,
        label=icd_code,
    )

# Legend and axis labels.
ax.legend()
ax.set_xlabel("Age first contact year")
ax.set_ylabel("Age last contact year")

# Display the plot.
plt.show()


### Task 9 **Plot and visualization**

Instructions for the `Task 9` question:

In the code and output below, we identify the most frequent patient `id` in `DATA_4` and then display which diseases that patient has been diagnosed with, using the `diagnosis_code` column of the dataframe.
Finally, we plot each `diagnosis_code` against `age_first_contact_days`, `age_last_contact_days`, `age_diagnosis_days` and `age_dead_days` in a line plot, to understand which diseases the patient was diagnosed with earlier and which later, as well as the patient's overall journey from `age_first_contact_days` to `age_dead_days`.

In [None]:
# Number of rows per patient id, most frequent first.
id_counts = DATA_4['id'].value_counts()
print(id_counts)

# Take the most frequent id from the counts instead of hard-coding the value
# read off the printed output, so the cell stays correct if the data changes.
most_frequent_id = id_counts.idxmax()
most_frequent_id_df = DATA_4[DATA_4['id'] == most_frequent_id]
print(f"\n Diagnosed Disease in Patient {most_frequent_id} : {most_frequent_id_df['diagnosis_code'].unique()} ")

# One line per age column, with the patient's diagnosis codes along the x axis.
ax = most_frequent_id_df.plot(kind='line', x='diagnosis_code', y=['age_first_contact_days','age_diagnosis_days','age_last_contact_days','age_dead_days'], figsize=(15,8))
ax.set_xlabel("Diagnosed Diseases")
ax.set_ylabel("Days")
ax.set_title(f"\n Plot showing patient {most_frequent_id} journey from age_first_contact_days to till age_dead_days (Also more inference can be generated from varying age_diagnosis_days)")
ax.grid()
plt.show()