In [24]:
import requests  
import pandas as pd
import matplotlib.pyplot as plt

 # Patient Exploratory Analysis
 ## Attribute Descriptions:
 * ID (integer): Index of the dataset. Counting numbers starting at 0.  
 * Gender (string): Gender of the patient, "M" for male and "F" for female.  
 * Age (integer): Age of the patient in years  
 * Marital Status (string): Description of the patient's marital status. No coding system enforced.  
 * RIC (integer): Rehabilitation Impairment Category (RIC) of the patient, assigned according to Appendix B in the Centers for Medicare & Medicaid Services IRF-PAI training manual.  
 * Admission Total FIM Score: The admission total Functional Independence Measure (FIM) score of the patient.
    * The FIM is a clinical assessment used to measure patient functioning at inpatient rehabilitation hospitals. The FIM is measured at two distinct points in time: admission and discharge.  
    * The FIM measures the level of assistance required to perform 18 activities of daily living (ADL) tasks (e.g. eating, walking, problem-solving, etc.).  
    * The tasks are categorized as either motor (13 tasks) or cognitive (5 tasks). Each task is scored on a 7-point ordinal scale to measure independence as determined by the amount of assistance required to perform each ADL task.  
    * For more information about the FIM, see Section III in the Centers for Medicare & Medicaid Services IRF-PAI training manual.  
 * Discharge Total FIM Score: The discharge total FIM score of the patient.


In [25]:
def clean_marital_column(substring, revised_entry, df=None):
    '''Recode free-text "Marital Status" entries that contain a substring.

    Every row whose "Marital Status" value is a string containing
    ``substring`` (case-sensitive, literal match) is overwritten with
    ``revised_entry``. Missing or non-string entries are left untouched.

    Parameters
    ----------
    substring : str
        Literal text to look for inside each entry.
    revised_entry : str
        Category written to every matching row.
    df : pandas.DataFrame, optional
        Frame to clean. Defaults to the notebook-level
        ``patient_data_to_clean_df`` so existing two-argument calls keep
        working unchanged.

    Returns
    -------
    None
        The frame is modified in place.
    '''
    frame = patient_data_to_clean_df if df is None else df
    # regex=False keeps the plain substring semantics of the ``in`` operator;
    # na=False skips missing entries instead of raising on them.
    matches = frame["Marital Status"].str.contains(substring, regex=False, na=False)
    # A boolean mask does not care how the index is labelled, so this works
    # for any index, not only labels 0..n-1.
    frame.loc[matches, "Marital Status"] = revised_entry

In [26]:
def assigning_NaN(df=None):
    '''Overwrite every unrecognised "Marital Status" entry with the string "NaN".

    An entry is recognised when it is exactly one of "Divorced",
    "Never Married", "Married", "Widowed" or "Separated". Everything else,
    including missing values, is replaced. The replacement is the literal
    string "NaN", not a floating-point missing value.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to clean. Defaults to the notebook-level
        ``patient_data_to_clean_df`` so existing zero-argument calls keep
        working unchanged.

    Returns
    -------
    None
        The frame is modified in place.
    '''
    recognised = ["Divorced", "Never Married", "Married", "Widowed", "Separated"]
    frame = patient_data_to_clean_df if df is None else df
    # Boolean mask instead of a positional row loop: independent of how the
    # index is labelled.
    unrecognised = ~frame["Marital Status"].isin(recognised)
    frame.loc[unrecognised, "Marital Status"] = "NaN"
  

In [27]:
def decoding_RIC(df=None, decoder=None):
    '''Replace each RIC code with its label from the decoder dictionary.

    The "RIC" column is converted to strings, then every value that equals
    the string form of a decoder key is replaced by that key's label. Values
    with no matching key stay as their string form, so running the function
    a second time leaves already-decoded labels unchanged.

    The lookup is driven by the decoder's own keys rather than a hard-coded
    ``range(21)``, which stopped at 20 and therefore never decoded code 21,
    and which raised ``KeyError`` for a code of 0.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to decode. Defaults to the notebook-level
        ``patient_data_to_clean_df``.
    decoder : dict, optional
        Mapping from RIC code to label. Defaults to the notebook-level
        ``ric_decoder``.

    Returns
    -------
    None
        The "RIC" column of the frame is replaced in place.
    '''
    frame = patient_data_to_clean_df if df is None else df
    mapping = ric_decoder if decoder is None else decoder
    # Key the lookup by text so integer codes and digit strings decode alike.
    labels_by_text_code = {str(code): label for code, label in mapping.items()}
    codes_as_text = frame["RIC"].astype(str)
    frame["RIC"] = codes_as_text.map(
        lambda code: labels_by_text_code.get(code, code)
    )

In [28]:
def gathering_total(column_name, search_value, df=None):
    '''Count the rows whose value in ``column_name`` equals ``search_value``.

    Parameters
    ----------
    column_name : str
        Column to search.
    search_value : object
        Value compared with ``==`` against every entry of the column.
    df : pandas.DataFrame, optional
        Frame to search. Defaults to the notebook-level
        ``patient_data_to_clean_df`` so existing two-argument calls keep
        working unchanged.

    Returns
    -------
    int
        Number of matching rows (0 when nothing matches).
    '''
    frame = patient_data_to_clean_df if df is None else df
    # Vectorised comparison works for any index labelling; int() keeps the
    # return type a plain Python int rather than a NumPy integer.
    return int((frame[column_name] == search_value).sum())

In [29]:
# TODO: generating_histogram(x, y, filename) is an unimplemented placeholder; define it or delete this cell.
    

In [31]:
# Declaration block: lookup table from integer RIC code to the short label
# written into the "RIC" column by decoding_RIC().
ric_decoder = {
    1: "Stroke",
    2: "TBI",
    3: "NTBI",
    4: "TSCI",
    5: "NTSCI",
    6: "Neuro",
    7: "FracLE",
    8: "ReplLE",
    9: "Ortho",
    10: "AMPLE",
    11: "AMP-NLE",
    12: "OsteoA",
    13: "RheumA",
    14: "Cardiac",
    15: "Pulmonary",
    16: "Pain",
    17: "MMT-NBSCI",
    18: "MMT-BSCI",
    19: "GB",
    20: "Misc",
    21: "Burns",
}



In [32]:
# Load the raw patient table, using the "ID" column as the index.
patient_data_to_clean_infile = pd.read_csv("patient_data_to_clean.csv", index_col="ID")
# Work on an explicit copy: the cleaning helpers below modify
# patient_data_to_clean_df in place, and the copy keeps the raw frame as
# loaded so the two can be compared afterwards.
patient_data_to_clean_df = patient_data_to_clean_infile.copy()


# Collapse the free-text "Marital Status" entries into fixed categories.
# Each rule lists the case variants of a substring and the category written
# to every entry containing one of them. Rules run top to bottom.
#
# Order matters: never-married entries are first parked under the placeholder
# "Single", because the text "Never Married" itself contains "Mar" and would
# otherwise be caught by the "Married" rule. The placeholder is renamed to
# "Never Married" only after the "Married" rule has run.
#
# The separated label is spelled "Separated" to match the spelling that
# assigning_NaN() keeps; the earlier spelling "Seperated" caused every
# separated patient to be overwritten with "NaN".
marital_status_rules = [
    (("WID", "wid", "Wid"), "Widowed"),
    (("DIV", "Div", "div"), "Divorced"),
    (("SEP", "Sep", "sep"), "Separated"),
    (("Sin", "SIN", "sin"), "Single"),
    (("NEV", "Nev", "nev"), "Single"),  # nev -> Single
    (("mar", "Mar", "MAR"), "Married"),
    (("Sin",), "Never Married"),
]
for substrings, category in marital_status_rules:
    for substring in substrings:
        clean_marital_column(substring, category)

# Anything still outside the fixed categories becomes "NaN", then the numeric
# RIC codes are swapped for their labels.
assigning_NaN()
decoding_RIC()


# Headline counts for the cleaned data set.
patients_total = len(patient_data_to_clean_df)
females_total = gathering_total("Gender", "F")
males_total = gathering_total("Gender", "M")
married_total = gathering_total("Marital Status", "Married")
most_common_RIC = patient_data_to_clean_df["RIC"].value_counts().idxmax()
most_common_RIC_total = gathering_total("RIC", most_common_RIC)

# Ages of the patients in the most common RIC, overall and split by gender.
# The "stroke" names reflect that Stroke is the most common RIC in this data.
# Boolean selection replaces the row-by-row loop, so it does not depend on
# the index being labelled 0..n-1. The female group is selected explicitly
# with "F" so that a missing or unexpected gender value is not counted as
# female.
stroke_patients = patient_data_to_clean_df[patient_data_to_clean_df["RIC"] == most_common_RIC]
stroke_data = pd.Series(stroke_patients["Age"].tolist())
stroke_male_age = pd.Series(
    stroke_patients.loc[stroke_patients["Gender"] == "M", "Age"].tolist()
)
stroke_female_age = pd.Series(
    stroke_patients.loc[stroke_patients["Gender"] == "F", "Age"].tolist()
)

# Doing the calculations: mean and sample standard deviation of age.
stroke_age_avg = stroke_data.mean()  # 7th entry of the summary
stroke_age_std = stroke_data.std()  # 8th entry

stroke_age_male_avg = stroke_male_age.mean()  # 9th-12th entries
stroke_age_male_std = stroke_male_age.std()
stroke_age_female_avg = stroke_female_age.mean()
stroke_age_female_std = stroke_female_age.std()

# The summary Series is unlabelled; its entries appear in this order.
patient_data_stats = [
    patients_total,
    females_total,
    males_total,
    married_total,
    most_common_RIC,
    most_common_RIC_total,
    stroke_age_avg,
    stroke_age_std,
    stroke_age_male_avg,
    stroke_age_male_std,
    stroke_age_female_avg,
    stroke_age_female_std,
]
patient_data_series = pd.Series(patient_data_stats)
print(patient_data_series)


# Write the cleaned and decoded frame to disk. to_csv writes the index by
# default, so the "ID" index becomes the first column of the file.
patient_data_to_clean_df.to_csv("REVISED_PATIENT_INFO.csv")

0          4555
1          2313
2          2242
3          2261
4        Stroke
5          1169
6     71.292558
7     14.340883
8      70.07743
9     13.841474
10    72.604982
11    14.761702
dtype: object


### Series Observation Notes  
Of the 4555 total patients, a slight majority were female (2313 women versus 2242 men). About a quarter of all patients (1169) had a stroke or stroke-related issue, which was the most common RIC. The average age of the female stroke patients (about 72.6 years) was higher than the average age of the male stroke patients (about 70.1 years). One possible reading is that the women were generally healthier than the men because their strokes came at a later age, although age at stroke alone does not establish this.