# Summary Table & Finding Codes EDA

## Postgres Connections and Functions

In [1]:
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import psycopg2
import seaborn as sns
import sqlalchemy
import yaml

import matplotlib.pyplot as plt
%matplotlib inline

ModuleNotFoundError: No module named 'yaml'

In [None]:
# Load the database connection settings from the local YAML config file.
with open("../conf/local/db.yaml") as config_file:
    conf = yaml.safe_load(config_file)

In [None]:
# Build the SQLAlchemy engine from the settings loaded into `conf`.
host = conf['host']
DB = conf['DB']
user = conf['user']
pw = conf['pw']

# URL-escape the credentials: characters such as '@', ':' or '/' inside a
# user name or password would otherwise be read as part of the URL structure.
connection_string = "postgresql://{}:{}@{}/{}".format(
    quote_plus(str(user)), quote_plus(str(pw)), host, DB
)
conn = sqlalchemy.create_engine(connection_string)

In [None]:
# Read the complete study-summary view into a DataFrame and preview it.
query = """
select * 
from DM_Spain_VIEW_study_summary;
"""

sum_table = pd.read_sql(sql=query, con=conn)
sum_table.head()

## Table Cleanup

In [None]:
# Work on a copy so the cleaning steps below leave the queried `sum_table` untouched.
sum_table_clean = sum_table.copy()

## Number of records in the database

In [None]:
# Report the table dimensions and the number of distinct study ids.
# `number_of_rows` is reused further down when computing blank-field ratios.
number_of_rows, number_of_columns = sum_table_clean.shape
unique_study_count = sum_table_clean.studyidk.nunique()

print("number of rows")
print(number_of_rows)
print("number of columns")
print(number_of_columns)
print('number of unique studies')
print(unique_study_count)

### Blank fields in Database

In [None]:
sum_table_clean.isnull().sum()

## Gender

USAL has confirmed:
- "O" stands for "Other"
- "U" stands for "Unknown"
- All blanks can be changed to "U" for unknown 

In [None]:
sum_table_clean['gender'].value_counts()

In [None]:
# Recode blank gender entries as "U" (unknown), then show the new tallies.
gender_filled = sum_table_clean['gender'].replace('', 'U')
sum_table_clean['gender'] = gender_filled
print(gender_filled.value_counts())

## Age

In [None]:
# Count the two kinds of missing age values: empty strings and NaN/None.
age = sum_table_clean['age']

# Double-quoted so the '' in the message is printed literally; with single
# quotes Python joins the adjacent literals and the quotes disappear.
print("Number of cells in the age column that are '' (blank)")
print((age == '').sum())

# NaN never compares equal to anything (itself included), so `== np.nan`
# always reports zero; isna() is the reliable missing-value test.
print('Number of cells in the age column that are np.nan')
print(age.isna().sum())


In [None]:
# Recode unknown ages (blank strings and NaN/None) to the sentinel value 1.
# Both kinds must be filled before the cast: astype(int) raises on missing values.
age_filled = sum_table_clean['age'].replace('', 1).fillna(1)
sum_table_clean['age'] = age_filled.astype(int)

In [None]:
# Box plot of the ages before outliers are removed; the artists returned by
# boxplot are kept in `results`.
age_values = sum_table_clean['age']
results = plt.boxplot(age_values)
plt.title('Distribution of Age (pre clean up)')
plt.show()

In [None]:
#get_outlier_thresholds(results) Yoni


In [None]:
# Show both tails of the age column to spot implausible values.
age = sum_table_clean['age']
oldest, youngest = age.nlargest(10), age.nsmallest(10)

print('Highest 10 Ages in dataframe')
print(oldest)
print('Lowest 10 Ages in dataframe')
print(youngest)

In [None]:
# Keep only ages strictly below 115; everything else (115 and above) is
# recoded to the sentinel value 1.
age = sum_table_clean['age']
sum_table_clean['age'] = age.where(age < 115, 1)


In [None]:
# Ages strictly between 1 and 20: the breakdown per age, then the total.
age = sum_table_clean['age']
young_ages = age[(age > 1) & (age < 20)]

print('Ages in database between 1 and 20 (age & count)')
print(young_ages.value_counts())
print('Sum of above')
print(young_ages.count())

In [None]:
# Box plot of the ages after the outlier recoding above. The title says
# "post" so this figure can be told apart from the pre-clean-up one.
results = plt.boxplot(sum_table_clean['age'])
plt.title('Distribution of Age (post clean up)')
plt.show()

In [None]:
# Summary statistics and distribution of age; unknown ages appear as 1.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases - confirm
# the installed version before upgrading.
age = sum_table_clean['age']
print(age.describe())
sns.distplot(age)
plt.title('Distribution of Age: unknown values =1')
plt.show()

### Weight

In [None]:
# Count the two kinds of missing weight values: empty strings and NaN/None.
weight = sum_table_clean['patientweight']

# Double-quoted so the '' in the message is printed literally; with single
# quotes Python joins the adjacent literals and the quotes disappear.
print("Number of cells in the weight column that are '' (blank)")
print((weight == '').sum())

# `== np.nan` is always False, so only the isna() count is meaningful.
print('Number of cells in the weight column that are np.nan')
print(weight.isna().sum())

In [None]:
# Recode missing weights (empty string or NaN) to the sentinel value 1.
weight_raw = sum_table_clean['patientweight']
sum_table_clean['patientweight'] = weight_raw.replace('', 1).replace(np.nan, 1)

In [None]:
# How many weights use a comma (decimal comma) and need converting.
# na=False: non-string cells count as "no comma".
weight_has_comma = sum_table_clean['patientweight'].str.contains(',', na=False)
print('Number of cells in the weight column that contain comma')
print(sum_table_clean['patientweight'][weight_has_comma].count())

In [None]:
# Convert decimal commas to points and cast the weights to float.
weight = sum_table_clean['patientweight']

# Only text columns have commas to convert. Skipping the .str step once the
# column is numeric lets this cell be re-run without an AttributeError.
if not pd.api.types.is_numeric_dtype(weight):
    # .str.replace yields NaN for non-string cells (e.g. the sentinel 1 set
    # earlier); those are restored by the fillna(1) below.
    weight = weight.str.replace(',', '.', regex=False)

sum_table_clean['patientweight'] = weight.astype(float).fillna(1)
print(sum_table_clean['patientweight'].dtype)

In [None]:
# Box plot of the weights before any outlier handling.
weight_values = sum_table_clean['patientweight']
plt.boxplot(weight_values)
plt.title('Distribution of Weight (pre clean up)')
plt.show()

In [None]:
print("Two clear outliers have been replaced with 1")
# Keep weights strictly below 2000; anything else becomes the sentinel 1.
weight = sum_table_clean['patientweight']
sum_table_clean['patientweight'] = weight.where(weight < 2000, 1)
plt.boxplot(sum_table_clean['patientweight'])
plt.show()

In [None]:
# Count implausible weights: above 200 kg, and below 25 kg. The sentinel
# value 1 is excluded from the lower count via the `> 1` condition.
weight = sum_table_clean['patientweight']
too_heavy = weight[weight > 200]
too_light = weight[(weight > 1) & (weight < 25)]

print('Number of studies above 200kg (to be corrected)')
print(too_heavy.count())
print('Number of studies below 25kg (to be corrected)')
print(too_light.count())

In [None]:
# Recode implausible weights to the sentinel value 1. The kept range is
# 25-200 kg inclusive, which matches the "above 200kg" / "below 25kg" counts
# reported before this step (strict bounds would also discard exactly 25 or 200).
print("Outliers above 200kg or below 25kg have been replaced with 1")
weight = sum_table_clean['patientweight']
sum_table_clean['patientweight'] = weight.where(weight.between(25, 200), 1)
plt.boxplot(sum_table_clean['patientweight'])
plt.show()

In [None]:
# Summary statistics and distribution of weight; unknown weights appear as 1.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases - confirm
# the installed version before upgrading.
weight = sum_table_clean['patientweight']
print(weight.describe())
sns.distplot(weight)
plt.title('Distribution of Weight : unknown values = 1')
plt.show()

### Height

In [None]:
# Count the two kinds of missing height values: empty strings and NaN/None.
height = sum_table_clean['patientheight']

# Double-quoted so the '' in the message is printed literally; the label
# names the height column, which is the one being inspected here.
print("Number of cells in the height column that are '' (blank)")
print((height == '').sum())

# `== np.nan` is always False, so only the isna() count is meaningful.
print('Number of cells in the height column that are np.nan')
print(height.isna().sum())

In [None]:
# Recode missing heights (empty string or NaN) to the sentinel value 1.
height_raw = sum_table_clean['patientheight']
sum_table_clean['patientheight'] = height_raw.replace('', 1).replace(np.nan, 1)

In [None]:
# How many heights use a comma (decimal comma) and need converting.
# The label names the height column, which is the one being counted.
height_has_comma = sum_table_clean['patientheight'].str.contains(',', na=False)
print('Number of cells in the height column that contain comma')
print(sum_table_clean['patientheight'][height_has_comma].count())

In [None]:
# Convert decimal commas to points and cast the heights to float.
height = sum_table_clean['patientheight']

# Only text columns have commas to convert. Skipping the .str step once the
# column is numeric lets this cell be re-run without an AttributeError.
if not pd.api.types.is_numeric_dtype(height):
    # .str.replace yields NaN for non-string cells (e.g. the sentinel 1 set
    # earlier); those are restored by the fillna(1) below.
    height = height.str.replace(',', '.', regex=False)

sum_table_clean['patientheight'] = height.astype(float).fillna(1)

In [None]:
# Box plot of the heights before any outlier handling.
height_values = sum_table_clean['patientheight']
plt.boxplot(height_values)
plt.title('Distribution of Height (pre clean up)')
plt.show()

In [None]:
# Keep heights strictly below 300; anything else becomes the sentinel 1.
# Only this upper bound is applied - no lower bound (under 100) is enforced.
height = sum_table_clean['patientheight']
sum_table_clean['patientheight'] = height.where(height < 300, 1)
plt.boxplot(sum_table_clean['patientheight'])
plt.show()

In [None]:
# Summary statistics and distribution of height; unknown heights appear as 1.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases - confirm
# the installed version before upgrading.
height = sum_table_clean['patientheight']
print(height.describe())
sns.distplot(height)
plt.title('Distribution of Height : unknown values = 1')
plt.show()

In [None]:
sum_table_clean.columns

## Study location

Unsure as to what this field intends to indicate.
98% of fields have been left blank.


In [None]:
# Distinct study locations, the share of blank entries (printed as a fraction
# of all rows), and the count per location.
location = sum_table_clean['studylocation']
blank_locations = location[location == '']

print('Number of unique locations in database')
print(location.nunique())

print('Percentage of field left blank')
print(blank_locations.count()/number_of_rows)

location.value_counts()

## Machine

There are 34 unique machines listed in the database.  

Questions:
- should I rename the blank ones as 'unknown'?
- why are there machines with less than 20 or 30 records associated with them?  Are these mistakes/duplicates? 
- does machine type give any indication of the time period when the scan was taken, or the location?

In [None]:
# Distinct machine types and how many records each one has.
equipment = sum_table_clean['equipment']
print('Number of unique machine types in database')
print(equipment.nunique())
equipment.value_counts()

## Create Features

### BMI

In [None]:
# BMI feature, following the CDC metric formula:
# https://www.cdc.gov/nccdphp/dnpao/growthcharts/training/bmiage/page5_1.html
def calc_bmi(weight_kg, height_cm):
    """Return the body-mass index for a weight in kg and a height in cm.

    Computed as [weight (kg) / height (cm) / height (cm)] x 10,000.
    """
    # Divide by the height twice (rather than by its square) to keep the
    # exact same floating-point evaluation order as the formula above.
    weight_per_cm = weight_kg / height_cm
    return (weight_per_cm / height_cm) * 10000

In [None]:
# Create the BMI column. calc_bmi only uses arithmetic operators, so it can be
# applied to the two columns at once; this avoids building one Series per row
# with DataFrame.apply(axis=1) and gives the same values.
sum_table_clean['BMI'] = calc_bmi(
    sum_table_clean['patientweight'], sum_table_clean['patientheight']
)

In [None]:
# Keep BMI values strictly between 15 and 45; everything else (including the
# values produced from sentinel weights/heights) becomes the sentinel 1.
bmi = sum_table_clean['BMI']
sum_table_clean['BMI'] = bmi.where((bmi < 45) & (bmi > 15), 1)

In [None]:
# Box plot of the cleaned BMI values.
bmi_values = sum_table_clean['BMI']
plt.boxplot(bmi_values)
plt.show()

In [None]:
# Summary statistics and distribution of BMI; unknown BMIs appear as 1.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases - confirm
# the installed version before upgrading.
bmi = sum_table_clean['BMI']
print(bmi.describe())
sns.distplot(bmi)
plt.title('Distribution of BMI: unknown values = 1')
plt.show()

## Finding Codes

In [None]:
print()

In [None]:
# Finding codes relevant to the pathologies studied in this notebook.
pathologies_of_interest = {
    'LV-0068', 'LV-0144',             # HC_T
    'LV-0069', 'LV-0070',             # HC_C
    'LV-0062', 'LV-0065',             # HC_F
    'LV-0080',                        # RLVEF_T
    'LV-0061', 'LV-0077', 'LV-0078',  # RLVEF_F
    'LA-0016',                        # DLA_T
    'LA-0013',                        # DLA_F
    'SU-0032',                        # Norm
}

# Finding code -> pathology label.
# NOTE(review): 'LV-0061' is listed in both HC_F_codes and RLVEF_F_codes below,
# but a dict can hold only one label per code (RLVEF_F here) - confirm intent.
pathdict = {
    "LV-0068": "HC_T",
    "LV-0144": "HC_T",
    "LV-0069": "HC_C",
    "LV-0070": "HC_C",
    "LV-0062": "HC_F",
    "LV-0065": "HC_F",
    "LV-0080": "RLVEF_T",
    "LV-0061": "RLVEF_F",
    "LV-0077": "RLVEF_F",
    "LV-0078": "RLVEF_F",
    "LA-0016": "DLA_T",
    "LA-0013": "DLA_F",
    "SU-0032": "Norm",
}

# Code sets per pathology label.
HC_T_codes = {'LV-0068', 'LV-0144'}
HC_F_codes = {'LV-0061', 'LV-0062', 'LV-0065'}
HC_C_codes = {'LV-0069', 'LV-0070'}

RLVEF_T_codes = {'LV-0080'}
RLVEF_F_codes = {'LV-0061', 'LV-0077', 'LV-0078'}

DLA_T_codes = {'LA-0016'}
DLA_F_codes = {'LA-0013'}

Norm_codes = {'SU-0032'}

In [None]:
def create_pathology_column(study_table, path_codes, path_name):
    """Add a 0/1 flag column marking studies that carry any of the given codes.

    Args:
        study_table: DataFrame with a ``findingcode`` column whose cells are
            iterables of finding codes.
        path_codes: collection of finding codes that indicate the pathology.
        path_name: name of the flag column to create (or overwrite).

    Returns:
        The same ``study_table`` object; the column is added in place.
    """
    # A study listing several codes of the same pathology must still be
    # flagged exactly once, so test for "any match" instead of counting
    # matches (a count above 1 would be missed by `== 1` filters and would
    # inflate column sums).
    study_table[path_name] = study_table.findingcode.apply(
        lambda codes: int(any(code in path_codes for code in codes))
    )
    return study_table

In [None]:
def intersection(lst1, lst2):
    """Return the items of ``lst1`` that also occur in ``lst2``.

    Order and duplicates of ``lst1`` are preserved.
    Adapted from https://www.geeksforgeeks.org/python-intersection-of-multiple-lists/
    """
    common = []
    for candidate in lst1:
        if candidate in lst2:
            common.append(candidate)
    return common

In [None]:
# Turn the comma-separated findingcode strings into lists of codes.
def _split_finding_codes(raw_codes):
    """Convert one findingcode cell into a list of clean finding codes."""
    # Already a list: the cell was run before, so leave the value untouched.
    if isinstance(raw_codes, list):
        return raw_codes
    # Missing values (None/NaN) carry no codes.
    if not isinstance(raw_codes, str):
        return []
    # Strip padding around each code and drop empty entries, so that blank
    # cells and "A, B" style spacing do not create bogus codes.
    return [code.strip() for code in raw_codes.split(",") if code.strip()]


sum_table_clean.findingcode = sum_table_clean.findingcode.apply(_split_finding_codes)

In [None]:
sum_table_clean.head(1)

In [None]:
# Long-format table with one row per (study, finding code) pair.
# Built directly from the code lists: this avoids creating a pd.Series for
# every row and the stack/join round trip, while giving the same rows in the
# same order with a fresh 0..n-1 index.
db_finding_codes = pd.DataFrame(
    [
        (study_id, code)
        for study_id, codes in zip(sum_table_clean['studyidk'], sum_table_clean['findingcode'])
        for code in codes
    ],
    columns=['studyidk', 'finding_code_del'],
)
db_finding_codes.head()

In [None]:
# Number of distinct finding codes that occur anywhere in the database.
finding_code_column = db_finding_codes['finding_code_del']
unique_finding_codes = finding_code_column.nunique()
print('number of unique finding codes in the database')
print(unique_finding_codes)

In [None]:
# Histogram of how many finding-code rows each study has.
codes_per_study = db_finding_codes['studyidk'].value_counts()
axes = codes_per_study.hist(bins=50)
plt.suptitle("Numbers of finding codes per study")
plt.show()

In [None]:
# Add one column per pathology label, in a fixed order.
pathology_columns = [
    ("HC_T", HC_T_codes),
    ("HC_F", HC_F_codes),
    ("HC_C", HC_C_codes),
    ("RLVEF_T", RLVEF_T_codes),
    ("RLVEF_F", RLVEF_F_codes),
    ("DLA_T", DLA_T_codes),
    ("DLA_F", DLA_F_codes),
    ("Norm", Norm_codes),
]
for column_name, column_codes in pathology_columns:
    sum_table_clean = create_pathology_column(sum_table_clean, column_codes, column_name)

In [None]:
sum_table_clean.head()

## Histograms: Pathologies by Demographics

In [None]:
sum_table_clean['HC_T'].sum()

In [None]:
# Summary table: per pathology, the column totals for the diagnosing,
# compatible and normal finding-code groups. Cells with no matching code
# group hold a single space.
columns = ["Findingcode diagnosing disease",
           "Findingcode compatible with disease", 
           "Findingcode normal for this pathology"]
rows = ["Hypertrophic Cardiomyopathy",
        "Reduced Left Ventricle Ejection Fraction",
        "Dilated Left Atria",
        "Normal study"]

flag_totals = sum_table_clean[
    ['HC_T', 'HC_C', 'HC_F', 'RLVEF_T', 'RLVEF_F', 'DLA_T', 'DLA_F', 'Norm']
].sum()

data = [
    [flag_totals['HC_T'], flag_totals['HC_C'], flag_totals['HC_F']],
    [flag_totals['RLVEF_T'], " ", flag_totals['RLVEF_F']],
    [flag_totals['DLA_T'], " ", flag_totals['DLA_F']],
    [flag_totals['Norm'], " ", " "],
]

ResultsTable = pd.DataFrame(data, index=rows, columns=columns)

ResultsTable


## Pathologies present by age, weight, height and BMI

In [None]:
# Studies positive for each pathology. `> 0` rather than `== 1` so a study is
# still selected if its flag column holds a match count greater than one.
HC_T_Table = sum_table_clean[sum_table_clean['HC_T'] > 0]
RLVEF_T_Table = sum_table_clean[sum_table_clean['RLVEF_T'] > 0]
DLA_T_Table = sum_table_clean[sum_table_clean['DLA_T'] > 0]

# One row of panels per pathology: (title prefix, studies, extra hist kwargs).
# The first row passes no colour so it keeps the default one.
pathology_panels = [
    ('Hypertrophic Cardiomyopathy', HC_T_Table, {}),
    ('Reduced Left Ventricle Ejection Fraction', RLVEF_T_Table, {'color': 'red'}),
    ('Dilated Left Atria', DLA_T_Table, {'color': 'green'}),
]
# One column of panels per measure: (column, name used in title, x label).
measure_panels = [
    ('age', 'Age', 'age (years)'),
    ('patientweight', 'Weight', 'Weight (kg)'),
    ('patientheight', 'Height', 'Height (cm)'),
    ('BMI', 'BMI', 'BMI'),
]

# Age, weight, height and BMI histograms for every pathology (3 x 4 grid).
fig, axes = plt.subplots(len(pathology_panels), len(measure_panels), figsize=(18, 12))
for row_axes, (pathology, table, hist_kwargs) in zip(axes, pathology_panels):
    for ax, (column, measure_name, xlabel) in zip(row_axes, measure_panels):
        # grid=False switches the grid off at draw time. This replaces the
        # plt.grid(b=None) toggle, whose `b` keyword newer matplotlib
        # releases renamed to `visible`.
        table[column].hist(ax=ax, grid=False, **hist_kwargs)
        ax.set_title('{} (by {})'.format(pathology, measure_name))
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Number of cases')

plt.tight_layout()
plt.show()

# Import information on instances and views

Import csv created by Yoni on breakdown of instances and views

In [None]:
# Load the per-instance view breakdown from the intermediate data folder.
instances_views_csv = '../data/02_intermediate/frames_with_views.csv'
tb_instances_views = pd.read_csv(instances_views_csv)
tb_instances_views.head()

In [None]:
print(tb_instances_views.shape)
print(tb_instances_views.dtypes)

In [None]:
# Cast the join key to float64 ahead of the merge with the instance table.
# NOTE(review): presumably the CSV's studyidk is float64 - confirm. Float ids
# lose integer precision above 2**53.
sum_table_clean['studyidk'] = sum_table_clean['studyidk'].astype('float64')
print(sum_table_clean.shape)
print(sum_table_clean.dtypes)

## Join tables

In [None]:
# Attach the study-level attributes to every instance. Left join: every
# instance row is kept even when its study has no summary record.
tb_instances_views_studies = tb_instances_views.merge(
    sum_table_clean, how='left', on='studyidk'
)
print(tb_instances_views_studies.shape)

In [None]:
tb_instances_views_studies.head()

In [None]:
tb_instances_views_studies.columns

# EDA on views

In [None]:
def _instances_with_view(flag_column):
    """Return the instance rows whose view flag in `flag_column` equals True."""
    is_view = tb_instances_views_studies[flag_column] == True
    return tb_instances_views_studies[is_view]


tb_is_plax = _instances_with_view('is_plax')
tb_is_ac4 = _instances_with_view('is_a4c')
tb_is_ac2 = _instances_with_view('is_a2c')

# For each view: number of instances, then number of distinct studies.
for view_label, view_table in [('parasternal axis', tb_is_plax),
                               ('apical four chambers', tb_is_ac4),
                               ('apical two chambers', tb_is_ac2)]:
    print('Number of instances with {} view: {}'.format(view_label, view_table.shape[0]))
    print('Number of unique studies with {} view: {}'.format(view_label, view_table.studyidk.nunique()))

In [None]:
# Per-view summary: number of distinct studies and number of instances.
view_tables = [('PLAX', tb_is_plax), ('AC4', tb_is_ac4), ('AC2', tb_is_ac2)]

data = {
    'view_name': [view_name for view_name, _ in view_tables],
    'Numbers_of_studies': [view_table.studyidk.nunique() for _, view_table in view_tables],
    'Numbers_of_instances': [view_table.shape[0] for _, view_table in view_tables],
}

study_view_numbers = pd.DataFrame(data, columns=['view_name', 'Numbers_of_studies', 'Numbers_of_instances'])
study_view_numbers.head()

In [None]:
# Side-by-side bar charts: studies per view (left) and instances per view
# (right). Both panels use the same fixed y-range of 5000-40000.
plt.figure(figsize=(18, 5))

view_count_panels = [
    ("Numbers_of_studies", 'Numbers of unique studies with examples of each view'),
    ("Numbers_of_instances", 'Numbers of instances with examples of each view'),
]
for position, (count_column, panel_title) in enumerate(view_count_panels, start=1):
    plt.subplot(1, 2, position)
    bar_axes = sns.barplot(x="view_name", y=count_column, data=study_view_numbers)
    bar_axes.set_title(panel_title)
    plt.ylim(5000, 40000)
plt.show()

In [None]:
# Number of instances per gender for each view.
# countplot tallies rows per gender. barplot(y="studyidk") would instead draw
# the mean study id per gender, which is not a count.
gender_panels = [
    (tb_is_plax, 'Parasternal Axis View: Gender by instance'),
    (tb_is_ac4, 'Apical Four Chambers View: Gender by instance'),
    (tb_is_ac2, 'Apical Two Chambers View: Gender by instance'),
]

# sharey keeps the three panels on one common scale so that the views can be
# compared, without hard-coding axis limits.
fig, axes = plt.subplots(1, len(gender_panels), figsize=(18, 5), sharey=True)
for ax, (view_table, panel_title) in zip(axes, gender_panels):
    sns.countplot(x="gender", data=view_table, ax=ax)
    ax.set_title(panel_title)
plt.show()

In [None]:
# Age histograms per view, all drawn on the same fixed y-range (0-12000).
plt.figure(figsize=(18, 5))

age_panels = [
    (tb_is_plax, 'Parasternal Axis View: Age by instance'),
    (tb_is_ac4, 'Apical Four Chambers View: Age by instance'),
    (tb_is_ac2, 'Apical Two Chambers View: Age by instance'),
]
for position, (view_table, panel_title) in enumerate(age_panels, start=1):
    plt.subplot(1, 3, position)
    plt.hist(view_table.age)
    plt.title(panel_title)
    plt.ylim(0, 12000)
plt.show()

In [None]:
# BMI histograms per view, all drawn on the same fixed y-range (0-15200).
plt.figure(figsize=(18, 5))

bmi_panels = [
    (tb_is_plax, 'Parasternal Axis View: BMI by instance'),
    (tb_is_ac4, 'Apical Four Chambers View: BMI by instance'),
    (tb_is_ac2, 'Apical Two Chambers View: BMI by instance'),
]
for position, (view_table, panel_title) in enumerate(bmi_panels, start=1):
    plt.subplot(1, 3, position)
    plt.hist(view_table.BMI)
    plt.title(panel_title)
    plt.ylim(0, 15200)
plt.show()