# EDA template

In [None]:
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
%matplotlib inline 
#ensure that plots are displayed inside the notebook
import pyodbc
import seaborn as sns
import json  # standard-library JSON module; module names are case-sensitive, so `import JSON` fails
from lxml import objectify

## <u>Reading the data into a dataframe</u>

In [None]:
# csv
# semicolon-separated file: row 0 holds the column names, column 0 becomes the
# index, and parse_dates=True asks pandas to parse that index as dates
df = pd.read_csv(
    "file.csv",
    sep=';',
    header=0,
    index_col=0,
    parse_dates=True,
)

In [None]:
# HTML
# pd.read_html returns a list with one DataFrame per table found on the page,
# so the wanted table is picked explicitly (here: the first one) instead of
# binding the whole list to df
url = "https://www. ..."
tables = pd.read_html(url, index_col=0)
df = tables[0]

In [None]:
# JSON
# load the contents of "file.json" into a dataframe
df = pd.read_json("file.json")

In [None]:
# local database
# connection settings (placeholder values: replace with the real driver, host, schema and credentials)
# NOTE(review): the password is hard-coded here; consider reading it from the environment instead - confirm
DRIVER_DB = 'MySQL ODBC 8.0 ANSI Driver'
SERVER_DB = '127.0.0.1'
NAME_DB = 'db'
USERNAME_DB = 'usr'
PASSWORD_DB = 'pwd'

# open the ODBC connection with the settings above and load the query result into a dataframe
# NOTE(review): cnxn is never closed in this cell; call cnxn.close() once no further queries are needed
cnxn = pyodbc.connect(DRIVER=DRIVER_DB, SERVER=SERVER_DB, DATABASE=NAME_DB, UID=USERNAME_DB, PWD=PASSWORD_DB)
df = pd.read_sql_query("SELECT * FROM TABLE_patients;", cnxn)

## <u>EDA</u>


In [None]:
# list the column names of the dataframe
print("The columns are:", df.columns.tolist())

In [None]:
# rename columns: the dict maps each old column name to its new name ('oldX' -> 'newX', 'oldY' -> 'newY');
# the result is assigned back to df
df = df.rename(columns={'oldX':'newX', 'oldY':'newY'})

In [None]:
# print the dataframe's first 5 rows
print("The first 5 rows are:\n")
# temporarily lift the row/column display limits (only inside this with-block)
# so the printed rows are not truncated
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df.head(5))

In [None]:
# print the distinct values in variable X (each value listed once)
print("The distinct values of variable X are:", df['X'].unique())

In [None]:
# count the records (number of rows in the dataframe)
print("The number of records is:", df.shape[0])

In [None]:
# count the missing entries per column and show only the columns that have any
na_counts = df.isna().sum()  # number of missing entries in each column
a = na_counts > 0            # boolean mask: True for columns with at least one missing entry
print("The number of missing values per column is:")
print(na_counts[a])

### Dealing with missing entries:
1) <u>Remove the rows with NA entries:</u>
&nbsp;&nbsp;  df = df.dropna()

2) <u>Remove the columns with NA entries:</u>
&nbsp;&nbsp;  df = df.drop(['X'], axis = 1)

3) <u>Impute based on mode (if categorical) / mean (if numerical)</u>
&nbsp;&nbsp;  df['X'] = df['X'].fillna(df['X'].mode()[0]) OR df['X'] = df['X'].fillna(df['X'].mean())

4) <u>Impute based on another highly correlated variable</u>

5) <u>Impute based on a prediction model</u>



In [None]:
# check the type of X
# .iloc[0] takes the first row by position; df['X'][0] is a label lookup and
# raises KeyError when the index has no label 0 (e.g. a date or string index)
print("The type of X is:", type(df['X'].iloc[0]))

In [None]:
# convert X to datetime (the column is overwritten with the converted values)
df['X'] = pd.to_datetime(df['X'])

In [None]:
# convert X to float (the column is overwritten with the converted values)
df['X'] = df['X'].astype(float)

In [None]:
# summarize the variable X of the df with its descriptive statistics
print("The descriptive statitics are:")
print(df['X'].describe())

###  Plots

In [None]:
# plot the scatter plot between variable X and Y
# the columns are passed by name as strings; the bare names X and Y are
# undefined variables and raise NameError
# (the trailing semicolon suppresses the textual output of the plot call)
df.plot.scatter(x='X', y='Y', title='Scatter plot X against Y');

In [None]:
# plot the histogram of X using 10 bins
df['X'].plot.hist(bins=10, title="Histogram of X");

In [None]:
# plot the boxplot between numerical variable X and categorical Y
# (X is passed as the x-axis data, Y as the y-axis data)
sns.boxplot(x=df['X'], y=df['Y']);

In [None]:
# plot the distribution of the variable X of the df (to look for outliers/missing values)
# NOTE(review): distplot is deprecated in recent seaborn releases (histplot/displot
# are the suggested replacements) - confirm against the installed seaborn version
sns.distplot(df['X']);

### Dealing with outliers:
1) <u>Remove the rows:</u>
&nbsp;&nbsp;  df = df[df['X'].between(lower, upper)] (or set the outliers to NaN first and then use df = df.dropna())

2) <u>Impute based on other variables of the record</u>

3) <u>Impute based on mode (if categorical) / mean (if numerical)</u>
&nbsp;&nbsp;  after setting the outliers to NaN: df['X'] = df['X'].fillna(df['X'].mode()[0]) OR df['X'] = df['X'].fillna(df['X'].mean())


### Variables correlated with BP
- pearson: assumes the variables are normally distributed, linearly related and homoscedastic
- spearman: appropriate when the variables do not meet the assumptions for Pearson and the data set is large with few tied ranks
- kendall: appropriate when the variables do not meet the assumptions for Pearson and the data set is small with many tied ranks

In [None]:
# computes the correlation coefficients between variable X and Y
# each result is printed explicitly: a notebook cell only displays the value of
# its last expression, so bare calls would silently drop the first two results
print("Pearson:", scipy.stats.pearsonr(df['X'], df['Y']))
print("Spearman:", scipy.stats.spearmanr(df['X'], df['Y']))
print("Kendall:", scipy.stats.kendalltau(df['X'], df['Y']))

In [None]:
# plots a heatmap with the correlations among all numeric variables within the df
# only numeric columns are kept: dates and strings cannot be rank-correlated and
# make df.corr fail in recent pandas versions instead of being dropped silently
corrmat = df.select_dtypes(include='number').corr(method="spearman")
# fixed colour scale from -1 to 1 so colours are comparable between plots
sns.heatmap(corrmat, square=True, annot=True, vmin = -1, vmax = 1);

In [None]:
# plots a heatmap with the correlations among the top 10 variables correlated with X
# only numeric columns are kept: dates and strings cannot be rank-correlated and
# make df.corr fail in recent pandas versions instead of being dropped silently
corrmat = df.select_dtypes(include='number').corr(method="spearman")
# rank by absolute correlation so strong negative correlations are kept too
# (X itself is included, since its correlation with itself is 1)
cols = corrmat['X'].abs().nlargest(10).index
sns.heatmap(corrmat.loc[cols, cols], annot=True);

### Test for significant differences between samples

In [None]:
# checks if a sample is normally distributed
def is_normal(sample, significance_level):
    """Return True when the Shapiro-Wilk test does not reject normality.

    The Shapiro-Wilk test is a goodness-of-fit test for the normal
    distribution. The sample is considered normal when the test's p-value is
    strictly greater than `significance_level`; otherwise False is returned.
    """
    _, shapiro_p = scipy.stats.shapiro(sample)  # second element of the result is the p-value
    return bool(shapiro_p > significance_level)

In [None]:
# tests if there are significant differences between two samples -> prints the test result and returns its p-value -> if p-value > significance level -> No significant differences
def compare(sample1, sample2, significance_level):
    """Test two samples for a significant difference.

    Both samples are first checked for normality with `is_normal` at the given
    significance level. If both are normal, an independent two-sample t-test is
    used; otherwise the Wilcoxon rank-sum test is used.

    Parameters:
        sample1, sample2: the two samples to compare.
        significance_level: threshold passed to the normality check.

    Returns:
        The p-value of the selected test. A p-value greater than the
        significance level means no significant difference was detected.
        The full test result (statistic and p-value) is also printed.
    """
    if is_normal(sample1, significance_level) and is_normal(sample2, significance_level):
        # t-test is used because both samples are normally distributed
        result = scipy.stats.ttest_ind(sample1, sample2)
    else:
        # Wilcoxon rank-sum test is used because at least one of the samples is not normally distributed
        result = scipy.stats.ranksums(sample1, sample2)
    print(result)
    return result[1]  # second element of the result is the p-value