---
#Notebook di statistica medica (1)
---



# ✅ Librerie python

In [None]:
import os                           # Library to manage files
import numpy as np                  # Library to work with numbers
import pandas as pd                 # Library to work with data
import plotly.express as px         # Library for plotting
import plotly.graph_objects as go   # Library for plotting
import matplotlib.pyplot as plt     # Library for plotting
from scipy import stats             # Library for statistical analysis

def load_csv(nomefile):
  """Read a CSV file into a pandas DataFrame.

  Args:
    nomefile: Path of the CSV file (anything ``pd.read_csv`` accepts).

  Returns:
    The DataFrame built by ``pd.read_csv`` with its default options.
  """
  return pd.read_csv(nomefile)

def load_excel(nomefile, sheet_name='data'):
  """Read one sheet of an Excel workbook into a pandas DataFrame.

  The column labels are printed so the user can see which series are
  available in the loaded table.

  Args:
    nomefile: Path of the Excel file.
    sheet_name: Name (or index) of the single sheet to read. Defaults to
      'data', the sheet name this function previously had hard-coded.

  Returns:
    The DataFrame holding the content of the requested sheet.
  """
  df = pd.read_excel(nomefile, sheet_name=sheet_name)
  print(df.columns)  # the column labels of the DataFrame
  return df

def compute_BMI(dati):
  """Add a 'BMI' column (weight / height^2) to the table, in place.

  The input DataFrame itself is modified and returned. After the BMI
  column is added, all columns are renamed positionally, so the table is
  expected to end up with exactly five columns.
  NOTE(review): presumably height is in metres and weight in kilograms;
  verify against the data file.

  Args:
    dati: DataFrame with at least the 'weight' and 'height' columns.

  Returns:
    The same DataFrame object, with columns
    ['sex', 'age', 'height', 'weight', 'BMI'].
  """
  height = dati['height']
  dati['BMI'] = dati['weight'].div(height * height)

  # Positional rename: the i-th existing column receives the i-th label.
  labels = ['sex', 'age', 'height', 'weight', 'BMI']
  dati.columns = labels
  return dati

def t_test(df, sample1, sample2):
  """Run SciPy's related-samples t-test on two columns of a table.

  Args:
    df: DataFrame holding both samples.
    sample1: Name of the first column.
    sample2: Name of the second column.

  Returns:
    The result object of ``stats.ttest_rel`` (statistic and p-value).
  """
  first = df[sample1]
  second = df[sample2]
  return stats.ttest_rel(first, second)

def linear_regression(df, serie_x, serie_y, plot=True):
  """Fit a least-squares line y = b0 + b1 * x between two columns.

  Args:
    df: DataFrame holding both series.
    serie_x: Name of the column used as independent variable.
    serie_y: Name of the column used as dependent variable.
    plot: When true, also show a Plotly scatter plot with an OLS trendline.

  Returns:
    The tuple ``(b0, b1)``: intercept first, then slope.
  """
  xs, ys = df[serie_x], df[serie_y]
  fit = stats.linregress(xs, ys)

  if plot:
    px.scatter(x=xs, y=ys, trendline="ols").show()

  return fit.intercept, fit.slope

def serie_plot(df, campo):
  """Show a Plotly line chart of one column of the table.

  Args:
    df: DataFrame holding the series.
    campo: Name of the column to draw on the y axis (also used in the title).
  """
  title = 'Serie: ' + campo
  px.line(df, y=campo, title=title).show()

def box_plot(df, campo):
  """Show a Plotly box plot of one column, with every data point drawn.

  Args:
    df: DataFrame holding the series.
    campo: Name of the column to summarise.
  """
  px.box(df, y=campo, points="all").show()

def histogram_plot(df, campo, nbins=10):
  """Show a Plotly histogram of one column of the table.

  Args:
    df: DataFrame holding the series.
    campo: Name of the column to bin along the x axis.
    nbins: Number of bins passed on to ``px.histogram``.
  """
  px.histogram(df, x=campo, nbins=nbins).show()

def scatter_plot(x,y):
  """Show a Plotly scatter plot of ``y`` against ``x``.

  Args:
    x: Values for the horizontal axis.
    y: Values for the vertical axis.
  """
  px.scatter(x=x, y=y).show()

def QQ_plot(df, campo):
  """Show a probability (Q-Q) plot of one column against the normal distribution.

  Args:
    df: DataFrame holding the series.
    campo: Name of the column to compare with the normal quantiles.
  """
  axis = plt.figure(figsize=(10, 10)).add_subplot()
  # probplot draws the sample quantiles and the fitted line onto the axes.
  stats.probplot(df[campo], dist="norm", plot=axis)
  axis.set_title("Q-Q plot")
  plt.show()

def gaussian_plot(mu=0, sigma=1):
  """Plot the probability density of a univariate Gaussian with Plotly.

  Args:
    mu: Mean of the distribution; the x axis is centred on this value.
    sigma: Standard deviation of the distribution (must be non-zero).
  """
  # Centre the x range on mu. The previous range [-(mu+10), mu+10] was only
  # centred for mu = 0 and collapsed or reversed for mu <= -10.
  # The half-width stays at 10 (the former default view) so that curves with
  # different sigma remain visually comparable, and grows only when sigma is
  # so large that +/- 4 sigma would not fit.
  half_width = max(10.0, 4.0 * abs(sigma))
  x = np.linspace(mu - half_width, mu + half_width, 1000)
  G = (2*np.pi*sigma**2)**(-.5) * np.exp(-.5 * ((x - mu)/sigma)**2)

  fig = go.Figure()
  fig.add_trace(go.Scatter(x=x, y=G, mode='lines', name='Gaussian'))
  fig.show()

def central_limit(num_dice, num_trials=100000):
  """Illustrate the central limit theorem with averages of dice throws.

  For every entry ``n`` of ``num_dice`` a histogram is drawn of the mean
  of ``n`` dice, repeated ``num_trials`` times. Faces are drawn uniformly
  from 0..5 (``np.random.randint(0, 6)``).

  Args:
    num_dice: Sequence with the number of dice averaged in each subplot.
    num_trials: How many times each average is sampled (default 100000).
  """
  num_dice = list(num_dice)
  if not num_dice:
    # plt.subplots cannot build a grid with zero columns: nothing to draw.
    return

  # squeeze=False always returns a 2-D array of Axes, so a single-element
  # num_dice no longer fails on axes[i].
  fig, axes = plt.subplots(ncols=len(num_dice), nrows=1, squeeze=False,
                           constrained_layout=True, figsize=(len(num_dice)*5, 5))
  for ax, n in zip(axes[0], num_dice):
    # Average the faces of n dice, num_trials times; plot the distribution of the mean.
    trials = np.mean(np.random.randint(0, 6, (n, num_trials)), axis=0)
    ax.hist(trials, bins=100)
    ax.set_title('n = %d' % n)
      
def gaussian_data_plot(dati, serie):
  """Plot a histogram of one series with its fitted Gaussian PDF and CDF.

  The histogram and the PDF are both rescaled to a peak of 1 so they can
  be overlaid; the CDF is drawn unscaled (it already lies in [0, 1]).

  Args:
    dati: DataFrame holding the data (a plain array-like is also accepted
      and used as a whole, as before).
    serie: Name of the column of ``dati`` to analyse.
  """
  # Use the requested column. Previously `serie` was ignored and the whole
  # table was squeezed, which only worked for single-column tables.
  if isinstance(dati, pd.DataFrame):
    X = dati[serie].to_numpy()
  else:
    X = np.asarray(dati).squeeze()

  # Histogram, normalised to a maximum of 1.
  H, b = np.histogram(X, bins=100)
  H = H/np.max(H)
  # np.histogram returns 101 edges for 100 bins: draw each bar at its bin centre.
  centers = (b[:-1] + b[1:]) / 2

  # Fit a normal distribution to the data.
  mu, std = stats.norm.fit(X)

  # Evaluate the fitted PDF (normalised to a maximum of 1) and the CDF.
  x = np.linspace(X.min(), X.max(), 100)
  PDF = stats.norm.pdf(x, mu, std)
  PDF = PDF/np.max(PDF)
  CDF = stats.norm.cdf(x, mu, std)

  fig = go.Figure()
  fig.add_trace(go.Bar(x=centers, y=H, name='Histogram'))
  fig.add_trace(go.Scatter(x=x, y=PDF, mode='lines', name='PDF'))
  fig.add_trace(go.Scatter(x=x, y=CDF, mode='lines', name='CDF'))
  fig.show()

def generate_random_normal(n, mu=0, sigma=1):
  """Draw ``n`` normally distributed values and return them as a table.

  A one-line summary (count, sample mean, sample std) is printed.

  Args:
    n: Number of values to draw.
    mu: Mean of the normal distribution.
    sigma: Standard deviation of the normal distribution.

  Returns:
    A DataFrame with a single column named 'values'.
  """
  samples = np.random.normal(mu, sigma, n)
  frame = pd.DataFrame(samples, columns=['values'])
  summary = (n, samples.mean(), samples.std())
  print('Num values: %d,  mean = %.3f  -- std = %.3f' % summary)
  return frame

def generate_random_uniform(n, low=0, high=1):
  """Draw ``n`` uniformly distributed values and return them as a table.

  A one-line summary (count, sample mean, sample std) is printed.

  Args:
    n: Number of values to draw.
    low: Lower bound passed to ``np.random.uniform``.
    high: Upper bound passed to ``np.random.uniform``.

  Returns:
    A DataFrame with a single column named 'values'.
  """
  samples = np.random.uniform(low=low, high=high, size=n)
  frame = pd.DataFrame(samples, columns=['values'])
  summary = (n, samples.mean(), samples.std())
  print('Num values: %d,  mean = %.3f  -- std = %.3f' % summary)
  return frame

# ✅ Dove sono i dati su Google Drive

I file dei dati e dei sorgenti Python devono essere contenuti in una directory di Google Drive, per es. la cartella `ortopedia-2022`



In [None]:
#@title Inserire directory di Google Drive
data_dir = "ortopedia-2022" #@param {type:"string"}

# Folder under which the Google Drive content is expected to be visible.
drive_dir = "/content/drive/MyDrive/"

home_dir = drive_dir + data_dir

# isdir rather than exists: a regular file with this name would pass an
# existence check and then make os.chdir fail.
if not os.path.isdir(home_dir):
  print('Directory inesistente!')
else:
  # Make the data folder the working directory so relative paths resolve.
  os.chdir(home_dir)
  print("Directory changed: " + home_dir)


# ✅ Caricamento e visualizzazione dati 

## ▶️ Visualizzazione colonne 

In [None]:
# Load the BMI dataset
dati = load_csv('data/BMI.csv')

In [None]:
# Access the whole table

dati

In [None]:
# Show the keys (column labels) that identify the series of the table
dati.keys()

In [None]:
# A single series

dati['age']

In [None]:
# Plotting: change the series name to try others, e.g. 'weight' or 'height'

serie_plot(dati,'height')

## ▶️ Calcolo BMI

Calcolo del BMI per ogni persona e inserimento nella stessa tabella dati.  
$\text{BMI} = \frac{W}{H^2}$

In [None]:
# Compute the BMI and add it to the table as a new column

dati = compute_BMI(dati)


In [None]:
# Show the computed BMI series

dati['BMI']

## 🔴 Esercizio

Plottare la serie BMI e individuare minimi e massimi

In [None]:
# plot della nuova serie BMI


## ▶️ Selezionare righe

We can specify individual rows (subjects) by making use of the `.iloc[]` attribute (or property, which is the term used by pandas) of a dataframe object.  `iloc` stands for integer location, so we must use integers to specify the row and column numbers.  We add an index value in square brackets after the property.  Below, we extract the first row.

In [None]:
# First row of the table (integer position 0)
dati.iloc[0]

We can specify certain rows by passing a list of integer values.

In [None]:
# Rows at integer positions 2, 3 and 5
dati.iloc[[2, 3, 5]]

Slicing is also allowed.  This is done by specifying a range of values.

In [None]:
dati.iloc[3:8]  # Rows at integer positions 3 to 7 (the end of the slice is excluded)

The columns can also be indexed.  Here we use the *row,column* notation.  Below, we extract the first five rows, but only for the *age* and *height* variables, which are columns 1 and 2.

In [None]:
# Rows at positions 0 to 4, restricted to the columns at positions 1 and 2
dati.iloc[0:5,[1, 2]]

The `.loc[]` property can be used in a similar fashion, but here we specify the column names (as a list or a slice).  If the index values were not integers, but strings, we could also use those names.  The row and column names are referred to as **labels**.  Below, we extract the *age* and *BMI* columns for the rows labelled 0 to 5.  Note that, unlike with `.iloc[]`, the end of the range is included, so the sixth row is returned as well.

In [None]:
# Label-based selection: rows labelled 0 through 5 (end label included), columns 'age' and 'BMI'
dati.loc[0:5, ['age', 'BMI']]

## 🔴 Esercizio

- Selezionare la sottotabella che contiene le prime 10 righe per le serie `['weight', 'BMI']`

- Mostrare poi il contenuto della sottotabella 

In [None]:
# effettuare la ricerca delle righe per le serie indicate e salvare la tabella con un nuovo nome


## ▶️ Ordinamento

Sorting can be a useful way to interact with our data.  Below, we sort the dataframe by *weight*.  The values in all the other columns move along with it, so that each row still pertains to the same patient.

In [None]:
# Sort the table by one column

dati.sort_values(by='weight')

The order can be reversed by using the `ascending=False` argument.

In [None]:
# Reverse (descending) order

dati.sort_values(by='height', ascending=False)

We can sort by more than one column at a time.  This is done by passing a list of column names.  Below, we sort by *sex* and the *BMI*.  With default values, numerical and date values will be from smaller to larger values and from earlier to later dates and categorical variables will be alphabetical.

In [None]:
# Sort by the listed columns, in the given priority

dati.sort_values(by=['sex', 'BMI'])

The columns passed in the list need not all be sorted in the same direction.  We can also pass a list with the corresponding order for each column.

In [None]:
# Sort with an explicit direction for each column

dati.sort_values(by=['sex', 'BMI'], ascending=[False, False])

The `.nlargest()` method is useful if we only want to view the highest numerical values in a column.  Below, we look at the 5 highest BMI values.

In [None]:
# Select the 5 largest BMI values and show them

dati.BMI.nlargest(5)

We can reverse the order of the syntax above a bit, if we want to see the rest of the columns too.

In [None]:
# Select the 5 largest BMI values and show their full rows
dati.nlargest(5, 'BMI')

The `.sort_values()` method does not make permanent changes to the dataframe, unless the argument `inplace` (which is set to `False` by default) is set to `True`.

In [None]:
# Permanent (in-place) sort of the table

dati.sort_values(by=['sex', 'BMI'], ascending=[False, False], inplace=True)

# Show the table
dati

## 🔴 Esercizio

- Caricare di nuovo i dati e calcolare `BMI`
- Estrarre le sottotabelle suddivise per maschi e femmine
- ordinarle permanentemente per `BMI` 

In [None]:
#carica i dati per BMI

# calcola BMI


# ordina


# mostra


In [None]:
# estrazione sottotabelle F e M



## ▶️ Misure di tendenza centrale

In [None]:
# Compute mean, standard deviation and median of the BMI series

BMI_mean = dati['BMI'].mean()
BMI_std = dati['BMI'].std()
BMI_median = dati['BMI'].median()

print('BMI medio  :', BMI_mean)
print('BMI std    :', BMI_std)
print('BMI mediana:', BMI_median)

In [None]:
# Show the general summary statistics of the table

dati.describe()

In [None]:
# Histogram of the 'BMI' series

histogram_plot(dati, 'BMI', nbins=10)

## 🔴 Esercizio

Ripetere (alcune) delle operazioni precedenti con il nuovo dataset excel `patients.xlsx`

- caricare i dati
- mostrare descrizione e semplici statistiche
- effettuare plot (serie, box, istogrammi)
- estrarne sottoparti e ordinare

In [None]:
# caricare il file 'patients.xlsx' dalla cartella 'data'



# ✅ Distribuzione Gaussiana


**Univariate Gaussian** 

The PDF of a univariate Gaussian
$$p(x; \mu, \sigma^2) = \frac{1}{\sqrt{2\pi \sigma^2}} \exp \left ({-\frac{1}{2}\left (\frac{x - \mu}{\sigma} \right )^2} \right )$$

In [None]:
# Gaussian distribution with the given mu and sigma

gaussian_plot(mu=0, sigma=1)

## 🔴 Esercizio

In [None]:
# plottare una Gaussiana 'alta e magra' e una 'bassa e grassa' a piacere


## Central limit theorem
The Gaussian distribution has properties that make it special. An important property is that the average of iid random variables --under some assumptions-- has a Gaussian distribution. Various forms of this statement are known as **central limit theorems**.
Let's throw different numbers of dice and look at the distribution of their average.


In [None]:

# Number of dice to average in each experiment
num_dice = [1, 3, 10, 40]    

# Each die is i.i.d. uniform over the faces 0,1,2,3,4,5, so its expected value is 2.5
central_limit(num_dice)

In [None]:
# Generate Gaussian data
# Params: number of values = 10000, mu = 0 and sigma = 1
dati = generate_random_normal(10000, 0, 1)

# Print the summary description of the generated series
dati['values'].describe()

In [None]:
# Apply the function that shows the histogram together with the fitted
# Gaussian PDF and CDF. This must be gaussian_data_plot (table, series name);
# gaussian_plot takes the numbers (mu, sigma) and cannot handle a DataFrame.

gaussian_data_plot(dati, 'values')

# ✅ Boxplot


## What is a Boxplot?

For some distributions/datasets, you will find that you need more information than the measures of central tendency (median, mean, and mode). You need to have information on the variability or dispersion of the data. A boxplot is a graph that gives you a good indication of how the values in the data are spread out. Although boxplots may seem primitive in comparison to a histogram or density plot, they have the advantage of taking up less space, which is useful when comparing distributions between many groups or datasets.

<img src="https://github.com/giulianogrossi/imgs/blob/main/medical_stats/box_plot.png?raw=true" width="800pt" />

Boxplots are a standardized way of displaying the distribution of data based on a five number summary (“minimum”, first quartile (Q1), median, third quartile (Q3), and “maximum”).

- **median** (Q2/50th Percentile): the middle value of the dataset.

- **first quartile** (Q1/25th Percentile): the middle number between the smallest number (not the “minimum”) and the median of the dataset.

- **third quartile** (Q3/75th Percentile): the middle value between the median and the highest value (not the “maximum”) of the dataset.

- **interquartile range** (IQR): 25th to the 75th percentile.

- **whiskers** (shown in blue)

- **outliers** (shown as green circles)

- **“maximum”**: Q3 + 1.5*IQR

- **“minimum”**: Q1 -1.5*IQR

## Boxplot on a Normal Distribution

<img src="https://github.com/giulianogrossi/imgs/blob/main/medical_stats/box_plot_Gauss.png?raw=true" width="800pt" />

The image above is a comparison of a boxplot of a nearly normal distribution and the probability density function (pdf) for a normal distribution. The reason why I am showing you this image is that looking at a statistical distribution is more commonplace than looking at a box plot. In other words, it might help you understand a boxplot.

In [None]:
# Generate Gaussian data
# Params: number of values = 1000, mu = 0 and sigma = 1
dati = generate_random_normal(1000, 0, 1)


# Calculate the first quartile
q_1 = dati['values'].quantile(.25)

# Calculate the second quartile (the median)
q_2 = dati['values'].quantile(.5)

# Calculate the third quartile 
q_3 = dati['values'].quantile(0.75)

print('The 1st quartile: ', q_1)
print('The 2nd quartile: ', q_2)
print('The 3rd quartile: ', q_3)

# ✅ The QQ plot

<p>A parametric test is used when the sample data is taken from a normal distribution and a non-parametric test if it is not.  Wow!  We only have our sample data.  How can we know this?  Well, there's the probability plot from the <em>scipy.stats</em> library.  It is more commonly known as the QQ-plot.  It plots the sorted data points against the corresponding theoretical quantiles of the normal distribution on a Cartesian plane.  It then gives you a visual representation of how closely the points follow a straight line.  If they do not, the data is unlikely to come from a normal distribution and you have to use a non-parametric test.</p>

<p>First, I'll generate 1000 random values from the normal distribution, with a mean of 100 and a standard deviation of 10.</p>

In [None]:
# Generate normally distributed data (1000 values, mu = 100, sigma = 10)

dati = generate_random_normal(1000, 100, 10)
print(dati['values'].describe())

# Histogram ~ probability distribution
histogram_plot(dati, 'values', nbins=100)

In [None]:
# Generate uniformly distributed data

dati = generate_random_uniform(10000)
# NOTE(review): bare expression with no assignment; presumably nothing is
# displayed because it is not the last statement of the cell.
dati['values']

# Histogram
histogram_plot(dati, 'values', nbins=50)


**Now let's do the QQ plot**



In [None]:
# Generate normally distributed data
# Params: number of values, mu and sigma

dati = generate_random_normal(1000, 10, 2)

QQ_plot(dati, 'values')

In [None]:
# Generate uniformly distributed data

dati = generate_random_uniform(1000)

QQ_plot(dati, 'values')

## 🔴 Esercizio

Q-Q plot per il dataset (excel) `patients.xlsx`

- Stabilire quali delle seguenti serie sono più gaussiane
```
['HR', 'sBP', 'CholesterolBefore', 'TAG', 'Survey', 'CholesterolAfter']
```

In [None]:
# caricare il file 'patients.xlsx' dalla cartella 'data'


# ✅ Regressione lineare


**Definition & Working principle**

Linear regression is a supervised learning algorithm used when the target (dependent) variable is a continuous real number. It establishes a relationship between the dependent variable 
$y$ and one or more independent variables $x$ using a best-fit line. 

 The goal is to minimize the sum of the squared differences between the observed values of the dependent variable in the given data set and those predicted by the linear regression function.

**Hypothesis representation** 

We will use $x_i$ to denote the independent variable and  $y_i$ to denote the dependent variable. A pair $(x_i,y_i)$ is called a training example, $i=1,2,..,n$. 

The goal of supervised learning is to learn a **hypothesis function**  $\hat{y}(x)$ for a given training set that can be used to estimate $y$ based on $x$. The hypothesis function is represented as
$$ \hat{y}(x)=b_0+b_1x$$

where $b_0,b_1$ are the parameters of the hypothesis. This is the equation for simple (univariate) linear regression.

**Least Squares**

To determine the equation of the line, we need to minimise the quantity

$$E=\sum_{i=1}^{n}(y_i - \hat{y_i})^2 = \sum_{i=1}^{n}(y_i - b_0 - b_1 x_i)^2$$

where $b_0$ (intercept) and $b_1$ (slope) are the solution of the linear system

$$\begin{cases} \left(\sum_{i=1}^{n} x_i^2\right) b_1 + \left(\sum_{i=1}^{n} x_i\right) b_0 = \sum_{i=1}^{n} x_i y_i \\ \left(\sum_{i=1}^{n} x_i\right) b_1 + n b_0 = \sum_{i=1}^{n}y_i \end{cases}$$


<img src="https://github.com/giulianogrossi/imgs/blob/main/medical_stats/plinear.png?raw=true" width="800pt" />


<img src="https://github.com/giulianogrossi/imgs/blob/main/medical_stats/nlinear.png?raw=true" width="800pt" />



Regressione lineare tra:
- W and BMI
- H and BMI

In [None]:
# Load the BMI dataset
dati = load_csv('data/BMI.csv')  

# Compute the BMI
dati = compute_BMI(dati)

# Linear regression between 'weight' and 'BMI' (b0 = intercept, b1 = slope)
b0, b1 = linear_regression(dati, 'weight', 'BMI')
print(f"The line equation is y = {b1} x + {b0}")

## 🔴 Esercizio

Applicare la regressione al dataset (excel) `patients.xlsx`

- caricare i dati
- mostrare descrizione e semplici statistiche
- calcolare retta di regressione con 'HR', 'sBP'

In [None]:
# caricare il file 'patients.xlsx' dalla cartella 'data'
