## <font color="darkblue">Practice Problem Set 4 - KEY</font>
    
This set of practice problems is to help review performing basic statistics using `scipy` and `statsmodels`. 

In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import scipy 
from scipy import stats
import statsmodels.stats.multitest as smm
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols


In [None]:
# Notebook-wide pandas display settings: 2 decimal places for floats,
# and at most 10 columns shown when a wide DataFrame is rendered
pd.options.display.precision = 2
pd.options.display.max_columns = 10


#### <font color="darkblue">Start by importing the merged expression-metadata spreadsheet, so that you've got something to work with.</font>

In [None]:
# Load the melanoma dataset (merged expression + metadata spreadsheet);
# the first spreadsheet column becomes the row index
melanoma_log2 = pd.read_excel(
    io='melanoma_zerosRemoved_log2transformed_2023.xlsx',
    index_col=0,
)

As we did in class, extract the gene expression data specific to the different stages and the different cell lines.

In [None]:
# Each subset keeps the rows matching one metadata label and only the
# columns from 'A1BG' onward (the gene expression data).

# Expression data for the normal (primary melanocyte) and the metastatic samples
normalExp = melanoma_log2.loc[melanoma_log2['Stage'].eq('primary melanocytes'), 'A1BG':]
metastaticExp = melanoma_log2.loc[melanoma_log2['Stage'].eq('metastatic'), 'A1BG':]

# Expression data for each of the four cell lines
FMexp = melanoma_log2.loc[melanoma_log2['cell_line'].eq('FM'), 'A1BG':]
SK28exp = melanoma_log2.loc[melanoma_log2['cell_line'].eq('SK_MEL_28'), 'A1BG':]
SK147exp = melanoma_log2.loc[melanoma_log2['cell_line'].eq('SK_MEL_147'), 'A1BG':]
UACCexp = melanoma_log2.loc[melanoma_log2['cell_line'].eq('UACC_62'), 'A1BG':]

Then, calculate the variance of each gene across all samples, sort the variances in descending order, and extract the gene names for the top 10 most variably expressed genes.

In [None]:
# Per-gene variance across all samples (DataFrame.var works column-wise),
# ordered from the most to the least variable gene
overall_variance = (
    melanoma_log2.loc[:, 'A1BG':]
    .var()
    .sort_values(ascending=False)
)
overall_variance.head()

# Gene names of the 10 most variably expressed genes
topvarGens = overall_variance.head(10).index
topvarGens

PMEL     47.63
TYRP1    38.53
AEBP1    36.08
GLUL     33.23
TYR      32.25
dtype: float64

Index(['PMEL', 'TYRP1', 'AEBP1', 'GLUL', 'TYR', 'EEF1A2', 'CDC42EP1', 'A2M',
       'SOD3', 'TGFBI'],
      dtype='object')

### <font color="blue">Comparing Samples:</font>

1. Compare the expression data from the normal samples and the metastatic samples for the gene TYRP1 using a parametric test and then a non-parametric test.

    (Hint: You're comparing two samples here.)

In [None]:
# Parametric comparison of two independent groups: Student's t-test.
# Null hypothesis: normal and metastatic samples have the same mean TYRP1 expression.
stat, p = stats.ttest_ind(
    normalExp['TYRP1'],
    metastaticExp['TYRP1'],
)
stat
p

7.633007637287068

1.7723103638346948e-05

In [None]:
# Non-parametric comparison of two independent groups: Mann-Whitney U test.
# Null hypothesis: TYRP1 expression follows the same distribution in both groups.
stat, p = stats.mannwhitneyu(
    normalExp['TYRP1'],
    metastaticExp['TYRP1'],
)
p

0.008115024287306677

2. Compare the expression data for gene TYRP1 across all four cell lines using a parametric test and then a non-parametric test.

In [None]:
# Parametric comparison of more than two groups: one-way ANOVA.
# A small p-value indicates that at least one cell line has a different mean TYRP1 expression.
stat, p = stats.f_oneway(
    *[cell_line_exp['TYRP1'] for cell_line_exp in (FMexp, SK147exp, SK28exp, UACCexp)]
)
p

1.4222775663570639e-08

In [None]:
# Non-parametric comparison of more than two groups: Kruskal-Wallis H test.
# It is rank-based, so it compares the groups' medians/distributions rather than their means;
# a small p-value indicates that at least one cell line differs from the others.
stat, p = stats.kruskal(
    *[cell_line_exp['TYRP1'] for cell_line_exp in (FMexp, SK147exp, SK28exp, UACCexp)]
)
p

0.01878535576146058