In [None]:
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

import plotly.graph_objs as go
import plotly.offline as offline
offline.init_notebook_mode()

Credits: based on https://www.kaggle.com/crawford/principle-component-analysis-gene-expression/notebook

# PARTE 1: PCA con los datos de genes de (Golub et al.) https://www.kaggle.com/crawford/principle-component-analysis-gene-expression/

Datos usados para clasificar pacientes con leucemia mieloide aguda (AML, acute myeloid leukemia) y leucemia linfoblástica aguda (ALL, acute lymphoblastic leukemia).

Golub et al "Molecular Classification of Cancer: Class Discovery and Class
Prediction by Gene Expression Monitoring"

There are two datasets containing the initial (training, 38 samples) and independent (test, 34 samples) datasets used in the paper. These datasets contain measurements corresponding to ALL and AML samples from Bone Marrow and Peripheral Blood. Intensity values have been re-scaled such that overall intensities for each chip are equivalent.

# Analysis steps

1. Remove columns that contain "Call" data
2. Transpose the dataframe so that each row is a patient and each column is a gene
3. Remove gene description header and set the gene accession numbers as the column headers
4. Split into train/test sets
5. Scale values to zero mean and unit variance
6. PCA analysis
7. To do: K-means cluster

## (1) Load data

In [None]:
# Locations of the expression matrices (train / independent test) and of the
# table holding the actual class of every patient.
trainfile = '../Datasets/genes/data_set_ALL_AML_train.csv'
testfile = '../Datasets/genes/data_set_ALL_AML_independent.csv'
labels = '../Datasets/genes/genes.actual.csv'

# Read the three CSV files: training matrix, test matrix, label table.
X_train, X_test, y = (pd.read_csv(path) for path in (trainfile, testfile, labels))

In [None]:
# 1) Drop the "call" columns from the training and test frames:
#    keep every column whose header does not contain the substring "call".
train_keepers = list(filter(lambda col: "call" not in col, X_train.columns))
test_keepers = list(filter(lambda col: "call" not in col, X_test.columns))

X_train, X_test = X_train[train_keepers], X_test[test_keepers]

In [None]:
# 2) Swap rows and columns of both frames, then display the training frame
X_train, X_test = X_train.transpose(), X_test.transpose()
X_train

In [None]:
# 3) Use the second row (position 1) as column headers, then remove the two
#    descriptive rows and convert what is left to numeric values.
header_rows = ["Gene Description", "Gene Accession Number"]

# Training data
X_train.columns = X_train.iloc[1]
X_train = X_train.drop(index=header_rows)
X_train = X_train.apply(pd.to_numeric)

# Test data
X_test.columns = X_test.iloc[1]
X_test = X_test.drop(index=header_rows)
X_test = X_test.apply(pd.to_numeric)

X_train.head()

In [None]:
# 4) Split into train and test
#
# reset_index(drop=True) throws the row labels away, after which features and
# labels are matched purely by position. Both sides are therefore sorted by
# patient number first, so row i of X_* and row i of y_* describe the same
# patient regardless of the column order in the CSV files.
# NOTE(review): assumes the row labels remaining after step 3 are the patient
# numbers that `y.patient` refers to - confirm against the CSV headers.
def _order_by_patient(expr):
    """Return a copy of `expr` sorted by numeric row label, re-indexed 0..n-1."""
    ordered = expr.copy()
    ordered.index = pd.to_numeric(ordered.index)
    return ordered.sort_index().reset_index(drop=True)


# Training subset: patients numbered up to 38
X_train = _order_by_patient(X_train)
y_train = y[y.patient <= 38].sort_values("patient").reset_index(drop=True)

# Subset the rest for testing
X_test = _order_by_patient(X_test)
y_test = y[y.patient > 38].sort_values("patient").reset_index(drop=True)


## Exploratory data analysis

Realiza un análisis exploratorio de los datos (correlaciones entre sí y con las clases, distribuciones,...). Usa las técnicas y gráficos que te parezcan más representativos.

## (2) Principal Component Analysis

The analysis reveals that 21 principal components are needed to account for 80% of the variance. PCs 1-3 add up to about 33%, and the rest is a slow burn where each component after PC8 contributes between 1% and 2% of the variance, up until PC38, which is essentially zero. 1% is a decent amount of variance, so the number of important PCs is up for interpretation.

In [None]:
# 5) Scale data 
# (1) YOUR CODE HERE: Use the StandardScaler (separately for train and test sets);
#     store the scaled training data in X_train_scl, which later cells use


In [1]:
# 6) PCA Analysis and projection
# Number of principal components to keep.
components = 21
# YOUR CODE HERE:
# (2) Use PCA with this number of components on the train set, with Y the
#     result of the procedure. Name the estimator `pca`: the projection cell
#     further down calls pca.fit_transform(X_train_scl).

# (3) Retrieve the explained variance ratio and compute its cumulative sum;
#     save those values in the variables var_exp and cum_var_exp


In [None]:
# Per-component ratios on the first line(s), their running total after them
print(var_exp, cum_var_exp, sep="\n")

**Pregunta (1)**: ¿Qué pauta puede observarse en los valores de var_exp? ¿Cuál es la interpretación relativa de esos valores?

In [None]:
# Plot the explained variance using var_exp and cum_var_exp.
# One tick label per entry of var_exp: range(1, components) stopped one short,
# so the last component had no matching label.
x = ["PC%s" % i for i in range(1, len(var_exp) + 1)]

# Individual contribution of every component
trace1 = go.Bar(
    x=x,
    y=list(var_exp),
    name="Explained Variance")

# Running total of the contributions
trace2 = go.Scatter(
    x=x,
    y=cum_var_exp,
    name="Cumulative Variance")

layout = go.Layout(
    title='Explained variance',
    xaxis=dict(title='Principle Components', tickmode='linear'))

data = [trace1, trace2]
fig = go.Figure(data=data, layout=layout)
offline.iplot(fig)

## (3) Projection of first three components
The first three components only explain 33% of the variance, but we'll go ahead and plot the projection to get a visual of it.

In [None]:
# Project the scaled training samples onto the principal components;
# only the first three columns are drawn below.
Y_train_pca = pca.fit_transform(X_train_scl)

traces = []
for name in ['ALL', 'AML']:
    # Row mask for this class, computed once and reused for the three axes.
    in_class = (y_train.cancer == name).values
    trace = go.Scatter3d(
        x=Y_train_pca[in_class, 0],
        y=Y_train_pca[in_class, 1],
        z=Y_train_pca[in_class, 2],
        mode='markers',
        name=name,
        # Plain dicts instead of the deprecated go.Marker / go.Line wrappers.
        marker=dict(size=10, line=dict(width=1), opacity=1))

    traces.append(trace)

# Axes of a 3D trace live under `scene`; top-level xaxis/yaxis only apply to
# 2D plots, so the PC titles would otherwise never be shown.
layout = go.Layout(
    title="Projection of First Three Principle Components",
    scene=dict(
        xaxis=dict(title='PC1'),
        yaxis=dict(title='PC2'),
        zaxis=dict(title='PC3'))
)

data = traces
fig = go.Figure(data=data, layout=layout)
offline.iplot(fig)


**Pregunta(2)**: Modificando la perspectiva de la figura con el ratón, ¿qué observas en cuanto a la separabilidad de las clases? Adjunta una imagen que apoye tus conclusiones.

# Parte 2: Linear Discriminant Analysis

In [None]:
# Show the dimensions of the scaled training data that LDA is fitted on below
X_train_scl.shape

In [None]:
# LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Request a single discriminant axis.
sklearn_lda = LDA(n_components=1)

# Fit on the scaled training data using the training labels produced by the
# split step (the same `y_train.cancer` the PCA projection uses), instead of a
# positional slice of the full label table.
X_lda_sklearn = sklearn_lda.fit_transform(X_train_scl, y_train.cancer)


In [None]:

from matplotlib import pyplot as plt 

def _plot_lda_projection(X, labels, title, flip_ld2=False):
    """Scatter an LDA projection using one marker and colour per class.

    Parameters
    ----------
    X : array-like, shape (n_samples,) or (n_samples, n_discriminants)
        Projected samples. The first column goes on the horizontal axis and
        the second, when present, on the vertical axis. When there is only
        one discriminant every point is drawn at height 0.
    labels : array-like, length n_samples
        Class of each row of ``X``; the distinct values are used as legend
        entries.
    title : str
        Figure title.
    flip_ld2 : bool, optional
        Negate the second discriminant before plotting.

    Raises
    ------
    ValueError
        If ``labels`` and ``X`` do not have the same number of rows.
    """
    # Only the real part is plotted.
    coords = np.real(np.asarray(X))
    if coords.ndim == 1:
        coords = coords.reshape(-1, 1)
    classes = np.asarray(labels)
    if len(classes) != coords.shape[0]:
        raise ValueError(
            "labels has %d entries but X has %d rows"
            % (len(classes), coords.shape[0]))

    has_ld2 = coords.shape[1] > 1
    ld1 = coords[:, 0]
    if has_ld2:
        ld2 = -coords[:, 1] if flip_ld2 else coords[:, 1]
    else:
        # No second discriminant to index: place all points on one line.
        ld2 = np.zeros_like(ld1)

    markers = ('^', 's', 'o')
    colors = ('blue', 'red', 'green')

    fig, ax = plt.subplots()
    for i, cls in enumerate(pd.unique(classes)):
        member = classes == cls
        ax.scatter(ld1[member], ld2[member],
                   marker=markers[i % len(markers)],
                   color=colors[i % len(colors)],
                   alpha=0.5,
                   label=str(cls))

    ax.set_xlabel('LD1')
    if has_ld2:
        ax.set_ylabel('LD2')
    else:
        ax.set_yticks([])

    leg = ax.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.5)
    ax.set_title(title)

    # hide axis ticks (booleans; the strings 'on'/'off' are not switches)
    ax.tick_params(axis='both', which='both', bottom=False, top=False,
                   labelbottom=True, left=False, right=False, labelleft=True)

    # remove axis spines
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)

    ax.grid(True)
    fig.tight_layout()
    plt.show()


def plot_step_lda():
    """Plot the notebook's LDA projection `X_lda_sklearn` by training class.

    Reads the globals `X_lda_sklearn` and `y_train` defined in earlier cells.
    """
    _plot_lda_projection(
        X_lda_sklearn, y_train.cancer,
        title='LDA: ALL/AML projection onto the linear discriminant(s)')


def plot_scikit_lda(X, title, labels=None):
    """Plot an LDA projection `X` with the second discriminant flipped.

    Parameters
    ----------
    X : array-like
        Projected samples, one row per sample.
    title : str
        Figure title.
    labels : array-like, optional
        Class of each row of `X`. Defaults to the global `y_train.cancer`.
    """
    if labels is None:
        labels = y_train.cancer
    _plot_lda_projection(X, labels, title, flip_ld2=True)


plot_step_lda()
plot_scikit_lda(X_lda_sklearn, title='Default LDA via scikit-learn')
