In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the expression table from CSV. Later cells treat the first column as a
# non-numeric label (presumably the 'Patient' column read further down) and
# every remaining column as a numeric feature.
data = pd.read_csv('expression.csv')

In [None]:
# Dimensions of the loaded table as (rows, columns).
data.shape

In [None]:
# Preview the first rows of the loaded table.
data.head()

In [None]:
# Preview the last rows of the loaded table.
data.tail()

In [None]:
# Numeric matrix for the factorization: skip the first (label) column and
# force every remaining column to float64.
data_np = np.array(data.iloc[:, 1:], dtype=np.float64)

In [None]:
# Display the converted float64 matrix.
data_np

In [None]:
from sklearn.decomposition import NMF

In [None]:
%%time
# Factorize the matrix into 5 non-negative components with a fixed seed.
# NOTE(review): the `alpha` keyword was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of `alpha_W` / `alpha_H`, which scale the penalty
# differently — confirm the pinned scikit-learn version before upgrading.
model = NMF(n_components=5, random_state=1, alpha=0.1, l1_ratio=0.5)
model.fit(data_np)

In [None]:
# Shape of the learned component matrix: (n_components, n_features).
model.components_.shape

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted feature names of every component.

    Args:
        model: Fitted object exposing ``components_`` as an iterable of 1-D
            NumPy weight arrays (one per component).
        feature_names: Sequence mapping a weight index to its display name.
        n_top_words: How many names to list per component.

    One ``Topic #<k>: ...`` line is printed per component, with names ordered
    from the largest weight downwards, followed by a single blank line after
    the last component.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest = weights.argsort()[::-1][:n_top_words]
        names = [feature_names[idx] for idx in strongest]
        print("Topic #%d: " % topic_idx + " ".join(names))
    print()

In [None]:
# Show the 5 strongest features of each component; columns[1:] skips the
# first column so names line up with the columns of data_np.
print_top_words(model, data.columns.values[1:], 5)

In [None]:
def get_topics(model, feature_names, n_top_words):
    """Collect the strongest positively-weighted features of each component.

    Args:
        model: Fitted object exposing ``components_`` as an iterable of 1-D
            NumPy weight arrays (one per component).
        feature_names: Sequence mapping a weight index to its name.
        n_top_words: Maximum number of features to keep per component.

    Returns:
        A ``(topics, position)`` tuple of parallel lists with one entry per
        component. ``topics[k]`` holds feature names in descending weight
        order, restricted to weights strictly greater than zero.
        ``position[k]`` holds the same features as ``index + 1``, which lets
        the notebook use them with ``iloc`` on a table that has one extra
        leading column.
    """
    topics = []
    position = []
    for topic in model.components_:
        # Rank and filter once, then derive both outputs from the same
        # indices so the two lists always stay aligned.
        ranked = topic.argsort()[:-n_top_words - 1:-1]
        kept = [i for i in ranked if topic[i] > 0]
        topics.append([feature_names[i] for i in kept])
        position.append([i + 1 for i in kept])
    return topics, position

In [None]:
# Keep up to 64 positively-weighted features per component. `topics` holds
# their column names and `position` their column offsets (index + 1).
topics, position = get_topics(model, data.columns.values[1:], 64)

In [None]:
# For every component, keep only the rows whose value is strictly positive in
# all of that component's selected gene columns (one filtered frame per topic).
people = [data[(data[topic] > 0).all(axis=1)] for topic in topics]

In [None]:
# Rows that passed the positivity filter for the first component.
people[0]

# Disease lookup for 'run-ca' sample numbers (the ovarian list is the one
# hard-coded in get_disease; every other run number is treated as colon):
# 06 08 10 13 15 16 18 => Ovarian
# 03 05 07 09 11 14 17 19 20 21 22 23 24 => Colon

In [None]:
def get_disease(name):
    """Map a sample label to a disease category based on its prefix.

    Prefix checks are case-insensitive and mutually exclusive:
      * ``bc``     -> 'Breast Cancer'
      * ``org``    -> 'Organoid'
      * ``run-ca`` -> 'Ovarian Cancer' when the two characters after the
        prefix parse to one of 6, 8, 10, 13, 15, 16, 18; otherwise
        'Colon Cancer'
    Any other label is reported as 'Normal'.

    Raises:
        ValueError: if a ``run-ca`` label is not followed by text that
            ``int`` can parse.
    """
    ovarian_runs = {6, 8, 10, 13, 15, 16, 18}

    if name[:2].lower() == 'bc':
        return 'Breast Cancer'
    if name[:3].lower() == 'org':
        return 'Organoid'
    if name[:6].lower() != 'run-ca':
        return 'Normal'

    # Only 'run-ca' labels reach this point; the run number decides the type.
    run_number = int(name[6:8])
    return 'Ovarian Cancer' if run_number in ovarian_runs else 'Colon Cancer'

In [None]:
# Report, group by group, each patient label with the disease derived from it.
for cluster_no, members in enumerate(people):
    print(f'Cluster {cluster_no}:')
    for patient in members['Patient']:
        disease = get_disease(patient)
        print(f'{patient}: {disease}')
    print('=' * 50)

In [None]:
from sklearn.cluster import KMeans
from yellowbrick.cluster import KElbowVisualizer

In [None]:
# Draw one elbow plot per patient group to help choose its K-Means cluster count.
for i, person in enumerate(people):
    print(f'For person group {i}')
    # Fixed seed (same value as the NMF cell) so the elbow curves, and the
    # cluster counts chosen from them, are reproducible across runs.
    kmean = KMeans(random_state=1)
    # Per the yellowbrick docs a 2-tuple k is expanded like range(lo, hi), so
    # fewer clusters than rows are tried, capped at 5.
    visualizer = KElbowVisualizer(kmean, k=(1, min(len(person), 6)))
    # position[i] selects this group's gene columns by integer offset.
    visualizer.fit(person.iloc[:, position[i]])
    # NOTE(review): poof() is the pre-1.0 yellowbrick name for show() —
    # confirm the installed version before renaming.
    visualizer.poof()

In [None]:
# Number of K-Means clusters to fit for each of the five patient groups.
# NOTE(review): presumably read off the elbow plots — confirm, and re-check
# these values whenever the data or the NMF settings change.
n_each_cluster = [2, 1, 1, 2, 2]

In [None]:
# Cluster each patient group on its selected gene columns and collect the
# per-row labels, one array per group.
result = []
for i, n in enumerate(n_each_cluster):
    # Slice once and reuse for both fit and predict.
    features = people[i].iloc[:, position[i]]
    # Fixed seed (same value as the NMF cell) so the labels are reproducible.
    kmean = KMeans(n_clusters=n, random_state=1)
    kmean.fit(features)
    result.append(kmean.predict(features))

In [None]:
# Per-group arrays of K-Means labels, in the same order as `people`.
result

In [None]:
# First two columns of the first group's rows as a raw NumPy array.
people[0].iloc[:, [0, 1]].values

In [None]:
# Print each group's patient labels with the disease derived from the label.
# NOTE(review): this repeats the earlier cluster printout verbatim and does
# not use the K-Means labels stored in `result` — confirm whether the
# sub-cluster labels were meant to be shown here.
for i, person in enumerate(people):
    print(f'Cluster {i}:')
    for row in person['Patient']:
        print(f'{row}: {get_disease(row)}')
    print('='*50)