# Stem Cell Gene Expression Analysis
### Simple Version - Just Run All Cells

**Instructions:**
1. Have your `gene_expression_data.csv` file ready — the first cell will prompt you to upload it once the notebook is running
2. Click Runtime → Run all
3. Wait for results

In [None]:
# Step 1: Upload CSV file
# Opens the Colab upload dialog and makes sure the data ends up on disk under
# the file name that the "Load Data" cell reads.
from google.colab import files

EXPECTED_FILENAME = 'gene_expression_data.csv'

print('Upload gene_expression_data.csv file:')
uploaded = files.upload()  # mapping: uploaded file name -> file contents

if not uploaded:
    # Dialog was cancelled or nothing was selected: stop here with a clear
    # message instead of reporting success and failing later in read_csv.
    raise RuntimeError(
        f'No file was uploaded. Re-run this cell and select {EXPECTED_FILENAME}.'
    )

if EXPECTED_FILENAME not in uploaded:
    if len(uploaded) != 1:
        raise RuntimeError(
            f'Expected {EXPECTED_FILENAME} but received: {sorted(uploaded)}'
        )
    # A single file arrived under another name (for example a renamed copy or
    # a repeated upload). Store its contents under the expected name so the
    # next cell reads this upload rather than a missing or stale file.
    actual_name = next(iter(uploaded))
    with open(EXPECTED_FILENAME, 'wb') as out_file:
        out_file.write(uploaded[actual_name])
    print(f'Saved "{actual_name}" as {EXPECTED_FILENAME}')

print('File uploaded successfully!')

In [None]:
# Step 2: Import libraries
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Hide library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Newer matplotlib releases expose the seaborn style sheets with a
# 'seaborn-v0_8-' prefix, while older releases only know the unprefixed name.
# Pick whichever one this installation provides so the cell runs on both.
plot_style = 'seaborn-v0_8-darkgrid'
if plot_style not in plt.style.available:
    plot_style = 'seaborn-darkgrid'
plt.style.use(plot_style)
print('✓ Libraries loaded')

In [None]:
# Part 1: Load Data
# Read the uploaded table and print a short summary of its size, the sampled
# time points and the cell types it contains.
df = pd.read_csv('gene_expression_data.csv')

n_rows, n_cols = df.shape
time_points = sorted(df["Time"].unique())
cell_types = df["Type"].unique()

print(f'Dataset loaded: {n_rows} cells, {n_cols} features')
print(f'Time points: {time_points}')
print(f'Cell types: {cell_types}')

# Last expression of the cell: the notebook renders the first rows as a table.
df.head()

In [None]:
# Part 2: Visualize Bmp4 and Nanog genes
# One panel per gene. For each cell type the line is the mean expression at
# every time point and the shaded band is mean +/- one standard deviation.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, gene in zip(axes, ['Bmp4', 'Nanog']):
    for cell_type in ['E14', 'R1']:
        subset = df.loc[df['Type'] == cell_type]
        summary = subset.groupby('Time')[gene].agg(['mean', 'std'])
        times = summary.index
        center = summary['mean']
        spread = summary['std']

        ax.plot(times, center, marker='o', label=cell_type, linewidth=2)
        ax.fill_between(times, center - spread, center + spread, alpha=0.2)

    ax.set_title(f'{gene} Expression Over Time')
    ax.set_xlabel('Time (hours)')
    ax.set_ylabel('Expression Level')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
print('✓ Part 2 complete')

In [None]:
# Part 3: Gene Correlations
# Pairwise correlation between the selected genes, computed separately for
# the cells sampled at Time == 0 and at Time == 168, shown as two heatmaps.
genes = ['Nanog', 'Pou5f1', 'Sox2', 'Gata6', 'Pax6', 'Sox1', 'Actb', 'Bmp4']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

corr_t0 = df.loc[df['Time'] == 0, genes].corr()
corr_t168 = df.loc[df['Time'] == 168, genes].corr()

# Both heatmaps share the same look and the same fixed [-1, 1] colour range.
heatmap_options = dict(annot=True, fmt='.2f', cmap='coolwarm', center=0,
                       vmin=-1, vmax=1, square=True)
panels = [
    (axes[0], corr_t0, 'Gene Correlations at Time 0h'),
    (axes[1], corr_t168, 'Gene Correlations at Time 168h'),
]
for ax, corr_matrix, title in panels:
    sns.heatmap(corr_matrix, ax=ax, **heatmap_options)
    ax.set_title(title)

plt.tight_layout()
plt.show()
print('✓ Part 3 complete')

In [None]:
# Part 4: PCA Analysis
# Project every row onto the first two principal components of the data
# (all columns except the 'Time' and 'Type' metadata) and plot the result
# coloured by time, once for all cells and once split by cell type.
X = df.drop(['Time', 'Type'], axis=1)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

pc1_ratio, pc2_ratio = pca.explained_variance_ratio_
print(f'PC1 variance: {pc1_ratio:.1%}')
print(f'PC2 variance: {pc2_ratio:.1%}')

# Use one colour scale for every scatter call. Without explicit limits each
# call normalises colours to the min/max of the points it receives, so the two
# cell types could map the same time to different colours and the colorbar
# (built from the last call only) would not describe the other cell type.
time_min, time_max = df['Time'].min(), df['Time'].max()

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Colored by time
scatter1 = axes[0].scatter(X_pca[:, 0], X_pca[:, 1], c=df['Time'],
                           cmap='viridis', vmin=time_min, vmax=time_max,
                           alpha=0.6, s=50)
axes[0].set_xlabel(f'PC1 ({pc1_ratio:.1%})')
axes[0].set_ylabel(f'PC2 ({pc2_ratio:.1%})')
axes[0].set_title('PCA - Colored by Time')
plt.colorbar(scatter1, ax=axes[0], label='Time (hours)')

# By cell type: marker shape encodes the cell type, colour encodes time
for cell_type, marker in [('E14', 'o'), ('R1', '^')]:
    mask = (df['Type'] == cell_type).to_numpy()  # plain boolean array for NumPy indexing
    scatter2 = axes[1].scatter(X_pca[mask, 0], X_pca[mask, 1],
                               c=df.loc[mask, 'Time'], cmap='viridis',
                               vmin=time_min, vmax=time_max,
                               marker=marker, label=cell_type,
                               alpha=0.6, s=50)
axes[1].set_xlabel(f'PC1 ({pc1_ratio:.1%})')
axes[1].set_ylabel(f'PC2 ({pc2_ratio:.1%})')
axes[1].set_title('PCA - Cell Types & Time')
axes[1].legend()
# Valid for both cell types because every scatter call shares vmin/vmax.
plt.colorbar(scatter2, ax=axes[1], label='Time (hours)')

plt.tight_layout()
plt.show()
print('✓ Part 4 complete')

In [None]:
# Part 5: t-SNE Analysis
# Embed the feature matrix X (built in the PCA cell) in two dimensions with
# t-SNE and plot it coloured by time, once for all cells and once split by
# cell type. random_state is fixed so repeated runs give the same layout.
print('Running t-SNE (this takes ~30 seconds)...')
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_tsne = tsne.fit_transform(X)

# Use one colour scale for every scatter call. Without explicit limits each
# call normalises colours to the min/max of the points it receives, so the two
# cell types could map the same time to different colours and the colorbar
# (built from the last call only) would not describe the other cell type.
time_min, time_max = df['Time'].min(), df['Time'].max()

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Colored by time
scatter1 = axes[0].scatter(X_tsne[:, 0], X_tsne[:, 1], c=df['Time'],
                           cmap='viridis', vmin=time_min, vmax=time_max,
                           alpha=0.6, s=50)
axes[0].set_xlabel('t-SNE 1')
axes[0].set_ylabel('t-SNE 2')
axes[0].set_title('t-SNE - Colored by Time')
plt.colorbar(scatter1, ax=axes[0], label='Time (hours)')

# By cell type: marker shape encodes the cell type, colour encodes time
for cell_type, marker in [('E14', 'o'), ('R1', '^')]:
    mask = (df['Type'] == cell_type).to_numpy()  # plain boolean array for NumPy indexing
    scatter2 = axes[1].scatter(X_tsne[mask, 0], X_tsne[mask, 1],
                               c=df.loc[mask, 'Time'], cmap='viridis',
                               vmin=time_min, vmax=time_max,
                               marker=marker, label=cell_type,
                               alpha=0.6, s=50)
axes[1].set_xlabel('t-SNE 1')
axes[1].set_ylabel('t-SNE 2')
axes[1].set_title('t-SNE - Cell Types & Time')
axes[1].legend()
# Valid for both cell types because every scatter call shares vmin/vmax.
plt.colorbar(scatter2, ax=axes[1], label='Time (hours)')

plt.tight_layout()
plt.show()
print('✓ Part 5 complete')
print('\n✓✓✓ ALL ANALYSIS COMPLETE ✓✓✓')