In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# helper function for loading expression data
from src.load_data import load_expression_data

In [None]:
# Read the expression table. Despite the .csv extension the file is parsed
# as tab-delimited text.
df = pd.read_csv("../data/expression_data.csv", delimiter="\t")

# Show the first rows as a quick sanity check that parsing worked.
df.head()


In [None]:
# Report the dimensions and column labels of the dataset.
# A notebook cell only renders its final expression, so the shape is printed
# explicitly; as a bare expression above `df.columns` it would be evaluated
# and silently discarded.
print(f"rows x columns: {df.shape}")
df.columns

In [None]:
# Index the frame by gene name.
# NOTE: this cell rebinds `df`, so running it a second time without reloading
# the data raises a KeyError ("gene_name" is no longer a column).
df = df.set_index("gene_name")

# Keep the gene IDs (now indexed by gene name) for later reference.
gene_ids = df.loc[:, "gene_id"]

# Remove the ID column so that only the expression columns remain.
df = df.drop("gene_id", axis="columns")


In [None]:
# Convert every column to a numeric dtype. With errors="coerce", entries that
# cannot be parsed as numbers become NaN instead of raising.
n_missing_before = int(df.isna().sum().sum())
df = df.apply(pd.to_numeric, errors="coerce")
n_missing_after = int(df.isna().sum().sum())

# Surface how much data is about to be altered: coercion followed by a zero
# fill would otherwise hide malformed input without any trace.
print(
    f"{n_missing_after} missing value(s) will be set to 0 "
    f"({n_missing_after - n_missing_before} produced by non-numeric entries)"
)

# Replace all missing values with 0. `fillna` leaves the values unchanged when
# there is nothing to fill, so no explicit null check is required.
df = df.fillna(0)


In [None]:
# Average each row (gene) over all columns (samples).
gene_means = df.mean(axis="columns")

# Retain only the genes whose average is strictly greater than 1.
df_filtered = df[gene_means.gt(1)]


In [None]:
# Element-wise log(1 + x) transform to compress the range and reduce skew.
df_log = df_filtered.pipe(np.log1p)


In [None]:
def _plot_expression_hist(frame, title, xlabel):
    """Draw a 50-bin histogram of every value in ``frame`` on a new figure.

    Args:
        frame: DataFrame whose cells are pooled into one distribution.
        title: Figure title.
        xlabel: Label for the x axis.
    """
    fig, ax = plt.subplots()
    # Pool all cells of the frame into a single 1-D array.
    ax.hist(frame.to_numpy().ravel(), bins=50)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Frequency")
    plt.show()


# Distribution of the filtered values before the log transform.
_plot_expression_hist(df_filtered, "Raw Expression Distribution", "Expression")

# Distribution of the same values after the log(1 + x) transform.
_plot_expression_hist(
    df_log, "Log-Normalized Expression Distribution", "log(Expression + 1)"
)


In [None]:
# Variance of each gene (row) across the samples (columns).
gene_variance = df_log.var(axis="columns")

# Rank genes from most to least variable and keep the labels of the first 30.
top_genes = gene_variance.sort_values(ascending=False).index[:30]

# Heatmap of the log-transformed values for those genes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(df_log.loc[top_genes], cmap="viridis", ax=ax)
ax.set_title("Top 30 Most Variable Genes")
plt.show()


In [None]:
# Persist the filtered, log-transformed table so later work can reuse it.
# The index is written out as the first column of the CSV.
df_log.to_csv("../data/expression_cleaned_log.csv", index=True)
