# 1. Preprocessing

This notebook includes the following tasks:
1. Load the dataset
2. Explore the dataset

In [None]:
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.cluster import hierarchy
import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyo
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
from sklearn.decomposition import PCA
from scipy.spatial.distance import pdist, squareform
# Switch Plotly to offline notebook mode so figures render inside this notebook
pyo.init_notebook_mode()

### 1.1 Load the dataset

#### 1.1.1 Matrix with observations and features (X)

Please load the file and reshape it into the following format:
- features as rows
- observations as columns
- all row index and column names should be unique.
- all values in the dataframe should be numerical. All metadata should be cleaned.

Please edit the code below to load your data.

In [None]:
# Read the matrix (features (X) as rows, observations as columns),
# discard the 'gene_id' column and average all rows sharing a 'gene_name'.
input_path = "inputs/test_norm_exp_quant.csv"
data = (
    pd.read_csv(input_path, sep=",", index_col=0)
    .drop(columns=['gene_id'])
    .groupby('gene_name')
    .mean()
)

# Quick look at the result: dimensions, then the first five row and column labels
print(data.shape)
print("#### Data index ####", data.index[:5], sep="\n")
print("#### Data columns ####", data.columns[:5], sep="\n")

In [None]:
# To do a quick test run on a subset of the data, uncomment the line below.
# Keep it commented out when training the real model.
# data = data.head(1000)

#### 1.1.2 Handle Missing Values

In [None]:
# Number of missing entries in each column of the matrix
missing_values = data.isna().sum()
print("Missing Values:\n", missing_values)

In [None]:
# Decide on the method to handle missing values
# For example, let's fill missing values with the mean of the column
# data = data.fillna(data.mean())

#### 1.1.3 Check the rows with all zeros

In [None]:
# Flag the rows whose value is 0 in every column, report how many there are,
# and drop them from the matrix when at least one exists.
all_zero_rows = data.eq(0).all(axis=1)
count = int(all_zero_rows.sum())
print("Number of rows with all zeros: "+str(count))
if count > 0:
    data = data[~all_zero_rows]
    print(str(count)+" rows are removed.")

#### 1.1.4. Identify outliers using z-scores

In [None]:
# Absolute z-score of every value relative to the mean and standard deviation
# of its own column (pandas reduces column-wise by default).
column_mean = data.mean()
column_std = data.std()
z_scores = data.sub(column_mean).div(column_std).abs()

# Per column, count the values lying more than 3 standard deviations from the
# mean; columns with the most such values are listed first.
outliers = z_scores.gt(3).sum().sort_values(ascending=False)
outliers

In [None]:
# Outlier handling: keep only the rows in which every z-score is at most 10.
# Comparisons against NaN evaluate to False, so a row holding a NaN z-score
# is dropped as well.
# NOTE(review): the cut-off used here (10) is looser than the 3 used for
# counting outliers in the previous cell - confirm this is intentional.
rows_within_limit = z_scores.le(10).all(axis=1)
data = data[rows_within_limit]

#### 1.1.5 Load labels (y)

Please make sure the observation names are consistent with the column names in the above matrix (X).

In [None]:
# Read the tab-separated labels (y) file, using its first column as the index
# (the observation names), and squeeze the result so that a single-column
# table becomes a named Series.
labels_path = "inputs/test_labels.csv"
labels = (
    pd.read_csv(labels_path, sep="\t", index_col=0)
    .squeeze()
)

# Preview the first five labels
print("#### Labels ####", labels.iloc[:5], sep="\n")

In [None]:
# Sanity checks on the matrix; each failed check prints a message.

# Row names must be unique
if data.index.has_duplicates:
    print("Row names are not unique.")

# Column names must be unique
if data.columns.has_duplicates:
    print("Column names are not unique.")

# Every value must be numeric: anything that cannot be converted becomes NaN
# under errors='coerce' and is caught by the notna() test.
coerced_values = pd.to_numeric(data.stack(), errors='coerce')
is_numeric = coerced_values.notna().all()
if not is_numeric:
    print("Some values are not numeric.")

### 1.2 Explore the dataset

This section is for understanding your dataset statistically and visually.

#### 1.2.1 Description

In [None]:
# Display summary statistics for each column of the matrix
data.describe()

In [None]:
# Show the dimensions of the matrix followed by its column/dtype summary.
print(data.shape)
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would append a stray "None" to the output).
data.info()
# print(data.head())

#### 1.2.2 Process the data for visualization

In [None]:
# Transform the data into log10 scale for further visualization
# (1 is added first so that zero values map to 0 instead of -inf).
data_viz = np.log10(data+1)

# Build the long table: one row per (feature, sample) pair.
# The feature names live in the index, so they are moved into a regular column
# on a temporary copy and used as the identifier for the melt. Using
# data_viz.columns[0] as id_vars would instead treat the first *sample* as the
# identifier and leave that sample out of the long table.
# data_viz itself keeps its index because later cells use it as a matrix.
feature_col = data_viz.index.name or "index"
data_long = (
    data_viz.rename_axis(index=feature_col)
    .reset_index()
    .melt(id_vars=feature_col, var_name='sample', value_name='ExpLog10')
)

# Merging data long table with labels (y)
data_long = pd.merge(data_long, labels, left_on='sample', right_on="Sample name")

#### 1.2.3 PCA

In [None]:
# Reduce the log-scaled matrix to three principal components.
# The matrix is transposed before fitting so that each column of data_viz
# (one sample) becomes one row of the PCA input.
pca = PCA(n_components=3)
pca_data = pd.DataFrame(
    pca.fit_transform(data_viz.T),
    columns=["PC 1", "PC 2", "PC 3"],
)

# Attach the sample names, then bring in the labels (y) for colouring
pca_data["sample"] = data_viz.columns
pca_data = pca_data.merge(labels, left_on='sample', right_on="Sample name")

# Interactive 3D scatter of the samples, coloured by the 'Remarks' label
fig = px.scatter_3d(
    pca_data,
    x='PC 1',
    y='PC 2',
    z='PC 3',
    color='Remarks',
    hover_name="sample",
)
fig.show()

In [None]:
# Draw one box per column of the log-scaled matrix with Seaborn and keep the
# returned Axes so the title and axis labels can be set on it directly.
ax = sns.boxplot(data=data_viz)
ax.set_title('Box Plot')
ax.set_xlabel('Group')
ax.set_ylabel('Values')

# Render the figure
plt.show()