<a href="https://colab.research.google.com/github/chjohnso-um/chjohnso-um-CSC-587/blob/main/Normalization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Remove columns with 50% or more missing values

In [2]:
import pandas as pd

# Load your CSV file
df = pd.read_csv('your_file.csv')

# Drop columns where 50% or more values are missing.
# `thresh` is the minimum number of NON-missing values a column must have to be
# kept, so "strictly more than half present" is len(df) // 2 + 1. Using
# len(df) * 0.5 would keep a column that is exactly 50% missing whenever the
# row count is even, and would pass a float where an integer count is expected.
# A file with no data rows has nothing to judge, so every column is kept.
threshold = len(df) // 2 + 1 if len(df) else 0
df_cleaned = df.dropna(axis=1, thresh=threshold)

# Optionally, save the cleaned DataFrame to a new file
df_cleaned.to_csv('insert_name', index=False)


# Remove columns that the BL and V06 data do not share

In [3]:
import pandas as pd

# Read the two cleaned tables written by the previous step
df1, df2 = (pd.read_csv(csv_path) for csv_path in ('your_file1.csv', 'your_file2.csv'))

# Column labels present in both tables
common_cols = df1.columns.intersection(df2.columns)

# Restrict each table to the shared columns
df1_common = df1.loc[:, common_cols]
df2_common = df2.loc[:, common_cols]

# Write each restricted table to its own CSV, without the index column
for shared_frame, out_path in (
    (df1_common, 'insert_name1.csv'),
    (df2_common, 'insert_name2.csv'),
):
    shared_frame.to_csv(out_path, index=False)


# Fill empty cells with the column's median value

In [5]:
import pandas as pd

# Read the table written by the previous step (repeat this cell for each file)
df = pd.read_csv('saved_file_from_last_step')

# Median of every numeric column
column_medians = df.median(numeric_only=True)

# Replace each missing cell with the median of its own column; columns that
# have no entry in `column_medians` are not filled
df_filled = df.fillna(value=column_medians)

# Write the imputed table to a new CSV (or overwrite), without the index column
df_filled.to_csv('insert_name', index=False)


# Two-way normalization of the datasets: quantile normalization for the rows and z-score normalization for the columns.

In [9]:
import pandas as pd
import numpy as np
from scipy.stats import rankdata

# Read the imputed table
df = pd.read_csv('your_imputed_file.csv')

# Column labels, split by whether the column holds numbers
numeric_cols = df.select_dtypes(include=np.number).columns
non_numeric_cols = df.select_dtypes(exclude=np.number).columns

# Store every numeric column as float so later assignments do not trigger
# dtype warnings
df[numeric_cols] = df[numeric_cols].astype(np.float64)

# 1. Quantile Normalization (row-wise)
def quantile_normalize(df_input):
    """Quantile-normalize each row of ``df_input`` against a shared reference.

    Every row is sorted, and the values found at each sorted position are
    averaged across rows to form a reference distribution. Each cell is then
    replaced by the reference value at its within-row rank. Tied cells share
    the lowest rank of the tie (``method='min'``), so they all receive the
    same reference value.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Numeric table without missing values. Integer columns are accepted;
        the input is not modified.

    Returns
    -------
    pandas.DataFrame
        Float table with the same index and columns as ``df_input``.

    Raises
    ------
    ValueError
        If ``df_input`` contains missing values, because ranks are undefined
        for them. Impute before normalizing.
    """
    # Work on a float array so integer input never has to be upcast cell by
    # cell when the (generally fractional) rank means are written back.
    values = df_input.to_numpy(dtype=float)

    if np.isnan(values).any():
        raise ValueError(
            "quantile_normalize requires data without missing values; "
            "impute or drop NaNs first"
        )

    if values.size:
        # Reference distribution: mean across rows at each sorted position.
        rank_means = np.sort(values, axis=1).mean(axis=0)

        # 0-based within-row rank of every cell, computed for all rows at once.
        ranks = rankdata(values, method='min', axis=1).astype(np.intp) - 1

        # Look up the reference value for each rank (creates a new array).
        values = rank_means[ranks]

    return pd.DataFrame(values, index=df_input.index, columns=df_input.columns)

# Run the row-wise quantile normalization on the numeric columns
df_quantile = quantile_normalize(df[numeric_cols])

# 2. Z-score Normalization (column-wise): centre each column on its mean,
# then scale by its standard deviation
column_means = df_quantile.mean()
column_stds = df_quantile.std()
df_zscore = df_quantile.sub(column_means, axis='columns').div(column_stds, axis='columns')

# Put the non-numeric columns back in front of the normalized ones; both
# sides get a fresh 0..n-1 index so the concat lines up by row position
label_part = df[non_numeric_cols].reset_index(drop=True)
score_part = df_zscore.reset_index(drop=True)
df_final = pd.concat([label_part, score_part], axis=1)

# Write the final table, without the index column
df_final.to_csv('normalized_quantile_file.csv', index=False)

# Two-way normalization of the datasets: sum-to-1 normalization for the rows and z-score normalization for the columns.

In [2]:
import pandas as pd
import numpy as np

# Read the imputed table
df = pd.read_csv('your_imputed_file.csv')

# Column labels, split by whether the column holds numbers
numeric_cols = df.select_dtypes(include=np.number).columns
non_numeric_cols = df.select_dtypes(exclude=np.number).columns

# Store every numeric column as float (avoids dtype warnings, supports division)
df[numeric_cols] = df[numeric_cols].astype(np.float64)

# 1. Row-wise sum-to-1 normalization: divide every row by its own total
row_totals = df[numeric_cols].sum(axis=1)
df_sum1 = df[numeric_cols].divide(row_totals, axis='index')

# 2. Column-wise z-score normalization: centre each column on its mean,
# then scale by its standard deviation
column_means = df_sum1.mean()
column_stds = df_sum1.std()
df_zscore = df_sum1.sub(column_means, axis='columns').div(column_stds, axis='columns')

# Put the non-numeric columns back in front of the normalized ones; both
# sides get a fresh 0..n-1 index so the concat lines up by row position
label_part = df[non_numeric_cols].reset_index(drop=True)
score_part = df_zscore.reset_index(drop=True)
df_final = pd.concat([label_part, score_part], axis=1)

# Write the final table, without the index column
df_final.to_csv('normalized_sum_file.csv', index=False)
