In [2]:
import pandas as pd

# File paths
fpkm_file = "GSE183947_fpkm.csv"
metadata_file = "GSE183947_series_matrix.txt"

# Step 1: Load FPKM data
print("Loading FPKM data...")
fpkm_data = pd.read_csv(fpkm_file)
fpkm_data = fpkm_data.rename(columns=lambda x: x.strip())  # Clean column names
print(f"FPKM data loaded. Shape: {fpkm_data.shape}")

# Step 2: Extract Metadata
print("Extracting metadata...")
metadata_dict = {}
with open(metadata_file, "r") as f:
    for line in f:
        if line.startswith("!Sample_title") or line.startswith("!Sample_characteristics_ch1"):
            key, *values = line.strip().split("\t")
            metadata_dict[key] = values

# Convert metadata into a DataFrame
metadata = pd.DataFrame(metadata_dict).T
metadata.columns = metadata.iloc[0]  # Set first row as header
metadata = metadata[1:]  # Remove the header row
metadata.reset_index(inplace=True, drop=True)  # Reset index
metadata = metadata.rename(columns=lambda x: x.strip())  # Clean column names
metadata.columns.name = None  # Remove the column name from index

# Clean and adjust metadata sample titles
metadata.columns = [col.replace('"', '').strip() for col in metadata.columns]

# Step 3: Create a mapping between metadata and FPKM columns
# Assuming FPKM columns correspond to metadata samples in order
fpkm_samples = fpkm_data.columns[1:]  # Exclude 'GeneID'
metadata_samples = metadata.columns  # Metadata sample titles
mapping = dict(zip(metadata_samples, fpkm_samples))  # Create a mapping

# Step 4: Merge Data
print("Merging metadata with FPKM data...")
fpkm_data_t = fpkm_data.set_index("Unnamed: 0").T.reset_index()  # Transpose FPKM data
fpkm_data_t.rename(columns={"index": "Sample_title"}, inplace=True)

# Map FPKM sample names to metadata sample names
fpkm_data_t["Sample_title"] = fpkm_data_t["Sample_title"].map({v: k for k, v in mapping.items()})

# Merge metadata and FPKM data
merged_data = pd.merge(metadata.T, fpkm_data_t, left_index=True, right_on="Sample_title")

# Step 5: Save the merged data
output_file = "merged_GSE183947.csv"
merged_data.to_csv(output_file, index=False)
print(f"Merged data saved to {output_file}. Shape: {merged_data.shape}")
merged_data.head()


Loading FPKM data...
FPKM data loaded. Shape: (20246, 61)
Extracting metadata...
Merging metadata with FPKM data...
Merged data saved to merged_GSE183947.csv. Shape: (60, 20248)


Unnamed: 0,0,Sample_title,TSPAN6,TNMD,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,...,CTB-96E2.2,CTC-432M15.3,RP11-986E7.7,RP11-761B3.1,ZBTB8B,RP11-1084J3.4,RP11-944L7.5,FLJ00388,RP11-474G23.1,AC005358.1
0,"""donor: 102548""",tumor rep1,0.93,0.0,0.0,5.78,2.83,4.8,1.37,21.92,...,0.0,0.0,0.0,0.0,38.27,0.0,0.0,0.0,0.0,0.0
1,"""donor: 104338""",tumor rep2,1.97,0.0,0.43,5.17,6.26,1.83,1.78,10.48,...,0.0,0.0,0.1,0.03,30.99,0.16,0.0,0.0,0.0,0.0
2,"""donor: 105094""",tumor rep3,0.0,0.0,0.0,8.76,3.37,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,47.57,0.0,0.0,0.0,0.0,0.0
3,"""donor: 109745""",tumor rep4,5.45,0.0,3.43,4.58,6.24,4.23,2.59,23.78,...,0.0,0.0,1.15,0.0,12.27,0.0,0.0,0.83,0.1,0.0
4,"""donor: 1906415""",tumor rep5,4.52,0.0,8.45,7.2,5.16,15.87,9.21,14.95,...,0.0,0.0,0.0,0.0,4.69,0.0,0.0,0.0,0.64,0.0
