In [1]:
import pandas as pd

# Read the tab-separated PANCAN RNASeqV2 gene-expression table.
df = pd.read_csv(
    "unc.edu_PANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv",
    sep="\t",
)

# Compute the distinct gene_id values once: keep them as a plain list for
# later use and preview the first five entries of the array.
unique_gene_ids = df["gene_id"].unique()
gene_names = unique_gene_ids.tolist()
print(unique_gene_ids[:5])  # first 5 unique gene names

['?|100130426' '?|100133144' '?|100134869' '?|10357' '?|10431']


In [2]:
# Report the dimensions of the loaded table.
# NOTE(review): this table has a gene_id column, so its rows look like genes
# rather than samples; "instances" here counts rows as-is -- confirm wording.
n_rows, n_cols = df.shape
print("Number of instances (rows):", n_rows)
print("Number of features (columns):", n_cols)


Number of instances (rows): 20531
Number of features (columns): 10472


In [3]:
# Read the expression matrix (its gene columns carry placeholder names) and
# the table that lists the real gene identifiers.
expr_df = pd.read_csv("data.csv")
gene_ids_df = pd.read_csv("gene_ids_updated.tsv")
real_gene_ids = gene_ids_df.loc[:, "gene_id"].tolist()

# Every column except the first must have exactly one identifier.
assert len(real_gene_ids) == expr_df.shape[1] - 1, "Column mismatch"

# Keep the first column's label and relabel the rest with the real identifiers.
expr_df.columns = [expr_df.columns[0], *real_gene_ids]


In [5]:
import pandas as pd

# Start again from the files on disk so this cell does not depend on the
# state left behind by earlier cells.
expr_df = pd.read_csv("data.csv")  # expression matrix
gene_ids_df = pd.read_csv("gene_ids_updated.tsv")
real_gene_ids = gene_ids_df.loc[:, "gene_id"].tolist()
assert len(real_gene_ids) == expr_df.shape[1] - 1

# Step 1: label the gene columns with the "GENE_SYMBOL|ID" strings, leaving
# the first column's label untouched.
first_col = expr_df.columns[0]
expr_df.columns = [first_col, *real_gene_ids]

# Step 2: keep only the part of each gene label that precedes the first "|".
rest_cols = expr_df.columns[1:]
stripped = [label.partition("|")[0] for label in rest_cols]

# Step 3: put the first label and the shortened gene labels back together.
expr_df.columns = [first_col, *stripped]

# The recorded run shows labels such as
# ['Unnamed: 0', 'VPS13A_AS1', 'UBE2Q2P3', ...] at this point.
# NOTE(review): the file below has a .csv extension but is written
# tab-delimited; readers must pass sep="\t" -- confirm this is intended.
expr_df.to_csv("cleaned_gene_expression.csv", sep="\t", index=False)


In [6]:
# Expected dimensions: (num_samples, num_genes + 1); the extra column is the
# leading non-gene column.
frame_shape = expr_df.shape
print(frame_shape)


(801, 20532)


In [7]:
# Preview the first ten column labels after the rename.
leading_labels = expr_df.columns[:10]
print(leading_labels)


Index(['Unnamed: 0', 'VPS13A_AS1', 'UBE2Q2P3', 'UBE2Q2P2', 'HMGB1P1',
       'TIMM23B', 'MOXD2P', 'LOC155060', 'RNU12_2P', 'SSX9P'],
      dtype='object')


In [8]:
# Sanity-check the column labels: number of repeated labels, then number of
# missing labels. Ideally both are 0; the recorded run printed 2 repeated
# labels, so duplicates still have to be resolved after this point.
column_labels = expr_df.columns
print(column_labels.duplicated().sum())
print(column_labels.isnull().sum())


2
0


In [9]:
# Write the relabelled matrix as a comma-separated file without the row index.
# NOTE(review): in the recorded run the frame still had 2 duplicated column
# labels when saved; pandas.read_csv renames repeated headers on reload
# (e.g. "X" -> "X.1") by default -- confirm that is acceptable downstream.
expr_df.to_csv(path_or_buf="expression_with_real_gene_names.csv", index=False)
print("Gene names successfully replaced and saved to expression_with_real_gene_names.csv")

Gene names successfully replaced and saved to expression_with_real_gene_names.csv


In [10]:
# Read the saved file back and report its dimensions as a round-trip check.
# Note that this rebinds `df`, which earlier held the PANCAN table.
df = pd.read_csv("expression_with_real_gene_names.csv")
n_rows, n_cols = df.shape
print("Number of instances (rows):", n_rows)
print("Number of features (columns):", n_cols)

Number of instances (rows): 801
Number of features (columns): 20532


In [11]:
import pandas as pd

# Works on the column labels of the expr_df built in the earlier cells.
cols = expr_df.columns

# True for every label that repeats an earlier one; the first occurrence of
# each label stays False.
dup_mask = cols.duplicated(keep="first")

# How many labels are repeats.
n_duplicates = dup_mask.sum()
print("Total duplicates:", n_duplicates)

# Which label values are repeated (each listed once).
duplicated_names = cols[dup_mask].unique().tolist()
print("Duplicated names:", duplicated_names)


Total duplicates: 2
Duplicated names: ['RNF213', 'SLC35E2']


In [12]:
import pandas as pd

# Make the column labels of expr_df unique.
#
# The first occurrence of a label is kept unchanged; each later occurrence is
# renamed to "<label>_<k>" with k = 1, 2, ... A candidate suffix is skipped
# when the resulting name is already in use, either as an original label
# anywhere in the frame or as a name generated earlier in this loop. Without
# that check, labels such as ["A_1", "A", "A"] would be renamed to
# ["A_1", "A", "A_1"] and still contain a duplicate.

# Step A: collect all column labels (including the first, non-gene column).
orig_cols = expr_df.columns.tolist()

# Step B: build the de-duplicated label list.
new_cols = []
counts = {}              # counts[label] = next numeric suffix to try for that label
taken = set(orig_cols)   # every label that a generated name must not reuse

for name in orig_cols:
    if name not in counts:
        # First time this label is seen: keep it as-is.
        counts[name] = 1
        new_cols.append(name)
        continue

    # Repeat occurrence: advance the suffix until the candidate is unused.
    candidate = f"{name}_{counts[name]}"
    while candidate in taken:
        counts[name] += 1
        candidate = f"{name}_{counts[name]}"

    counts[name] += 1
    taken.add(candidate)
    new_cols.append(candidate)

# Step C: assign the unique labels back to the frame.
expr_df.columns = new_cols

# Verify no duplicates remain:
print("Duplicates after renaming:", expr_df.columns.duplicated().sum())  # should be 0


Duplicates after renaming: 0
