In [4]:
import os
from bs4 import BeautifulSoup, NavigableString
import pandas as pd

# The cells below use relative paths ('TCGA/...', 'index.html'), so they resolve
# against the current working directory. Uncomment the chdir to move up one
# directory level before running them, if needed.
# os.chdir('..')
# Display the working directory so the path resolution can be checked.
os.getcwd()

'/workspaces/LW15-Functional-Screening'

In [21]:
# Load the per-gene p-values from the CSV file
df = pd.read_csv('TCGA/gene_p_values.csv')

# Convert gene names to uppercase so that lookups are case-insensitive
df['gene_name'] = df['gene_name'].str.upper()

# The p-values stay as raw floats in this cell: no rounding or string
# formatting is applied here, which keeps them numerically sortable.

# Create a gene name -> p-value dictionary for quick lookup
p_value_dict = df.set_index('gene_name')['p_value'].to_dict()
p_value_dict

{'CASKIN1': 0.12296241818721,
 'SCARF2': 0.007467827214788,
 'VPS4A': 0.15476581162066,
 'CARD10': 0.520710451287842,
 'THAP11': 0.621028946339941,
 'NUP50': 0.816810377224621,
 'RRAD': 0.136334673076583,
 'EPS8L2': 0.251730414439408,
 'REEP6': 0.006932655771429,
 'CCDC92': 0.192615827400949,
 'GFER': 0.011634787061601,
 'SHF': 0.390283852169065,
 'STAC2': 0.344340964711277,
 'SPTBN4': 0.15241687646477,
 'B4GALT3': 0.002127586750217,
 'SH3BP1': 2.48509967e-07,
 'RASGRP4': 0.428732995942924,
 'SPATA2L': 0.726841491048477,
 'ARID3C': 1.7948565e-08,
 'VILL': 0.151633152309346,
 'RFNG': 0.017004124984403,
 'NLGN2': 0.000102149800789,
 'SYCE2': 2.807541234e-06,
 'PRPH': 0.327855226856989,
 'MKRN1': 0.168268442614802,
 'GPR157': 0.855560270911085,
 'FOXA3': 0.075570170239164,
 'ZCWPW1': 0.01243312605273,
 'PRRX2': 0.001152841855576,
 'LDHD': 1.660245212e-06,
 'STARD10': 0.732432555878066,
 'ARSI': 0.000760968627735,
 'PLVAP': 4.8352065545e-05,
 'PRSS16': 0.91610070425323,
 'FAM117A': 0.57015

In [6]:
# Load the HTML content (explicit encoding so the result does not depend on
# the platform default)
with open('index.html', 'r', encoding='utf-8') as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, 'html.parser')

# Find the table by ID; find() returns None when nothing matches, so fail
# with a clear message instead of an AttributeError further down.
table = soup.find('table', {'id': 'functionalScreening'})
if table is None:
    raise ValueError("No <table id='functionalScreening'> found in index.html")

# Register every gene listed in the table that has no p-value in the CSV.
# The parsed HTML itself is left untouched in this cell.
for row in table.find_all('tr')[1:]:  # Skip the header row
    cells = row.find_all('td')
    if not cells:
        continue  # Row without <td> cells (e.g. an extra header row)
    gene_name = cells[0].text.strip().upper()  # Assuming gene name is in the first cell

    # Keep an existing p-value; otherwise record the 'NA' placeholder
    p_value_dict.setdefault(gene_name, 'NA')

p_value_dict


{'CASKIN1': 0.12296241818721,
 'SCARF2': 0.0074678272147877,
 'VPS4A': 0.15476581162066,
 'CARD10': 0.520710451287842,
 'THAP11': 0.621028946339941,
 'NUP50': 0.816810377224621,
 'RRAD': 0.136334673076583,
 'EPS8L2': 0.251730414439408,
 'REEP6': 0.0069326557714287,
 'CCDC92': 0.192615827400949,
 'GFER': 0.0116347870616009,
 'SHF': 0.390283852169065,
 'STAC2': 0.344340964711277,
 'SPTBN4': 0.15241687646477,
 'B4GALT3': 0.0021275867502166,
 'SH3BP1': 2.48509966938038e-07,
 'RASGRP4': 0.428732995942924,
 'SPATA2L': 0.726841491048477,
 'ARID3C': 1.79485645235289e-08,
 'VILL': 0.151633152309346,
 'RFNG': 0.0170041249844035,
 'NLGN2': 0.0001021498007889,
 'SYCE2': 2.80754123416216e-06,
 'PRPH': 0.327855226856989,
 'MKRN1': 0.168268442614802,
 'GPR157': 0.855560270911085,
 'FOXA3': 0.0755701702391637,
 'ZCWPW1': 0.0124331260527298,
 'PRRX2': 0.0011528418555759,
 'LDHD': 1.66024521186525e-06,
 'STARD10': 0.732432555878066,
 'ARSI': 0.0007609686277351,
 'PLVAP': 4.83520655450986e-05,
 'PRSS16':

In [27]:
# Rank the genes by ascending p-value. The first key component is True for
# string placeholders (e.g. 'NA'), so those entries sort after every number.
ranked_items = sorted(
    p_value_dict.items(),
    key=lambda pair: (isinstance(pair[1], str), pair[1]),
)

# Rebuild the dict in ranked order with gene names in
# 'First char uppercase & rest lowercase' format
p_value_dict = {symbol.capitalize(): p_val for symbol, p_val in ranked_items}

# One row per gene, in ranked order, written out without the row index
df = pd.DataFrame(list(p_value_dict.items()), columns=['Gene', 'P_Value'])
df.to_csv('TCGA/standard_notation_p_values.csv', index=False)

In [41]:
# Copy the row position into an explicit 'index' column
# (presumably the p-value rank, if df is the sorted frame built earlier -- confirm)
df['index'] = df.index
print(df.columns)

# Export every column except the p-values themselves
df_new = df.drop(columns=['P_Value'])
df_new.to_csv('pval_sorting_index.csv', index=False)

Index(['Gene', 'P_Value', 'index'], dtype='object')


In [20]:
# Write only the 'Gene' column, in its current row order, without the row index
sorted_genes = df.loc[:, 'Gene']
sorted_genes.to_csv('sorted_genes.csv', index=False)

In [43]:
# Display the frame (Gene, P_Value and the added 'index' column) as a visual check
df

Unnamed: 0,Gene,P_Value,index
0,Slc16a12,3.331843e-09,0
1,Podnl1,8.824178e-09,1
2,Arid3c,1.794857e-08,2
3,Irf6,1.914879e-08,3
4,Myo6,3.203829e-08,4
...,...,...,...
271,Tmem121,9.407352e-01,271
272,Nckap1l,9.492971e-01,272
273,Rtraf,9.664466e-01,273
274,Tacstd2,9.720272e-01,274


In [9]:


def format_p_value(val):
    """Format a p-value as display text.

    Args:
        val: Numeric p-value; may be None or NaN when the value is missing.

    Returns:
        str: 'NA' for a missing value, '0.00' for exactly zero, scientific
        notation with two decimals (e.g. '2.49e-07') for values below 0.01,
        and fixed notation with four decimals (e.g. '0.1230') otherwise.
    """
    # NaN is the only value that is not equal to itself. Without this guard a
    # missing p-value would render as 'nan' (or raise for None), which is
    # inconsistent with the "NA" text written for genes missing from the CSV.
    if val is None or val != val:
        return 'NA'
    if val == 0:
        return '0.00'  # Adjust based on how you want to handle exactly 0 values
    if val < 0.01:  # Adjust this threshold as needed
        return '{:.2e}'.format(val)
    return '{:.4f}'.format(val)  # otherwise standard notation to 4 decimal places

# Load the CSV file
df = pd.read_csv('TCGA/gene_p_values.csv')

# Convert gene names to uppercase for consistency
df['gene_name'] = df['gene_name'].str.upper()

# Apply formatting to p-values (values become display strings)
df['p_value'] = df['p_value'].apply(format_p_value)

# Create a dictionary for quick lookup
p_value_dict = df.set_index('gene_name')['p_value'].to_dict()

# Load the HTML content. The encoding is explicit so that reading and writing
# use the same codec regardless of the platform default.
with open('index.html', 'r', encoding='utf-8') as file:
    html_content = file.read()

soup = BeautifulSoup(html_content, 'html.parser')

# Find the table by ID; find() returns None when nothing matches, so fail
# with a clear message instead of an AttributeError further down.
table = soup.find('table', {'id': 'functionalScreening'})
if table is None:
    raise ValueError("No <table id='functionalScreening'> found in index.html")

# Iterate over each row in the table
for row in table.find_all('tr')[1:]:  # Skip the header row
    cells = row.find_all('td')
    if not cells:
        continue  # Row without <td> cells (e.g. an extra header row)
    gene_name = cells[0].text.strip().upper()  # Assuming gene name is in the first cell
    p_value = p_value_dict.get(gene_name)

    # Replace the contents of the last cell with the formatted p-value,
    # or with "NA" when the gene has no entry in the CSV.
    cells[-1].clear()
    cells[-1].append(NavigableString(p_value if p_value is not None else "NA"))

# Save the modified HTML to a separate file; index.html is left unchanged
with open('updated_index.html', 'w', encoding='utf-8') as file:
    file.write(str(soup))

print("HTML updated successfully.")


HTML updated successfully.
