In [1]:
import pandas as pd
import re
from Bio import SeqIO

In [2]:
# set the maximum number of columns/rows displayed in the cell
#pd.reset_option('display.max_columns', 20)
#pd.set_option('display.max_rows', 20)
#pd.set_option('display.max_columns', 20)

In [6]:
def filter_TPM(expression_file_path, fold):
    df = pd.read_csv(expression_file_path, sep='\t')
    
    df['TPM_ut_mean'] = (df['TPM'] + df['TPM.1'] + df['TPM.2']) / 3
    df['TPM_ef_mean'] = (df['TPM.6'] + df['TPM.7'] + df['TPM.8']) / 3

    # filter transcripts that have TPM > 1
    df_gt1 = df[df['TPM_ef_mean'] > 1]

    df_gt1['fold_change'] = df_gt1['TPM_ef_mean'] / df_gt1['TPM_ut_mean']

    # filter transcripts that have TPM fold change ef/ut > fold (= 50)
    high_fold_gene = df_gt1[df_gt1['fold_change'] > fold].sort_values(by='fold_change', ascending=False)
    
    return(high_fold_gene)

In [8]:
def filter_gtf(gtf_file_path):
    # read gtf file into dataframe gtf
    column_names = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
    gtf = pd.read_csv(gtf_file_path, sep='\t', comment='#', header=None, names=column_names, low_memory=False)
    
    # label if each entry has reference gene id
    for i in range(gtf.shape[0]):
        if 'ref_gene_id' in gtf.loc[i,'attribute']:
            gtf.loc[i,'if_ref_id'] = 'T'
        else:
            gtf.loc[i,'if_ref_id'] = 'F'
    
    # extract transcripts without ref_id
    new_transcript = gtf[(gtf['feature'] == 'transcript') & (gtf['if_ref_id'] == 'F')]
    
    return new_transcript

In [9]:
def get_gene_id(input_string):
    # regular expression to catch the content in the quotes after 'gene_id'
    pattern = r'gene_id\s+"([^"]+)"'
    
    # try to find the pattern
    match = re.search(pattern, input_string)

    # if find the pattern, return the content in the quotes
    if match:
        return(match.group(1))
    return 'None'

In [10]:
def screen_data(high_fold_gene, new_transcript):
    # create a new dataframe for screened data
    df_new = pd.DataFrame(columns=high_fold_gene.columns)

    # check every high fold-change gene if it's contained in new_transcript list
    for i in high_fold_gene.index:
        for j in new_transcript.index[:]:
            if high_fold_gene.loc[i,'Gene ID'] == get_gene_id(new_transcript.loc[j,'attribute']):
                # add the line containg gene info to the dataframe
                df_new = pd.concat([df_new, high_fold_gene.loc[i, :].to_frame().transpose()], ignore_index=True)

    # remove the replicates
    df_new = df_new.drop_duplicates()
    
    return(df_new)

In [4]:
def run(expression_file_path, gtf_file_path, fold):
    high_fold_gene = filter_TPM(expression_file_path, fold)
    new_transcript = filter_gtf(gtf_file_path)
    df_new = screen_data(high_fold_gene, new_transcript)
    
    return df_new

In [11]:
expression_file_path = 'ef_merged_expression/Dmel_comb.txt'
gtf_file_path = 'ef_merged_gtf/Dmel_merged.gtf'
df_dmel = run(expression_file_path, gtf_file_path, 10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gt1['fold_change'] = df_gt1['TPM_ef_mean'] / df_gt1['TPM_ut_mean']


In [13]:
column_names = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
gtf = pd.read_csv(gtf_file_path, sep='\t', comment='#', header=None, names=column_names, low_memory=False)

In [27]:
expression_file_path = 'ef_merged_expression/Sleb_comb.txt'
gtf_file_path = 'ef_merged_gtf/Sleb_merged.gtf'
df_sleb = run(expression_file_path, gtf_file_path, 10)

KeyError: 'TPM_ut1'

In [20]:
# this function extracts transcript seuqences of genes of interest from a fasta file containing all transcripts
# input is transcript sequences file path, a gene list, and output file path
def extract_sequences(transcript_file_path, gene_list, output_file_path):
    
    sequences = []

    for record in SeqIO.parse(transcript_file_path, "fasta"):
        # convert transcript_id to gene_id by removing the content after last '.'
        parts = record.id.rsplit('.', 1)
        gene_id = parts[0] if len(parts) > 1 else record.id

        if gene_id in gene_list:
            sequences.append(record)

    with open(output_file_path, "w") as output_handle:
        SeqIO.write(sequences, output_handle, "fasta")

In [277]:
transcript_file_path = 'sleb_m_transcript.fa'
output_file_path = 'sleb_new_trasncript_seq.fasta'
extract_sequences(transcript_file_path, df_filter_list, output_file_path)

In [None]:
gtf_file_path = 'GCF_003285725.1_SlebRS2_genomic.gff'
column_names = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
df = pd.read_csv(gtf_file_path, sep='\t', comment='#', header=None, names=column_names, low_memory=False)

ref_gtf = df[df['feature']=='exon']

In [21]:
# check if two features(exons/transcripts/genes) have overlapping regions by comparing their start/end locations
# return True if overlapped
def check_overlap(feature1, feature2):
    return feature1['Start'] <= feature2['end'] and feature1['End'] >= feature2['start']

def add_overlap_info(df_in,ref_gtf):
    output = pd.DataFrame()
    # create a dictionary saving the gtf for each contig
    ref_gtf_contig = {}
    
    for i in df_in.index:
        df_in.loc[i,'if_overlap'] = 'No'
        contig = df_in.loc[i,'Reference']
        
        # each key is contig name, each value the gtf info for the contig
        if contig not in ref_gtf_contig.keys():
            ref_gtf_contig[contig] = ref_gtf[ref_gtf['seqname'] == contig].sort_values(by='start', ascending=True)
        for j in ref_gtf_contig[contig].index:
            if ref_gtf.loc[j,'start'] > df_in.loc[i,'End']:
                break
            # check if the transcript has any overlap regions with exisiting exons
            if check_overlap(df_in.loc[i,:],ref_gtf.loc[j,:]):
                start = max(df_in.loc[i,'Start'],ref_gtf.loc[j,'start'])
                end = min(df_in.loc[i,'End'],ref_gtf.loc[j,'end'])
                overlap_length = end - start
                length = df_in.loc[i,'End'] - df_in.loc[i,'Start']
                percent = overlap_length / length

                # if it's the first overlap region found, update it
                if df_in.loc[i,'if_overlap'] == 'No':
                    df_in.loc[i,'if_overlap'] = 'Yes'
                    df_in.loc[i,'%overlap'] = percent
                # if not the first, update with the larger overlap region
                else:
                    if df_in.loc[i,'%overlap'] < percent:
                        df_in.loc[i,'%overlap'] = percent
    return df_in

In [None]:
test_sleb = df_sleb.loc[df_filter.index,:]
test_sleb = test_sleb.sort_values(by='Start', ascending=True)
added_df_in = add_overlap_info(test_sleb, ref_gtf)

In [82]:
added_df_in[(added_df_in['if_overlap'] != 'Yes') | (added_df_in['%overlap'] < 0.5)]

Unnamed: 0,Gene ID,Gene Name,Reference,Strand,Start,End,Coverage,FPKM,TPM,FPKM.1,...,TPM.6,FPKM.7,TPM.7,FPKM.8,TPM.8,TPM_ut_mean,TPM_pr_mean,fold_change,if_overlap,%overlap
291,MSTRG.6046,-,NW_022060550.1,.,61228,61556,5.048632,1.757874,3.506197,1.413304,...,8.452106,6.702477,13.094912,5.676188,11.449466,3.861828,10.998828,2.848088,No,
468,MSTRG.12928,-,NW_022060742.1,+,232477,236036,7.945543,4.649685,9.274108,4.451277,...,15.114046,10.657976,20.822937,9.418137,18.99737,8.5465,18.311451,2.142567,Yes,0.193875
508,MSTRG.11231,-,NW_022060621.1,-,303865,306706,2.167556,0.786978,1.56968,2.602611,...,7.628936,2.818815,5.507238,2.821084,5.690422,3.103453,6.275532,2.022113,No,
328,MSTRG.2502,-,NW_022060538.1,.,354510,355653,0.153846,0.053567,0.106844,3.656264,...,13.107974,2.038422,3.98255,2.926721,5.903502,2.925438,7.664675,2.620009,No,
3,MSTRG.11019,-,NW_022060600.1,.,566404,566732,0.0,0.0,0.0,0.0,...,47.595459,0.0,0.0,0.0,0.0,0.0,15.865153,inf,No,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,MSTRG.1588,-,NW_022060529.1,.,18391518,18391821,18.536184,6.45408,12.873096,144.664841,...,6260.32373,3970.977783,7758.266113,0.0,0.0,97.576414,4672.863281,47.88927,No,
361,MSTRG.1943,-,NW_022060529.1,-,22186799,22217630,11.086722,7.554018,15.066996,8.427059,...,28.848961,27.170868,53.084869,15.257482,30.775938,15.239915,37.569923,2.465232,Yes,0.052155
106,MSTRG.4005,-,NW_022060538.1,+,23033946,23036132,1.401538,0.532584,1.062275,0.481346,...,4.362121,6.606958,12.908292,3.466725,6.992747,0.918917,8.08772,8.801361,Yes,0.232845
61,MSTRG.2269,.,NW_022060529.1,.,27209526,27209781,0.0,0.0,0.0,7.352944,...,1058.042847,750.417053,1466.12146,550.123291,1109.656372,4.774623,1211.27356,253.689867,No,


In [104]:
added_df_in[added_df_in['if_overlap'] == 'No'].sort_values(by='fold_change', ascending=False)

Unnamed: 0,Gene ID,Gene Name,Reference,Strand,Start,End,Coverage,FPKM,TPM,FPKM.1,...,TPM.6,FPKM.7,TPM.7,FPKM.8,TPM.8,TPM_ut_mean,TPM_pr_mean,fold_change,if_overlap,%overlap
3,MSTRG.11019,-,NW_022060600.1,.,566404,566732,0.0,0.0,0.0,0.0,...,47.595459,0.0,0.0,0.0,0.0,0.0,15.865153,inf,No,
33,MSTRG.8439,-,NW_022060566.1,.,2721671,2722513,0.0,0.0,0.0,0.0,...,5.188394,3.751498,7.32946,3.622036,7.306027,0.0,6.60796,inf,No,
61,MSTRG.2269,.,NW_022060529.1,.,27209526,27209781,0.0,0.0,0.0,7.352944,...,1058.042847,750.417053,1466.12146,550.123291,1109.656372,4.774623,1211.27356,253.689867,No,
70,MSTRG.1588,-,NW_022060529.1,.,18391518,18391821,18.536184,6.45408,12.873096,144.664841,...,6260.32373,3970.977783,7758.266113,0.0,0.0,97.576414,4672.863281,47.88927,No,
88,MSTRG.4616,-,NW_022060545.1,.,7870440,7870931,1.888211,0.657453,1.311334,45.388313,...,0.0,0.0,0.0,688.548279,1388.874146,29.122767,462.958049,15.896774,No,
220,MSTRG.2516,-,NW_022060538.1,.,655049,655722,0.0,0.0,0.0,0.0,...,97.198975,49.425846,96.565361,41.819115,84.353539,24.563093,92.705958,3.774197,No,
228,MSTRG.13746,-,NW_022060742.1,-,12738711,12750714,2.963251,1.031769,2.057932,1.329909,...,5.184815,3.801761,7.42766,3.653596,7.369686,1.833778,6.66072,3.632239,No,
252,MSTRG.11598,-,NW_022060738.1,+,1820405,1820902,4.790068,1.667845,3.326629,2.613643,...,18.664377,7.19315,14.05356,3.428897,6.916446,4.003816,13.211461,3.299717,No,
291,MSTRG.6046,-,NW_022060550.1,.,61228,61556,5.048632,1.757874,3.506197,1.413304,...,8.452106,6.702477,13.094912,5.676188,11.449466,3.861828,10.998828,2.848088,No,
323,MSTRG.7930,-,NW_022060564.1,.,5152968,5153611,11.07764,3.857103,7.693251,6.31877,...,32.050354,15.168712,29.635752,12.087071,24.38089,10.853178,28.688999,2.643373,No,


In [30]:
## Sleb
expression_file_path_sleb = 'ef_merged_expression/Sleb_comb.txt'
gtf_file_path_sleb = 'ef_merged_gtf/Sleb_merged.gtf'
df_sleb = run(expression_file_path_sleb, gtf_file_path_sleb, 2)

# filter for foldchange > 2, meanTPM of  > 5, and don't contain LOC ref gene_id
# .iloc[:,[0,4,5,8,10,12,14,16,18,20,22,24,25,26,27]]
df_filter_sleb = df_sleb[((df_sleb['fold_change'] > 2) & (df_sleb['TPM_ef_mean'] > 5) ) & (~df_sleb['Gene Name'].str.contains('LOC'))]
df_filter_sleb.to_csv('sleb_ef5_fold2_noref.csv')

gtf_file_path_sleb = 'GCF_003285725.1_SlebRS2_genomic.gff'
column_names = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
ref_df_sleb = pd.read_csv(gtf_file_path_sleb, sep='\t', comment='#', header=None, names=column_names, low_memory=False)
ref_gtf_sleb = ref_df_sleb[ref_df_sleb['feature']=='exon']

test_sleb = df_sleb.loc[df_filter_sleb.index,:].sort_values(by='Start', ascending=True)
added_df_in_sleb = add_overlap_info(test_sleb, ref_gtf_sleb)

# filter for df_filter and no overlap regions with existing exons
added_df_in_sleb_nooverlap = added_df_in_sleb[added_df_in_sleb['if_overlap'] == 'No'].sort_values(by='fold_change', ascending=False)

# .iloc[:,[0,8,10,12,14,16,18,20,22,24,26,27]]
added_df_in_sleb_nooverlap.to_csv('sleb_ef5_fold2_noref_nooverlap.csv')

transcript_file_path = 'Sleb_m_transcript.fa'
df_filter_list = added_df_in_sleb_nooverlap['Gene ID'].to_list()
output_file_path = 'sleb_ef5_fold2_noref_nooverlap_seq.fasta'
extract_sequences(transcript_file_path, df_filter_list, output_file_path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gt1['fold_change'] = df_gt1['TPM_ef_mean'] / df_gt1['TPM_ut_mean']


In [31]:
expression_file_path = 'ef_merged_expression/Sleb_comb.txt'
df = pd.read_csv(expression_file_path, sep='\t')

In [33]:
## Dvir
expression_file_path_dvir = 'ef_merged_expression/Dvir_comb.txt'
gtf_file_path_dvir = 'ef_merged_gtf/Dvir_merged.gtf'
df_dvir = run(expression_file_path_dvir, gtf_file_path_dvir, 2)

# filter for foldchange > 2, meanTPM of ef > 5, and don't contain LOC ref gene_id
# .iloc[:,[0,4,5,8,10,12,14,16,18,20,22,24,25,26,27]]
df_filter_dvir = df_dvir[((df_dvir['fold_change'] > 2) & (df_dvir['TPM_ef_mean'] > 5) ) & (~df_dvir['Gene Name'].str.contains('LOC'))]
df_filter_dvir.to_csv('dvir_ef5_fold2_noref.csv')

gtf_file_path_dvir = 'GCF_003285735.1_DvirRS2_genomic.gff'
column_names = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
ref_df_dvir = pd.read_csv(gtf_file_path_dvir, sep='\t', comment='#', header=None, names=column_names, low_memory=False)
ref_gtf_dvir = ref_df_dvir[ref_df_dvir['feature']=='exon']

test_dvir = df_dvir.loc[df_filter_dvir.index,:].sort_values(by='Start', ascending=True)
added_df_in_dvir = add_overlap_info(test_dvir, ref_gtf_dvir)

# filter for df_filter and no overlap regions with existing exons
added_df_in_dvir_nooverlap = added_df_in_dvir[added_df_in_dvir['if_overlap'] == 'No'].sort_values(by='fold_change', ascending=False)

# .iloc[:,[0,8,10,12,14,16,18,20,22,24,26,27]]
added_df_in_dvir_nooverlap.to_csv('dvir_ef5_fold2_noref_nooverlap.csv')

transcript_file_path = 'Dvir_m_transcript.fa'
df_filter_list = added_df_in_dvir_nooverlap['Gene ID'].to_list()
output_file_path = 'dvir_ef5_fold2_noref_nooverlap_seq.fasta'
extract_sequences(transcript_file_path, df_filter_list, output_file_path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gt1['fold_change'] = df_gt1['TPM_ef_mean'] / df_gt1['TPM_ut_mean']


In [34]:
## Dana
expression_file_path_dana = 'ef_merged_expression/Dana_comb.txt'
gtf_file_path_dana = 'ef_merged_gtf/Dana_merged.gtf'
df_dana = run(expression_file_path_dana, gtf_file_path_dana, 2)

# filter for foldchange > 2, meanTPM of ef > 5, and don't contain LOC ref gene_id
# .iloc[:,[0,4,5,8,10,12,14,16,18,20,22,24,25,26,27]]
df_filter_dana = df_dana[((df_dana['fold_change'] > 2) & (df_dana['TPM_ef_mean'] > 5) ) & (~df_dana['Gene Name'].str.contains('LOC'))]
df_filter_dana.to_csv('dana_ef5_fold2_noref.csv')

gtf_file_path_dana = 'GCF_017639315.1_ASM1763931v2_genomic.gff'
column_names = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
ref_df_dana = pd.read_csv(gtf_file_path_dana, sep='\t', comment='#', header=None, names=column_names, low_memory=False)
ref_gtf_dana = ref_df_dana[ref_df_dana['feature']=='exon']

test_dana = df_dana.loc[df_filter_dana.index,:].sort_values(by='Start', ascending=True)
added_df_in_dana = add_overlap_info(test_dana, ref_gtf_dana)

# filter for df_filter and no overlap regions with existing exons
added_df_in_dana_nooverlap = added_df_in_dana[added_df_in_dana['if_overlap'] == 'No'].sort_values(by='fold_change', ascending=False)

# .iloc[:,[0,8,10,12,14,16,18,20,22,24,26,27]]
added_df_in_dana_nooverlap.to_csv('dana_ef5_fold2_noref_nooverlap.csv')

print(df_filter_dana.shape, added_df_in_dana_nooverlap.shape)

transcript_file_path = 'Dana_m_transcript.fa'
df_filter_list = added_df_in_dana_nooverlap['Gene ID'].to_list()
output_file_path = 'dana_ef5_fold2_noref_nooverlap_seq.fasta'
extract_sequences(transcript_file_path, df_filter_list, output_file_path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gt1['fold_change'] = df_gt1['TPM_ef_mean'] / df_gt1['TPM_ut_mean']


(26, 28) (4, 30)


In [35]:
## dsim
expression_file_path_dsim = 'ef_merged_expression/Dsim_comb.txt'
gtf_file_path_dsim = 'ef_merged_gtf/Dsim_merged.gtf'
df_dsim = run(expression_file_path_dsim, gtf_file_path_dsim, 2)

# filter for foldchange > 2, meanTPM of ef > 5, and don't contain LOC ref gene_id
# .iloc[:,[0,4,5,8,10,12,14,16,18,20,22,24,25,26,27]]
df_filter_dsim = df_dsim[((df_dsim['fold_change'] > 2) & (df_dsim['TPM_ef_mean'] > 5) ) & (~df_dsim['Gene Name'].str.contains('LOC'))]
df_filter_dsim.to_csv('dsim_ef5_fold2_noref.csv')

gtf_file_path_dsim = 'GCF_016746395.2_Prin_Dsim_3.1_genomic.gff'
column_names = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
ref_df_dsim = pd.read_csv(gtf_file_path_dsim, sep='\t', comment='#', header=None, names=column_names, low_memory=False)
ref_gtf_dsim = ref_df_dsim[ref_df_dsim['feature']=='exon']

test_dsim = df_dsim.loc[df_filter_dsim.index,:].sort_values(by='Start', ascending=True)
added_df_in_dsim = add_overlap_info(test_dsim, ref_gtf_dsim)

# filter for df_filter and no overlap regions with existing exons
added_df_in_dsim_nooverlap = added_df_in_dsim[added_df_in_dsim['if_overlap'] == 'No'].sort_values(by='fold_change', ascending=False)

# .iloc[:,[0,8,10,12,14,16,18,20,22,24,26,27]]
added_df_in_dsim_nooverlap.to_csv('dsim_ef5_fold2_noref_nooverlap.csv')

print(df_filter_dsim.shape, added_df_in_dsim_nooverlap.shape)

transcript_file_path = 'Dsim_m_transcript.fa'
df_filter_list = added_df_in_dsim_nooverlap['Gene ID'].to_list()
output_file_path = 'dsim_ef5_fold2_noref_nooverlap_seq.fasta'
extract_sequences(transcript_file_path, df_filter_list, output_file_path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gt1['fold_change'] = df_gt1['TPM_ef_mean'] / df_gt1['TPM_ut_mean']


(32, 28) (13, 30)


In [36]:
## dmel
expression_file_path_dmel = 'ef_merged_expression/Dmel_comb.txt'
gtf_file_path_dmel = 'ef_merged_gtf/Dmel_merged.gtf'
df_dmel = run(expression_file_path_dmel, gtf_file_path_dmel, 2)

# filter for foldchange > 2, meanTPM of ef > 5, and don't contain LOC ref gene_id
# .iloc[:,[0,4,5,8,10,12,14,16,18,20,22,24,25,26,27]]
df_filter_dmel = df_dmel[((df_dmel['fold_change'] > 2) & (df_dmel['TPM_ef_mean'] > 5) ) & (~df_dmel['Gene Name'].str.contains('FBgn'))]
df_filter_dmel.to_csv('dmel_ef5_fold2_noref.csv')

gtf_file_path_dmel = 'dmel-all-r6.48.gtf'
column_names = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']
ref_df_dmel = pd.read_csv(gtf_file_path_dmel, sep='\t', comment='#', header=None, names=column_names, low_memory=False)
ref_gtf_dmel = ref_df_dmel[ref_df_dmel['feature']=='exon']


test_dmel = df_dmel.loc[df_filter_dmel.index,:].sort_values(by='Start', ascending=True)
added_df_in_dmel = add_overlap_info(test_dmel, ref_gtf_dmel)

# filter for df_filter and no overlap regions with existing exons
added_df_in_dmel_nooverlap = added_df_in_dmel[added_df_in_dmel['if_overlap'] == 'No'].sort_values(by='fold_change', ascending=False)

# .iloc[:,[0,8,10,12,14,16,18,20,22,24,26,27]]
added_df_in_dmel_nooverlap.to_csv('dmel_ef5_fold2_noref_nooverlap.csv')

print(df_filter_dmel.shape, added_df_in_dmel_nooverlap.shape)

transcript_file_path = 'Dmel_m_transcript.fa'
df_filter_list = added_df_in_dmel_nooverlap['Gene ID'].to_list()
output_file_path = 'dmel_ef5_fold2_noref_nooverlap_seq.fasta'
extract_sequences(transcript_file_path, df_filter_list, output_file_path)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_gt1['fold_change'] = df_gt1['TPM_ef_mean'] / df_gt1['TPM_ut_mean']


(88, 28) (34, 30)
