### Load and read the Human GTF reference file in the Kernel

In [54]:

with open('Homo_sapiens.GRCh38.110.chr.gtf', 'rb') as f:
    content = f.read()

### Split the gtf file based on each line

In [55]:
lines = content.splitlines()

## From GTF to CSV

In [60]:
import csv

# Read the GTF file
gtf_filename = 'Homo_sapiens.GRCh38.110.chr.gtf'
csv_filename = 'human_gtf.csv'

# Define the fields we want to extract from the GTF file
fields = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute']

# Open the GTF file for reading and the CSV file for writing
with open(gtf_filename, 'r') as gtf_file, open(csv_filename, 'w', newline='') as csv_file:
    gtf_reader = csv.reader(gtf_file, delimiter='\t')
    csv_writer = csv.writer(csv_file)

    # Write CSV header
    csv_writer.writerow(fields)

    for line in gtf_reader:
        if not line or line[0].startswith('#'):
            continue

        # Extract relevant fields from the GTF line
        data = {
            'seqname': line[0],
            'source': line[1],
            'feature': line[2],
            'start': line[3],
            'end': line[4],
            'score': line[5],
            'strand': line[6],
            'frame': line[7],
            'attribute': line[8]
        }

        # Write the extracted data to the CSV file
        csv_writer.writerow([data[field] for field in fields])


### Read the gtf in csv

In [63]:
import pandas as pd
gtf = pd.read_csv('human_gtf.csv', low_memory = False)

In [64]:
gtf.head()

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
0,1,havana,gene,182696,184174,.,+,.,"gene_id ""ENSG00000279928""; gene_version ""2""; g..."
1,1,havana,transcript,182696,184174,.,+,.,"gene_id ""ENSG00000279928""; gene_version ""2""; t..."
2,1,havana,exon,182696,182746,.,+,.,"gene_id ""ENSG00000279928""; gene_version ""2""; t..."
3,1,havana,exon,183132,183216,.,+,.,"gene_id ""ENSG00000279928""; gene_version ""2""; t..."
4,1,havana,exon,183494,183571,.,+,.,"gene_id ""ENSG00000279928""; gene_version ""2""; t..."


### Extract the transcript Id from the attribute columns using lambda function 

In [77]:
gtf['transcript_id']  = gtf['attribute'].apply(lambda x: x.split(';')[2].split('"')[1])

### Extact gene Id from the attribute using lambda function

In [94]:
gtf['gene_id'] = gtf['attribute'].apply(lambda x : x.split(';')[0].split('"')[1])

### To get the transcript data from the whole gtf data 

In [122]:
transcript = gtf[gtf['feature'] == 'transcript' ]

### Add a new column called total bases in the transcript dataset

In [123]:
transcript['total_bases'] = (transcript['end'] - transcript['start'] + 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  transcript['total_bases'] = (transcript['end'] - transcript['start'] + 1)


### Save the the transcript reference data into csv format in the local memory

In [131]:
transcript.to_csv('transcript_human_ref.csv')