# KDM5 Protein Family Analysis
This is my attempt to investigate KDM5 protein sequences using python.

The following website is very helpful.  
http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc40

In [2]:
#Load uniprot Ids for the KDM5 family, square brackets specify a 'list'
KDM5s = ["Q3UXZ9","Q38JA7","Q30DN6","Q62240","Q5F3R2","Q80Y84","A1YVX4","Q61T02","Q9VMJ7","P41228","P41229","Q5XUN4","Q23541","P29375","Q9UGL1","P41230","Q9BY66","Q6IQX0","P47156","Q8GUI6"]


In [3]:
#Aim: Get peptide sequences and descriptions for each id using Biopython or 'Bio' as it is sometimes confusingly called.


In [4]:
from Bio import ExPASy
from Bio import SwissProt

#Define the input accession numbers
accessions = KDM5s
#Assign a 'list' of 'records' to save the output.
records = []

for accession in accessions:
    #Get the data from the ExPASy website.
    handle = ExPASy.get_sprot_raw(accession)
    #Parse the data into a python accessible 'record'.
    record = SwissProt.read(handle)
    #Add the record to the list.
    records.append(record)

In [5]:
#You can view the list of records (uncomment below) but it is not useful.
#records

In [6]:
#This is how to query 'records'
#http://biopython.org/DIST/docs/api/Bio.SwissProt.Record-class.html
for record in records:
    print(record.entry_name)
    print(",".join(record.accessions))
    print(record.keywords)
    print(repr(record.organism))
    print(record.sequence[:20] + "...")

KDM5A_MOUSE
Q3UXZ9,Q3TM94,Q3UMI5,Q66JZ3
['Activator', 'Biological rhythms', 'Chromatin regulator', 'Complete proteome', 'Developmental protein', 'Dioxygenase', 'Direct protein sequencing', 'Iron', 'Isopeptide bond', 'Metal-binding', 'Nucleus', 'Oxidoreductase', 'Phosphoprotein', 'Reference proteome', 'Repeat', 'Transcription', 'Transcription regulation', 'Ubl conjugation', 'Zinc', 'Zinc-finger']
'Mus musculus (Mouse).'
MASVGPGGYAAEFVPPPECP...
KDM5C_CANLF
Q38JA7
['Biological rhythms', 'Chromatin regulator', 'Complete proteome', 'Dioxygenase', 'Iron', 'Isopeptide bond', 'Metal-binding', 'Nucleus', 'Oxidoreductase', 'Phosphoprotein', 'Reference proteome', 'Repeat', 'Repressor', 'Transcription', 'Transcription regulation', 'Ubl conjugation', 'Zinc', 'Zinc-finger']
'Canis lupus familiaris (Dog) (Canis familiaris).'
MEPGSDDFLPPPECPVFEPS...
KDM5D_CANLF
Q30DN6
['Chromatin regulator', 'Complete proteome', 'Dioxygenase', 'Iron', 'Isopeptide bond', 'Metal-binding', 'Nucleus', 'Oxidoreductase', 'P

In [7]:
##Now I need to figure out how to query domain positions
for record in records:
    print(record.entry_name)
    print(record.sequence[:100] + "...")

KDM5A_MOUSE
MASVGPGGYAAEFVPPPECPVFEPSWEEFTDPLSFIGRIRPFAEKTGICKIRPPKDWQPPFACEVKTFRFTPRVQRLNELEAMTRVRLDFLDQLAKFWEL...
KDM5C_CANLF
MEPGSDDFLPPPECPVFEPSWAEFRDPLGYIAKIRPIAEKSGICKIRPPADWQPPFAVEVDNFRFTPRIQRLNELEAQTRVKLNYLDQIAKFWEIQGSSL...
KDM5D_CANLF
MESGSDDFLPPPECPVFEPTWAEFRDPLDYITKIRPIAEKSGICKIRPPADWQPPFAVEVDNFRFTPRIQRLNELEAQTRVKLNYLDQIAKFWEIQGSSL...
KDM5D_MOUSE
MKPGSDDFLPPPECPVFEPSWAEFRDPLGYIAKIRPIAEKSGICKIRPPADWQPPFAVEVDNFRFTPRIQRLNELEAQTRVKLNYLDQIAKFWEIQGSSL...
KDM5B_CHICK
MAEFLPPPECPVFEPSWEEFADPFAFIHKIRPIAEQTGICKVRPPPDWQPPFACDVDKLHFTPRIQRLNELEAQTRVKLNFLDQIAKFWELQGCTLKIPH...
KDM5B_MOUSE
MEPATTLPPGPRPALPLGGPGPLGEFLPPPECPVFEPSWEEFADPFAFIHKIRPIAEQTGICKVRPPPDWQPPFACDVDKLHFTPRIQRLNELEAQTRVK...
KDM5C_PIG
MEPGSDDFLPPPECPVFEPSWAEFRDPLGYIAKIRPIAEKSGICKIRPPADWQPPFAVEVDNFRFTPRIQRLNELEIVVEEGGYEAICKDRRWARVAQRL...
KDM5_CAEBR
MRGRRQEDIATTSSAPSTSTSHKKKTVSSNGSFRPRTQSNPGGKMEMYDHFYKNFQRPPMAPVYYPTSEEFADPIEYVAKIRPDAERYGVVKIVPPSDFK...
KDM5_DROME
MSAKTEADNTTAANSGGGGVGSGTSSGGGASANGTATPARRLRTRNSTGNGTNSGSESVKKSNA

Here is a list of the 'record' attributes

Attributes:
- entry_name Name of this entry, e.g. RL1_ECOLI.
- data_class Either 'STANDARD' or 'PRELIMINARY'.
- molecule_type Type of molecule, 'PRT',
- sequence_length Number of residues.
- accessions List of the accession numbers, e.g. ['P00321']
- created A tuple of (date, release).
- sequence_update A tuple of (date, release).
- annotation_update A tuple of (date, release).
- description Free-format description.
- gene_name Gene name. See userman.txt for description.
- organism The source of the sequence.
- organelle The origin of the sequence.
- organism_classification The taxonomy classification. List of strings. (http://www.ncbi.nlm.nih.gov/Taxonomy/)
- taxonomy_id A list of NCBI taxonomy id's.
- host_organism A list of names of the hosts of a virus, if any.
- host_taxonomy_id A list of NCBI taxonomy id's of the hosts, if any.
- references List of Reference objects.
- comments List of strings.
- cross_references List of tuples (db, id1[, id2][, id3]). See the docs.
- keywords List of the keywords.
- features List of tuples (key name, from, to, description). from and to can be either integers for the residue numbers, '<', '>', or '?'
- protein_existence Numerical value describing the evidence for the existence of the protein.
- seqinfo tuple of (length, molecular weight, CRC32 value)
- sequence The sequence.

In [8]:
for record in records:
    print(record.entry_name)
    print(record.features)

KDM5A_MOUSE
[('CHAIN', 1, 1690, 'Lysine-specific demethylase 5A.', 'PRO_0000292411'), ('DOMAIN', 19, 60, 'JmjN. {ECO:0000255|PROSITE- ProRule:PRU00537}.', ''), ('DOMAIN', 84, 174, 'ARID. {ECO:0000255|PROSITE- ProRule:PRU00355}.', ''), ('DOMAIN', 437, 603, 'JmjC. {ECO:0000255|PROSITE- ProRule:PRU00538}.', ''), ('ZN_FING', 293, 343, 'PHD-type 1. {ECO:0000255|PROSITE- ProRule:PRU00146}.', ''), ('ZN_FING', 676, 728, 'C5HC2. {ECO:0000250|UniProtKB:P29375}.', ''), ('ZN_FING', 1153, 1210, 'PHD-type 2. {ECO:0000255|PROSITE- ProRule:PRU00146}.', ''), ('ZN_FING', 1599, 1653, 'PHD-type 3. {ECO:0000255|PROSITE- ProRule:PRU00146}.', ''), ('REGION', 1622, 1690, 'Interaction with LMO2. {ECO:0000250|UniProtKB:P29375}.', ''), ('MOTIF', 419, 423, 'GSGFP motif. {ECO:0000269|PubMed:20064375}.', ''), ('COMPBIAS', 1484, 1579, 'Lys-rich.', ''), ('METAL', 483, 483, 'Iron; catalytic. {ECO:0000255|PROSITE- ProRule:PRU00538}.', ''), ('METAL', 485, 485, 'Iron; catalytic. {ECO:0000250|UniProtKB:P29375}.', ''), ('M

In [9]:
    print(records[0].features)

[('CHAIN', 1, 1690, 'Lysine-specific demethylase 5A.', 'PRO_0000292411'), ('DOMAIN', 19, 60, 'JmjN. {ECO:0000255|PROSITE- ProRule:PRU00537}.', ''), ('DOMAIN', 84, 174, 'ARID. {ECO:0000255|PROSITE- ProRule:PRU00355}.', ''), ('DOMAIN', 437, 603, 'JmjC. {ECO:0000255|PROSITE- ProRule:PRU00538}.', ''), ('ZN_FING', 293, 343, 'PHD-type 1. {ECO:0000255|PROSITE- ProRule:PRU00146}.', ''), ('ZN_FING', 676, 728, 'C5HC2. {ECO:0000250|UniProtKB:P29375}.', ''), ('ZN_FING', 1153, 1210, 'PHD-type 2. {ECO:0000255|PROSITE- ProRule:PRU00146}.', ''), ('ZN_FING', 1599, 1653, 'PHD-type 3. {ECO:0000255|PROSITE- ProRule:PRU00146}.', ''), ('REGION', 1622, 1690, 'Interaction with LMO2. {ECO:0000250|UniProtKB:P29375}.', ''), ('MOTIF', 419, 423, 'GSGFP motif. {ECO:0000269|PubMed:20064375}.', ''), ('COMPBIAS', 1484, 1579, 'Lys-rich.', ''), ('METAL', 483, 483, 'Iron; catalytic. {ECO:0000255|PROSITE- ProRule:PRU00538}.', ''), ('METAL', 485, 485, 'Iron; catalytic. {ECO:0000250|UniProtKB:P29375}.', ''), ('METAL', 571, 

In [10]:
print(records[0].annotations)

AttributeError: 'Record' object has no attribute 'annotations'

In [21]:
#Convert record to SeqRecord
from Bio import SeqIO
seq_records= SeqIO.parse(records, "swiss")

In [27]:
#from Bio import SeqIO
seq_records = list(SeqIO.parse(records, "swiss"))

print("Found %i records" % len(seq_records))


ValueError: Unknown keyword '<B' found

In [23]:
 for seq_record in seq_records:
    print(seq_record.id)
    print(repr(seq_record.seq))
    print(len(seq_record))

ValueError: Unknown keyword '<B' found

In [25]:
print(seq_records[0].seq)

TypeError: 'generator' object has no attribute '__getitem__'

In [None]:
#Try aligning sequences