In [1]:
from fasta import *

In [2]:
# Load the 2U1 files
psi_blast_records = SeqIO.parse("files/candidates/2U1_BLAST.fasta", "fasta")

In [3]:
# Getting unique records for the 2U1_all_candidates BLAST
record_dict = {}
count = 0
for record in psi_blast_records:
    record_dict[record.id] = record
    count +=1
print ("Length of original 2U1 all candidates was", count)
record_list = record_dict.values()
print ("Length of unique records from 2U1 all candidates was", len(record_list))

Length of original 2U1 all candidates was 100000
Length of unique records from 2U1 all candidates was 24665


In [4]:
SeqIO.write(record_list, "files/candidates/2U1_BLAST_unique.fasta", "fasta")

24665

In [6]:
# Check to see if there are records we find in some of the PSI-BLAST results that we don't find in others

marmota_marmota = SeqIO.to_dict(SeqIO.parse("files/candidates/Marmota_marmota.fasta", "fasta"))
poecillia_reticula = SeqIO.to_dict(SeqIO.parse("files/candidates/Poecilia_reticulata.fasta", "fasta"))
python_bivittatus = SeqIO.to_dict(SeqIO.parse("files/candidates/Python_bivittatus.fasta", "fasta"))
amazona_aestiva = SeqIO.to_dict(SeqIO.parse("files/candidates/Amazona_aestiva.fasta", "fasta"))
andrias_davidianus = SeqIO.to_dict(SeqIO.parse("files/candidates/Andrias_davidianus.fasta", "fasta"))

def getSet(records):
    id_set = set()
    for record in records:
        id_set.add(record)
    return id_set

mammal = getSet(marmota_marmota)
fish = getSet(poecillia_reticula)
amphibian = getSet(andrias_davidianus)
bird = getSet(amazona_aestiva)
reptile = getSet(python_bivittatus)


ValueError: Duplicate key 'XP_009205588.1'

In [8]:
print (bird - (amphibian | reptile | mammal | fish))
print (amphibian - ( bird | reptile | mammal | fish))
print (reptile - (amphibian | bird | mammal | fish))
print (mammal - (amphibian | reptile | bird | fish))
print (fish - (amphibian | reptile | mammal | bird))

print (fish - mammal)
print (fish - bird)
print (fish - reptile)

print ( "There are %s unique records not found if we start with fish" % ( len((mammal - fish) | (reptile - fish) | (bird - fish) | (amphibian - fish))))
print (  "There is %s unique records found if we start with bird" % (len((mammal - bird) | (reptile - bird) | (fish - bird) | (amphibian - bird))))
print (  "There are %s unique records found if we start with amphibian" % (len((mammal - amphibian) | (reptile - amphibian) | (bird - amphibian) | (fish - amphibian))))
print ( "There is %s unique records found if we start with reptile" % ( len((mammal - reptile) | (fish - reptile) | (bird - reptile) | (amphibian - reptile))))
print ( "There is %s unique records not found if we start with mammal" % (len((fish - mammal) | (reptile - mammal) | (bird - mammal) | (amphibian - mammal))))

set()
set()
set()
set()
set()
{'ETE55910.1'}
{'ETE55910.1'}
{'ETE55910.1'}
There are 0 unique records not found if we start with fish
There is 1 unique records found if we start with bird
There are 0 unique records found if we start with amphibian
There is 1 unique records found if we start with reptile
There is 1 unique records not found if we start with mammal


## Now lets check if the large PSI-BLAST has missing records from the smaller PSI-BLAST and my expanded BLAST search

In [45]:
# Check 

psi_blast_records = SeqIO.parse("files/2U1_all_candidates_PSI_BLAST.fasta", "fasta")

full_psi_blast_records = SeqIO.parse("files/full_PSI_filtered_records.fasta", "fasta")

expanded_blast_records = SeqIO.parse("files/filtered_records.fasta", "fasta")


In [46]:
def getSetFromRecords(records):
    id_set = set()
    for record in records:
        id_set.add(record.id)
    return id_set

# Made by doing 
psi_blast_set = getSetFromRecords(psi_blast_records)
full_psi_blast_set = getSetFromRecords(full_psi_blast_records)
expanded_blast_set = getSetFromRecords(expanded_blast_records)

In [60]:
print ("Length of PSI BLAST search from difference candidates restricted to 2U1 and 2U1-like sequences:", len(psi_blast_set))
print()
print ("Length of full PSI_BLAST search from Python not restricted to 2U1 and 2U1-like sequences:", len(full_psi_blast_set))
print()
print ("Length of original expanded BLAST set made from 5 candidates and restricted to 2U1 and 2U1-like:", len(expanded_blast_set))

Length of PSI BLAST search from difference candidates restricted to 2U1 and 2U1-like sequences: 399

Length of full PSI_BLAST search from Python not restricted to 2U1 and 2U1-like sequences: 355

Length of original expanded BLAST set made from 5 candidates and restricted to 2U1 and 2U1-like: 420


In [52]:
print ("The following sequences are only in the PSI_BLAST search from different candidates restricted to 2U1 and 2U1-like sequences")
print (psi_blast_set - full_psi_blast_set - expanded_blast_set)

The following sequences are only in the PSI_BLAST search from different candidates restricted to 2U1 and 2U1-like sequences
{'PIO25243.1', 'XP_007648453.1', 'KFU94516.1', 'KFP23895.1', 'XP_013914382.1', 'XP_013040816.1', 'KFP67630.1', 'XP_019783338.1', 'XP_012955331.1', 'XP_010002414.1', 'OWA52865.1', 'XP_014593439.1', 'XP_008936534.1', 'OWA52871.1', 'KFU98834.1', 'KFO07982.1', 'KDR06611.1', 'XP_014141091.1', 'OMJ26238.1', 'KFR08560.1', 'XP_013040818.1', 'KFQ27801.1', 'XP_012510559.1', 'XP_009068953.1', 'XP_009969653.1', 'PIK55290.1', 'OQV16873.1', 'KFO60610.1', 'CUA67356.1', 'XP_005418305.2', 'OQV17437.1', 'EHL00994.1', 'ETE55910.1', 'XP_013155162.1', 'PIK57040.1', 'KFZ45339.1', 'ODM94942.1', 'XP_009325342.1', 'ENH71300.1', 'OQV15857.1', 'XP_009491707.1', 'OWA50099.1', 'XP_014141090.1', 'PIK49892.1', 'KFW97279.1', 'XP_010143437.1', 'XP_009687679.1', 'KFV88821.1', 'KFP11414.1', 'KFP76326.1', 'XP_011669947.1', 'XP_009271908.1', 'KFP81921.1', 'PIK51743.1', 'KFV92898.1', 'KFQ07009.1', 'KF

In [62]:
print ("The following sequences are only in the full PSI_BLAST search from Python not restricted to 2U1 and 2U1-like sequences")
print (full_psi_blast_set - expanded_blast_set - psi_blast_set)

The following sequences are only in the full PSI_BLAST search from Python not restricted to 2U1 and 2U1-like sequences
{'XP_002594674.1', 'AIC57535.1', 'XP_002602578.1', 'AGN04330.1', 'XP_002604226.1', 'EFB15022.1', 'KQK74446.1', 'XP_020484540.1', 'EHH53889.1', 'OXB57021.1', 'NP_001186898.2', 'AGN04312.1', 'XP_002589623.1', 'AGN52902.1', 'XP_002605379.1', 'NP_001085963.1', 'XP_002601369.1', 'AGN04325.1', 'ELR59612.1', 'XP_002589617.1', 'AGN04309.1', 'XP_019936176.1', 'AGN04296.1', 'XP_002604856.1', 'KPP65943.1', 'XP_002601386.1', 'XP_002589622.1', 'EDL82214.1', 'XP_002603880.1', 'NP_001103192.1', 'XP_002590802.1', 'EPY81189.1', 'XP_002610285.1', 'NP_001070926.1', 'EFB16740.1', 'EHH49955.1', 'XP_002588060.1', 'XP_002594971.1', 'XP_002612185.1', 'XP_002590181.1', 'XP_002607601.1', 'OCA37567.1', 'XP_002613380.1', 'EHH26107.1', 'NP_001079789.1', 'OCT76966.1', 'XP_002587334.1', 'AGN04307.1', 'AGN04284.1', 'XP_002597633.1', 'XP_002585682.1', 'ARO89866.1', 'XP_002613379.1', 'XP_002602976.1', 

In [54]:
print ("The following sequences are only in original expanded BLAST set made from 5 candidates and restricted to 2U1 and 2U1-like")
print (expanded_blast_set - full_psi_blast_set - psi_blast_set)

The following sequences are only in original expanded BLAST set made from 5 candidates and restricted to 2U1 and 2U1-like
{'XP_019620665.1', 'XP_005111348.2', 'XP_014674645.1', 'XP_002738775.1', 'XP_002735731.1', 'XP_019624571.1', 'XP_009858778.2', 'XP_013091701.1', 'XP_019642956.1', 'XP_022081995.1', 'XP_786300.2', 'XP_022103600.1', 'XP_022098029.1', 'XP_019618599.1', 'XP_019631170.1', 'XP_022093163.1', 'XP_019641598.1', 'XP_019642928.1', 'XP_013065835.1', 'XP_019624239.1', 'XP_002740465.1', 'XP_019619827.1', 'XP_019628033.1', 'XP_019627490.1', 'XP_013922675.1', 'XP_018667429.1', 'XP_002739928.1', 'XP_013091704.1', 'XP_022094290.1', 'XP_019627491.1', 'XP_014674505.1', 'XP_022300688.1', 'XP_019615816.1', 'XP_013065833.1', 'XP_013420531.1', 'XP_014788787.1', 'XP_018667653.1', 'XP_019629979.1', 'XP_006821776.1', 'XP_022087533.1', 'XP_019615834.1', 'XP_006812435.1', 'XP_022099104.1', 'XP_006812434.1', 'XP_006823767.1', 'XP_019642618.1', 'XP_022103601.1', 'XP_022082638.1', 'XP_013774212.1'