# CSC219_Module1_FASTA_B_Processing
## (loading files, parsing, and processing)

## Importing using ftp (optional)
* you can pull FASTA files using ftp if the site provides such service
* some FASTA files contain multiple sequences. When writing code to parse or interpret FASTA files be mindful of this and account for this possibility

* extension examples: '.fa', '.fasta', '.fna', '.mfa'


In [None]:
import gzip
import urllib.request
# A gzip-compressed, multi-sequence protein FASTA file served over FTP by NCBI.
url = 'ftp://ftp.ncbi.nlm.nih.gov/ncbi-asn1/protein_fasta/gbinv17.fsa_aa.gz'
# The context manager closes the connection even if the download fails part-way.
with urllib.request.urlopen(url) as response:
  compressedBytes = response.read()
# Decompress in memory, then decode the raw bytes into one text string.
mFastaFile = gzip.decompress(compressedBytes).decode('UTF8')
print(mFastaFile)

>UOK10426.1 cytochrome oxidase subunit 1, partial (mitochondrion) [Apodiptacus acutissimae]
TMYFFFGFWTAFYGSSLSMIMRLELSQPGSFIFNDHLYNVFVTSHAFIMIFFVVMPILMGGFGNWLIPMM
LGCHDMHFPRMNNLSFWLLLPAMIFLILSSYISLGAGTGWTVYPPLSSISFHTDISVDLAIFSLHLGGVS
SILSSINFITTIFIMRLSGLSMERMPLFIWSMLVTSFLLLFSLPVLAGAITMLLTDRNINTSFFDPVGGG
DPILYQHLF
>ADH54549.1 cytochrome oxidase subunit 1, partial (mitochondrion) [Lepidoptera sp. BOLD:AAA6798]
TLYFIFGVWAGMVGTSLSLLIRAELGNPGSLIGDDQIYNTIVTAHAFIMIFFMVMPIMIGGFGNWLVPLM
LGAPDMAFPRMNNMSFWLLPPSLILLISSSIVENGAGTGWTVYPPLSSNIAHGGSSVDLAIFSLHLAGIS
SILGAINFITTIINMRLNNMSFDQMPLFIWAVGITAFLLLLSLPVLAGAITMLLTDRNLNTSFFDPAGGG
DPILYQHLF
>ALK62027.1 cytochrome oxidase subunit 1, partial (mitochondrion) [Psyllobora borealis]
FLFGMWAGMVGTSLSILIRLELGTTNSVIGNDQIYNVIVTAHAFIMIFFMVMPIMIGGFGNWLVPLMIGA
PDMAFPRLNNMSFWLLPPALTLLIFSSMVEMGAGTGWTVYPPLSANIAHSGSSVDLVIFSLHLAGISSIL
GAVNFISTIMNMRPLGMNLDKTPLFVWSVLITAILLLLSLPVLA
>AOQ94351.1 cytochrome oxidase subunit 1, partial (mitochondrion) [Scambus sp. BOLD-2016]


## Importing via Google drive
  a. download FASTA files from the internet (see Resources at the end) or use the ones provided in the shared drive

  b. upload them to the Google drive

# 1) Mount Drive

In [None]:
# Mount your Google Drive inside the Colab runtime so the data files stored there can be read.
# You will be asked to give permission for the access.
from google.colab import drive
drive.mount('/content/drive/') # the Drive is mounted at this location


Mounted at /content/drive/


# 2) Assemble file paths
full directory path +  file name + extension



In [None]:
import os

# Directory that holds the FASTA files, kept in one variable so the path does not have to
# be typed every time. It ends with '/' because file names are appended to it by plain
# string concatenation.
path = '/content/drive/MyDrive/SFSU/CSC219/dataFiles/FASTA/'

In [None]:
! pwd
! ls
# or use the side panel

/content
drive  sample_data


In [None]:
# Names of the FASTA files to load, without directory and without extension
# NOTE(review): the second name appears to start with a non-ASCII look-alike of the letter 'H'
# -- confirm it matches the file name on Drive before typing this key by hand.
fileNameList = ['Sars_Cov2_NC_004718_3', 'ΗCoV_EMC_019843_3']

def getFilePathList(path, nameList):
  '''
    Build one full file path per name in nameList:
    directory path + file name + '.fasta' extension.
    Returns the paths as a list, in the same order as nameList.
  '''
  return [path + baseName + '.fasta' for baseName in nameList]


In [None]:
# Combine the directory path and the file names into full file paths, then show them
filePathList = getFilePathList(path, fileNameList)
print("file path list: ", filePathList)

file path list:  ['/content/drive/MyDrive/SFSU/CSC219/dataFiles/FASTA/Sars_Cov2_NC_004718_3.fasta', '/content/drive/MyDrive/SFSU/CSC219/dataFiles/FASTA/ΗCoV_EMC_019843_3.fasta']


# 3) Loading the files (to the program)

In [None]:
# Open file from the given file path (1 file per time)
def getFileHandle(filePath):
  '''
    Open the file at filePath for reading in text mode and return the open file object.

    The caller must close the returned handle, either by calling close() on it
    or by using it in a 'with' statement.

    Raises FileNotFoundError when nothing exists at filePath. FileNotFoundError
    is a subclass of Exception, so code that catches Exception keeps working.
  '''
  if not os.path.exists(filePath): # error handling
    # include the offending path so the user can see what was looked for
    raise FileNotFoundError("Path/file not found! " + str(filePath))

  return open(filePath)  # opens the file

# 4) Parse sequences into dictionary

In [None]:
def loadSequences(filePathList) -> dict:
    '''
      Read FASTA files into a dictionary {"name": "sequence"}.

      The name is taken from each file path itself (file name without directory
      and without extension), so the result is correct for any list of paths
      and does not depend on a separate global list of names.

      The first line of every file is treated as the FASTA header and skipped;
      all remaining lines are joined into one string. Files that contain more
      than one sequence are NOT split: their later header lines would end up
      inside the sequence string.

      Errors raised by getFileHandle (missing file) are passed on to the caller.
    '''
    seqDictionary = {} # {"name": "content"}
    for fp in filePathList:
      # '/some/dir/abc.fasta' -> 'abc'
      seqName = os.path.splitext(os.path.basename(fp))[0]

      # 'with' closes the file even when reading fails
      with getFileHandle(fp) as fileHandle:
        linesList = fileHandle.readlines()[1:] # every line except the 1st (header) line

      # strip the \n from each line and glue the pieces together in one pass
      seqDictionary[seqName] = ''.join(line.strip('\n') for line in linesList)

    return seqDictionary


In [None]:
# Use the loadSequences function on the path list built earlier, then show one parsed sequence
print("Testing out the loadSequences function by using the path list created earlier ")
sequencesDict = loadSequences(filePathList)
print()
print("One of the sequences: ")
# the dictionary is indexed by sequence name
print(sequencesDict["ΗCoV_EMC_019843_3"])


Testing out the loadSequences function by using the path list created earlier 

One of the sequences: 
ATGATACACTCAGTGTTTCTACTGATGTTCTTGTTAACACCTACAGAAAGTTACGTTGATGTAGGGCCAGATTCTGTTAAGTCTGCTTGTATTGAGGTTGATATACAACAGACTTTCTTTGATAAAACTTGGCCTAGGCCAATTGATGTTTCTAAGGCTGACGGTATTATATACCCTCAAGGCCGTACATATTCTAACATAACTATCACTTATCAAGGTCTTTTTCCCTATCAGGGAGACCATGGTGATATGTATGTTTACTCTGCAGGACATGCTACAGGCACAACTCCACAAAAGTTGTTTGTAGCTAACTATTCTCAGGACGTCAAACAGTTTGCTAATGGGTTTGTCGTCCGTATAGGAGCAGCTGCCAATTCCACTGGCACTGTTATTATTAGCCCATCTACCAGCGCTACTATACGAAAAATTTACCCTGCTTTTATGCTGGGTTCTTCAGTTGGTAATTTCTCAGATGGTAAAATGGGCCGCTTCTTCAATCATACTCTAGTTCTTTTGCCCGATGGATGTGGCACTTTACTTAGAGCTTTTTATTGTATTCTAGAGCCTCGCTCTGGAAATCATTGTCCTGCTGGCAATTCCTATACTTCTTTTGCCACTTATCACACTCCTGCAACAGATTGTTCTGATGGCAATTACAATCGTAATGCCAGTCTGAACTCTTTTAAGGAGTATTTTAATTTACGTAACTGCACCTTTATGTACACTTATAACATTACCGAAGATGAGATTTTAGAGTGGTTTGGCATTACACAAACTGCTCAAGGTGTTCACCTCTTCTCATCTCGGTATGTTGATTTGTACGGCGGCAATATGTTTCAATTTGCCACCTTGCCTGTTTATGATACTATTAAGTATTATTCTATCATTCCTCACAGT

# 5) Processing data with counters
* In these examples, we are going to go through different ways to count nucleotides (A,T,G,C) in sequences, and store them in different data structures


### 5.a. Simple counter with simple data

In [None]:

#using the simplest way to count: one counter for each nucleotide
def countNuc(DNA): # 'ATGCCC'
    '''
      simple counter function counts nucleotides in strings or lists

      Returns the four counts as a list in the order [A, T, G, C].
      Any other symbol (for example 'N', a lowercase letter or a stray
      newline) is ignored instead of being added to the T count.
    '''
    countG = 0
    countC = 0
    countA = 0
    countT = 0
    for x in DNA:
        if x == "G":
          countG += 1
        elif x == "C":
          countC += 1
        elif x == 'A':
          countA +=1
        elif x == 'T': # test for T explicitly; a bare 'else' would also count unknown symbols
          countT +=1

    return [countA,countT,countG,countC]


# Try countNuc on the same short sequence twice: first as a list of letters, then as a string
exDNA = ['A','A','T','T','G','C']
print("A simple counter function that keeps a count variable for each nucleotide from a list")
print()
print("Example with a simple and short sequence in list format: ", exDNA)
countList = countNuc(exDNA)
print("list of counts ", countList)
print()
print("Using a sequence in string format to test out countNuc function: (still works)",)
# looping over a string yields one letter at a time, just like looping over the list
exDNAStr = "AATTGC"
countList = countNuc(exDNAStr)
print("list of counts ", countList)

A simple counter function that keeps a count variable for each nucleotide from a list

Example with a simple and short sequence in list format:  ['A', 'A', 'T', 'T', 'G', 'C']
list of counts  [2, 2, 1, 1]

Using a sequence in string format to test out countNuc function: (still works)
list of counts  [2, 2, 1, 1]


### 5.b. Counter using the Counter object and simple data

In [None]:
from collections import Counter

# Let the Counter class do the counting.
# The returned Counter object is a subclass of dictionary: {element: number of occurrences}

def countNuc(DNA) -> Counter:
  nucCounts = Counter()
  nucCounts.update(DNA)  # adds 1 for every element found in DNA
  return nucCounts

Counter({'A': 2, 'T': 2, 'G': 1, 'C': 1})


2

In [None]:
# Count a short example sequence with the Counter-based countNuc and show the result
exDNA = ['A','A','T','T','G','C']
builtInCounter = countNuc(exDNA)

print(builtInCounter)
# a single count can be read like a dictionary entry, e.g.:
# builtInCounter["A"]

Counter({'A': 2, 'T': 2, 'G': 1, 'C': 1})


In [None]:
# Take a closer look at the Counter object created in the previous cell
print("A simple counter function using the Counter object")
print()
print("Example with a simple and short sequence in list format: ", exDNA)

print("builtInCounter is a ", type(builtInCounter), " object.")

# elements() gives back every letter as many times as it was counted
print("sorted sequence: ", sorted(builtInCounter.elements()))
print("Counters for each nucleotide ")
# items() works as on a regular dictionary: k is the nucleotide, v is its count
for k,v in builtInCounter.items():
  print(k, ": ", v)

A simple counter function using the Counter object

Example with a simple and short sequence in list format:  ['A', 'A', 'T', 'T', 'G', 'C']
builtInCounter is a  <class 'collections.Counter'>  object.
sorted sequence:  ['A', 'A', 'C', 'G', 'T', 'T']
Counters for each nucleotide 
A :  2
T :  2
G :  1
C :  1


In [None]:
# Same Counter-based count, this time with the sequence given as a string
print("Using a sequence in string format: (still works)",)

exDNAStr = "AATTGC"
counterObject = countNuc(exDNAStr)

# k is the nucleotide, v is its count
for k,v in counterObject.items():
  print(k, ": ", v)

Using a sequence in string format: (still works)
A :  2
T :  2
G :  1
C :  1


### 5.c. Counter using a dictionary, with simple data and FASTA data

In [None]:

#using a 'REGULAR' dict

def countNuc(DNA):
  '''
    Count how often each symbol occurs in DNA using a plain dictionary.
    The result looks like {'G': 10, 'A': 45, ...}; the keys appear in the
    order in which the symbols were first seen.
  '''
  tally = {}
  for symbol in DNA:
    # get() answers 0 the first time a symbol is seen, so no "key exists?" test is needed
    tally[symbol] = tally.get(symbol, 0) + 1
  return tally



In [None]:
# Simple example: count a short list of letters with the dictionary-based countNuc
exDNA = ['C','A','A','T','T','G','C']
print("Using dictionary")
print()
print("Example with a simple and short sequence in list format: ", exDNA)
print(countNuc(exDNA))
print()



Using dictionary

Example with a simple and short sequence in list format:  ['C', 'A', 'A', 'T', 'T', 'G', 'C']
{'C': 2, 'A': 2, 'T': 2, 'G': 1}



In [None]:
def prettyPrint(paramDict):
  '''
    Print every entry of paramDict on two lines:
    the key followed by " :", then the value on the next line.
  '''
  for key, value in paramDict.items():
    # sep="\n" puts the value on its own line right below the "key :" line
    print(str(key) + " :", value, sep="\n")

In [None]:
# Example with the sequences from the FASTA files:
# load them, print them, then count the nucleotides of each one
print("Example with sequences from FASTA files in string format ",)
print("Nuc counts for each sequences from the FASTA files:  ", )
sequencesDict = loadSequences(filePathList)

print()
prettyPrint(sequencesDict)
print()

# name is the dictionary key, seq is the sequence string stored under it
for name,seq in sequencesDict.items():
  nucCountDict = countNuc(seq)

  print("Species: ", name)
  print(" has the following nuc counts in a dictionary: ", nucCountDict )


Example with sequences from FASTA files in string format 
Nuc counts for each sequences from the FASTA files:  

Sars_Cov2_NC_004718_3 :
ATGTTTATTTTCTTATTATTTCTTACTCTCACTAGTGGTAGTGACCTTGACCGGTGCACCACTTTTGATGATGTTCAAGCTCCTAATTACACTCAACATACTTCATCTATGAGGGGGGTTTACTATCCTGATGAAATTTTTAGATCAGACACTCTTTATTTAACTCAGGATTTATTTCTTCCATTTTATTCTAATGTTACAGGGTTTCATACTATTAATCATACGTTTGGCAACCCTGTCATACCTTTTAAGGATGGTATTTATTTTGCTGCCACAGAGAAATCAAATGTTGTCCGTGGTTGGGTTTTTGGTTCTACCATGAACAACAAGTCACAGTCGGTGATTATTATTAACAATTCTACTAATGTTGTTATACGAGCATGTAACTTTGAATTGTGTGACAACCCTTTCTTTGCTGTTTCTAAACCCATGGGTACACAGACACATACTATGATATTCGATAATGCATTTAATTGCACTTTCGAGTACATATCTGATGCCTTTTCGCTTGATGTTTCAGAAAAGTCAGGTAATTTTAAACACTTACGAGAGTTTGTGTTTAAAAATAAAGATGGGTTTCTCTATGTTTATAAGGGCTATCAACCTATAGATGTAGTTCGTGATCTACCTTCTGGTTTTAACACTTTGAAACCTATTTTTAAGTTGCCTCTTGGTATTAACATTACAAATTTTAGAGCCATTCTTACAGCCTTTTCACCTGCTCAAGACATTTGGGGCACGTCAGCTGCAGCCTATTTTGTTGGCTATTTAAAGCCAACTACATTTATGCTCAAGTATGATGAAAATGGTACAATCACAGATGCTGTTGATTGTTCTCAAAATCCACTTGCTGAACTCAAATG

### 5.d. Counter using defaultdict, with simple data and FASTA data

In [None]:
#using defaultdict

from collections import defaultdict
def countNuc(DNA):
  '''
    Count how often each symbol occurs in DNA and return the counts
    as a defaultdict(int).
  '''
  # int() gives 0, so a symbol that has no entry yet starts counting from 0
  nucTally = defaultdict(int)
  for symbol in DNA:
    nucTally[symbol] = nucTally[symbol] + 1
  return nucTally

In [None]:
# Simple example: count a short list of letters with the defaultdict-based countNuc
exDNA = ['C','A','A','T','T','G','C']
print("Using a defaultdict this time. defaultdict creates a default value of 0 for non-existing keys")
print()
print("Example with a simple and short sequence in list format: ", exDNA)
print(countNuc(exDNA))
print()

In [None]:
# Example with the sequences from the FASTA files: load them and count the nucleotides of each one
print("Example with sequences from FASTA files in string format ",)
print("Nuc counts for each sequences from the FASTA files: structure is dictionary of defaultdict" )

sequencesDict = loadSequences(filePathList)
# name is the dictionary key, seq is the sequence string stored under it
for name,seq in sequencesDict.items():
  print("Species: ", name)
  nucCountDict = countNuc(seq)
  print(" has the following nuc counts in a dictionary: ", nucCountDict )

Example with sequences from FASTA files in string format 
Nuc counts for each sequences from the FASTA files: structure is dictionary of defaultdict
Species:  Sars_Cov2_NC_004718_3
 has the following nuc counts in a dictionary:  defaultdict(<class 'int'>, {'A': 1053, 'T': 1254, 'G': 705, 'C': 756})
Species:  ΗCoV_EMC_019843_3
 has the following nuc counts in a dictionary:  defaultdict(<class 'int'>, {'A': 1021, 'T': 1384, 'G': 785, 'C': 872})


# 6) Random sequences

* generating random sequences of DNA can be useful when assessing the significance of results from sequence data analysis
* you can use real sequences to generate random sequences
* for example, there are studies that consider random sequences as new sources of molecules to research cellular functions
* you can also generate random sequences from real sequences and compare them statistically to observe their similarities and differences



## 6.a. random.sample() - loop (review)
* generating lists of random sequences in a loop, converting them to strings
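* sample() picks elements from a given sequence without replacement: each position of the input is used at most once, so a letter that occurs several times in the input can still appear several times in the result
* the size of the random sequence can't exceed the length of the given sequence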


In [None]:
#exampleList = ["fart", "cake", "banana", "tree"]
#random.sample(exampleList, len(exampleList))
#random.sample(exampleList[i], len(exampleList[i]))
#exampleListShuffled = ()
import random

#simple sequence example:
listOfSequences = ['AGTCGGAATCGT', 'ATTGCCGTA', 'TGTTTCTACTGATGT', 'TCTTGTTAACACCTA','CAGAAAGTTACGTTG', 'ATGTAGGCA']

# create random sequences from samples of nucleotide sequences
for realSeq in listOfSequences:
  print("len of the real seq ", len(realSeq))

  # Asking for len(realSeq) letters means every letter of realSeq is used, in random order.
  # The result is deliberately NOT stored under the name randomSeq: this notebook defines a
  # function called randomSeq, and re-running this cell would replace that function with a list.
  shuffledBases = random.sample(realSeq, len(realSeq))
  print("random seq generated from real seq (in list format): ", shuffledBases) # a list with the same length as realSeq

  #converting list into string
  inStringFormat = ''.join(shuffledBases)
  print("in string format", inStringFormat)
  print()


len of the real seq  12
random seq generated from real seq (in list format):  ['T', 'C', 'A', 'G', 'G', 'A', 'T', 'G', 'C', 'T', 'A', 'G']
in string format TCAGGATGCTAG

len of the real seq  9
random seq generated from real seq (in list format):  ['A', 'T', 'G', 'A', 'T', 'G', 'C', 'T', 'C']
in string format ATGATGCTC

len of the real seq  15
random seq generated from real seq (in list format):  ['G', 'T', 'T', 'T', 'G', 'A', 'G', 'T', 'T', 'A', 'T', 'C', 'T', 'T', 'C']
in string format GTTTGAGTTATCTTC

len of the real seq  15
random seq generated from real seq (in list format):  ['T', 'T', 'A', 'T', 'C', 'T', 'C', 'C', 'A', 'A', 'G', 'T', 'C', 'T', 'A']
in string format TTATCTCCAAGTCTA

len of the real seq  15
random seq generated from real seq (in list format):  ['T', 'G', 'T', 'T', 'G', 'A', 'C', 'G', 'T', 'A', 'A', 'A', 'G', 'C', 'A']
in string format TGTTGACGTAAAGCA

len of the real seq  9
random seq generated from real seq (in list format):  ['C', 'T', 'A', 'G', 'G', 'T', 'G', 'A

## 6.b. random.sample() - list of strings (review)
* same as above, but we are going to store those random sequences strings in a list data structure
* generating lists of random sequences in a loop, converting them to strings, then putting them **in a list**


In [None]:
import random

#simple sequence example:
listOfSequences = ['AGTCGGAATCGT', 'ATTGCCGTA', 'TGTTTCTACTGATGT', 'TCTTGTTAACACCTA','CAGAAAGTTACGTTG', 'ATGTAGGCA']
listOfRandoms =[]#space holder of an empty list to "throw back at you"

for realSeq in listOfSequences:
  # Not named randomSeq: this notebook defines a function with that name, and
  # re-running this cell would replace the function with a list.
  shuffledBases = random.sample(realSeq, len(realSeq))
  inStringFormat = ''.join(shuffledBases)
  listOfRandoms.append(inStringFormat)#put your generated string in the list, "put everything in the empty bag"
print("random list of sequences (in string format) derived from given list of real sequences ", listOfRandoms)


random list of sequences (in string format) derived from given list of real sequences  ['GCTGTAGACTAG', 'CTACTGATG', 'TGTCTTTTGAGTACT', 'CCTATTCTAACGATT', 'ATGGGGTAACACATT', 'TACGAATGG']


-------------


### Putting the code above into a function


In [None]:
import random
def randomSeq(sample)-> str:
  '''
    Return all elements of sample in random order, joined into one string.
  '''
  # picking len(sample) elements means every element is used
  return ''.join(random.sample(sample, k=len(sample)))
# what will happen if sample is a list?
# what will happen if sample is a string?

# Simple sequence example: print one randomly reordered version of each sequence
listOfSequences = ['AGTCGGAATCGT', 'ATTGCCGTA', 'TGTTTCTACTGATGT', 'TCTTGTTAACACCTA','CAGAAAGTTACGTTG', 'ATGTAGGCA']

for realSeq in listOfSequences:
  print(randomSeq(realSeq))

TATATCAGGGGC
CTAGCTGAT
GCTTGTATTCATTGT
GTTAACTCCTTATCA
AAATTGCAGTGACTG
GTTGAAGAC


## 6.c. random.sample() - dictionary (with data from FASTA files)
* use random.sample on a **dictionary** of **species names and corresponding sequences**, from the FASTA file content processed a few code blocks earlier
* Note: in order to run the following code blocks, you must run the previous code blocks (1 through 4) to mount drive, load the FASTA files, assemble file paths, and parse sequences into a dictionary to obtain the dictionary with sequences

In [None]:
"""
  must run code blocks above, from 1 through 4
  (mount drive, load the FASTA files, assemble file paths, and parse sequences into a dictionary)
  to obtain the variable sequencesDict
"""

# Test it out to make sure sequencesDict is populated before going on.
# Plain alternative that prints the whole dictionary at once:
# print(sequencesDict)
prettyPrint(sequencesDict)

Sars_Cov2_NC_004718_3 :
ATGTTTATTTTCTTATTATTTCTTACTCTCACTAGTGGTAGTGACCTTGACCGGTGCACCACTTTTGATGATGTTCAAGCTCCTAATTACACTCAACATACTTCATCTATGAGGGGGGTTTACTATCCTGATGAAATTTTTAGATCAGACACTCTTTATTTAACTCAGGATTTATTTCTTCCATTTTATTCTAATGTTACAGGGTTTCATACTATTAATCATACGTTTGGCAACCCTGTCATACCTTTTAAGGATGGTATTTATTTTGCTGCCACAGAGAAATCAAATGTTGTCCGTGGTTGGGTTTTTGGTTCTACCATGAACAACAAGTCACAGTCGGTGATTATTATTAACAATTCTACTAATGTTGTTATACGAGCATGTAACTTTGAATTGTGTGACAACCCTTTCTTTGCTGTTTCTAAACCCATGGGTACACAGACACATACTATGATATTCGATAATGCATTTAATTGCACTTTCGAGTACATATCTGATGCCTTTTCGCTTGATGTTTCAGAAAAGTCAGGTAATTTTAAACACTTACGAGAGTTTGTGTTTAAAAATAAAGATGGGTTTCTCTATGTTTATAAGGGCTATCAACCTATAGATGTAGTTCGTGATCTACCTTCTGGTTTTAACACTTTGAAACCTATTTTTAAGTTGCCTCTTGGTATTAACATTACAAATTTTAGAGCCATTCTTACAGCCTTTTCACCTGCTCAAGACATTTGGGGCACGTCAGCTGCAGCCTATTTTGTTGGCTATTTAAAGCCAACTACATTTATGCTCAAGTATGATGAAAATGGTACAATCACAGATGCTGTTGATTGTTCTCAAAATCCACTTGCTGAACTCAAATGCTCTGTTAAGAGCTTTGAGATTGACAAAGGAATTTACCAGACCTCTAATTTCAGGGTTGTTCCCTCAGGAGATGTTGTGAGATTCCCTAATATTACAAACTTGTGTCCTTTTG

### reading from the dictionary, creating random sequences and putting them in a list

In [None]:
"""
  must run code blocks above, from 1 through 4
  (mount drive, load the FASTA files, assemble file paths, and parse sequences into a dictionary)
  to obtain the variable sequencesDict
"""
import random

randomSeqList =[]
# only the sequences are needed here, so loop over the values and leave the names out
for realS in sequencesDict.values():
  randomSeqList.append(''.join(random.sample(realS, len(realS))))


# Code break down and explanation
# sequencesDict -> {ItemA: ATCGATCGTACG..., ItemB: GCTAGCTAGCTA...}
# 1st loop:
#   ItemA
#   imagine -> len(realS) = 100
#   random.sample() -> create a random sequence of length 100, in list format
#   ["T", "A", "C", "G", "T", "T", "A", "A", "C", "C", "G", "G", ...]
#   ''.join() -> convert the random sequence into a string "TACGTTAACCGG..."
#   randomSeqList.append() -> [] -> ["TACGTTAACCGG..."]
# 2nd loop:
#   ItemB
#   imagine -> len(realS) = 19
#   random.sample() -> create a random sequence of length 19, in list format
#   ["G", "C", "A", "T", "G", "C", "A", "T", "C", "C", "G", "G", ...]
#   ''.join() -> convert the random sequence into a string "GCATGCATCCGG..."
#   randomSeqList.append() -> ["TACGTTAACCGG..."] -> ["TACGTTAACCGG...", "GCATGCATCCGG..."]

# NOTE: randomSeqList is a list, not a dictionary; the wording of the message below is
# kept unchanged so that it still matches the recorded output.
print("take a peek of the dictionary with slice: ")
print(randomSeqList[0][:5]) # first 5 letters of the first random sequence

print()
print("list of randomly generated sequences: \n", randomSeqList)

take a peek of the dictionary with slice: 
TTACA

list of randomly generated sequences: 
 ['TTACAACATTCTACAAATTTGTGATGATTATTCCTTACACTCATTCCATCTGGGGCCTGCTTTTACGTTAAGTCGCCTAATGCTTTCTATATCACTCAACAAAAAGTCTACATTCATTGATTCGACGTCGCATAGAATTATCCATTGATTTACCACGTTTGCTATGAAGCAATTGGTTTGTAGCCTCTTTTCTAGGCGGGTCTGTGCCGAACAAAGCTCCCTCTGTTTCATTAGGGGCGTCTGTCCACTGTTCTAGCGGACCTATTGGTCAATATTTTATTCTCAATTCATCTTTCTATGGTTCTAATCCGCTTACACTCCAATCTGAGCCCCAGTACTGAAAATCGTTGCTTTTAATCAAGTTCAGATTTAACATATACCAAGAATTTCAATTAACGCACTTTATATAGAGGCTTATAGTATTGATTTTCTTATTACAAGGGGCCGAATACAACTGTCTATTCACTCTCAGTGCAGGAGAGACCCGGTGACCTGTTTCCTTCAAAAAGTAACGTGCCGTTTCATTTAAGTGGATTAAGTTTTGTACGATGTTACATTGAACCGATGTCGGAAACAATAGAAGCCATAAGGCCTACATTCGATATTGGGGGTTTATTCGAGACACTCGTACTTTTTCAATTGATTTGTTCCATTTACTGTGTTTAATTTACAACCCAGGTATGGACATGTTCAACACGTGTTCTTAACTAGTTTGGGTTGTACCTCCCTTAATTGCAATTAGAATCTCCCTCCCCGATTCATATAGACATGAAGTTCCATATATGTGTTATAAGCGTTTGTTATTAGATCCCAGATTCAAGAACTCTCACCATAAACTTCATACACTATATTAGAACAATGTAACACCATTCTTGTTTGGTGGCGGACGAAAGCATGATGCCTGTCAA

### building a function, create a dictionary with random sequences

* creating a function that reads from the dictionary, creates random sequences and puts them in a dictionary.

In [None]:
"""
  must run code blocks above, from 1 through 4
  (mount drive, load the FASTA files, assemble file paths, and parse sequences into a dictionary)
  to obtain the variable sequencesDict
"""
import random

def randomSeqListGenerator(sequenceDictionary) -> dict:
  '''
    Return a new dictionary with the same keys as sequenceDictionary.
    Each value is the original sequence with its letters in random order.
    The input dictionary is not modified.
  '''
  return {
    spName: ''.join(random.sample(realSeq, len(realSeq)))
    for spName, realSeq in sequenceDictionary.items()
  }

# Build the dictionary of random sequences, print it whole, then print it entry by entry
randomSequencesDict = randomSeqListGenerator(sequencesDict)
print("\nthe whole dictionary of random sequences: \n", randomSequencesDict)

# The loop variable is deliberately NOT called randomSeq: this notebook defines a function
# with that name, and reusing it here would replace the function with a string, so any
# later call to randomSeq(...) would fail.
for species,shuffledSeq in randomSequencesDict.items():
  print("\nrandom sequence derived from real sequence from ", species, ": \n", shuffledSeq)


the whole dictionary of random sequences: 
 {'Sars_Cov2_NC_004718_3': 'ATCCACATTCGAGGCTATTTGGCGCAATGAATAAGAGTTATAGTTTGTAATTGGTAACAGTTTAATGATGGGCAAGAATTTGATAGATCAACGCAAAACAAGTATAGCTAATGATTTGCTGGTAGATCCCATGTATTATTACCATGCACAGCTATGTATATTATTATTGGCCTTGATCGGGTTATTTCTAGATTGCAGTAAGGGAAAGAATTTCATTCTATTTGTCTTCGACTCTCTAATTTCAAGCAATAGAGTATAGCTGGTTGCCCAAATTTACTCTATAACTGTTCCAGGTGAGTTTTAAAAGTAGCTATAACGCGCTCTCCACATAGTCGCTCTTCAACTGAATATAAGATTGAACATAATATTGTTATTGGCATAGTTAGCTTCGTTCATTTACGACTTATCTCCAGAGTTGTAGGCTTGTAAGCCAAACACACTTCCCGCTTGTTAGCCGGGGACACATAGTTACCTTGATTCAACGGATAGCTCTCATTAGTACCTCCTGAACAGGGGATACATAAGTTACACAAATGAGCCATTGTAGCCTACTACTCAACGTTGCTGGTAACACTTGTAACACTGACCCGTCACCAAGTAGTTACGGTAGGGGTACAAAGATTTACTACCTCCAGTCAAGAAGACTCCGCCTCATCGTAGTTACTTCACGTCTATATGGAGTGATCATATGTCTCTGTCTGCCTTGCGCCACATAGGTTGTTGATGACAAAACTGAATTGTGAATGTTATCTTAAGCATACGCGTCATAGTTCTAAATGCAGTTATCTGCTCGGTAAATCGTCAGCTGATTCATTAGAGCATATGCACGTCATCCACAATACATCACTAAAGTGTCTTTTGTAACATTCATTGGTATCCTAAATCTGAGAGTATAAGATCAAGATTATTACTTTTTGGTTTAATTTGG

# 7) Resources
* You can browse and download your own nucleotide sequences in FASTA format
* example:  https://www.ncbi.nlm.nih.gov/nuccore/OZ022789.1?report=fasta

