# Naive Matching and Analysis Practice

In [13]:
from geneReader import geneReader

from naiveAware import naiveAware

from reverseComplement import reverseComplement

from naive_mismatches import naive_mismatches

from unawareNaive import unawareNaive

## Processing a FastA File

In [14]:
# FastA input file for this section.
filename = 'lambda_virus.fa'

# geneReader receives the filename, not this handle, so the handle is merely
# held open around the call. The `with` block closes it even when geneReader
# raises; afterwards `data` is still bound (to the closed file), as before.
with open ( filename, 'r' ) as data :

    reads = geneReader ( filename )

In [15]:
# Query sequence and its reverse complement.
pattern = 'AGGAGGTT'
rcPattern = reverseComplement(pattern)

# Strand-aware naive search, run once with the pattern and once with its
# reverse complement.
results = naiveAware(pattern, reads)
rcResults = naiveAware(rcPattern, reads)

# Naive search that tolerates mismatches. The notebook's notes say "up to 2";
# no limit is passed here, so it is presumably fixed inside naive_mismatches
# (TODO confirm).
mmResults = naive_mismatches(pattern, reads)

# Naive search without strand awareness.
unawareResults = unawareNaive(pattern, reads)

In [16]:
# Show the reverse complement of the query pattern.
print(rcPattern)

AACCTCCT


In [17]:
# Smallest value in `results` (strand-aware search of the pattern).
print('offset of leftmost occurrence: %d' % (min(results),))

offset of leftmost occurrence: 38030


In [18]:
# Smallest value in `rcResults` (strand-aware search of the reverse complement).
print('offset of leftmost rc occurrence: %d' % (min(rcResults),))

offset of leftmost rc occurrence: 38030


In [19]:
# Number of entries returned by the strand-aware search of the pattern.
print('occurrences: %d' % (len(results),))

occurrences: 2


In [20]:
# Number of entries returned by the strand-aware search of the reverse complement.
print('rc occurrences: %d' % (len(rcResults),))

rc occurrences: 2


In [21]:
# Smallest value in `mmResults` (the mismatch-tolerant search of the pattern).
# The message previously read "lefmost occurence"; the spelling is corrected.
print ( 'offset of leftmost occurrence with up to 2 mismatches : %d' % min ( mmResults ) )

offset of lefmost occurence with up to 2 mismatches : 49


In [22]:
# Number of entries returned by the mismatch-tolerant search.
print('mm occurrences: %d' % (len(mmResults),))

mm occurrences: 215


In [23]:
# Smallest value in `unawareResults` (the search without strand awareness).
# The message previously read "unware"; the spelling is corrected.
print ( 'offset of leftmost unaware occurrence: %d' % min ( unawareResults ) )

offset of leftmost unware occurrence: 38030


In [24]:
# Number of entries returned by the search without strand awareness.
print('unaware occurrences: %d' % (len(unawareResults),))

unaware occurrences: 1


## Processing a FastQ File

In [28]:
from geneReader_Q import geneReader_Q

# FastQ input file for this section.
filename = 'ERR037900_1.first1000.fastq'

# geneReader_Q receives the filename, not this handle, so the handle is merely
# held open around the call. The `with` block closes it even when geneReader_Q
# raises; afterwards `data` is still bound (to the closed file), as before.
with open ( filename, 'r' ) as data :

    reads = geneReader_Q ( filename )

In [29]:
import collections

from collections import Counter

# Read the FastQ file and, for every read, record where its last "N"
# (unidentified base) sits, so the faulty read cycle can be located.
readLines = geneReader_Q(filename)

letter = "N"

# str.rfind gives the highest index at which `letter` occurs, or -1 when the
# read contains none, so nPos ends up with exactly one entry per read.
# NOTE(review): a read holding several N's contributes only its last position,
# and reads without any N contribute -1 - confirm this is the intended tally.
nPos = [read.rfind(letter) for read in readLines]

### Analysis of reads to discover the faulty cycle.

In [30]:
import pandas as pd

# Wrap the per-read N positions in a pandas Series.
nPos2 = pd.Series(nPos)

# Character frequencies across all reads. Every character of every read is
# counted, so line terminators show up in the tally alongside the bases.
count = collections.Counter("".join(readLines))
print(count)

# Frequency of each recorded N position, then its keys and its values.
positionCounts = Counter(nPos2)
print(positionCounts)
print(positionCounts.keys())
print(positionCounts.values())

# Length of the first read as returned by the reader.
print(len(readLines[0]))

# Leave the Series as the cell's displayed value.
nPos2

Counter({'C': 29665, 'A': 24057, 'G': 22888, 'T': 22476, '\n': 1000, 'N': 914})
Counter({66: 901, -1: 97, 91: 1, 92: 1})
dict_keys([66, 91, -1, 92])
dict_values([901, 1, 97, 1])
101


0      66
1      91
2      66
3      66
4      66
       ..
995    66
996    66
997    66
998    66
999    66
Length: 1000, dtype: int64

_______

### Conclusion: 

The faulty cycle is very likely cycle 66 of 100 in the sequencer (position 66 when counting from 0). The analysis shows that over 900 of the 1000 reads contain an unidentifiable nucleotide (`N`) at that position.