### Data Analytics Assignment - 3
### Association Rule Mining

### Group Members:
R Mukesh (CED15I002),
Kiran Robert (EVD15I007),
Gajaraj G. (MPD15I011)

### Read the DNA Sequences data

In [1]:
from Bio import SeqIO

# Add lib directory to system path
# (presumably where the `gsp` module imported in a later cell lives -- verify)
import sys
sys.path.append('../lib/')

# SeqIO.parse returns a one-shot iterator over the records in the FASTA file;
# it is exhausted by the comprehension below.
dna_sequences_records = SeqIO.parse('../datasets/DNASequences.fasta', 'fasta')

# Extract each DNA sequence as a plain Python string.
# str(seq) is the public conversion; the private `Seq._data` attribute is an
# implementation detail (it holds bytes, not str, in recent Biopython
# releases) and must not be relied upon.
dna_sequences = [str(record.seq) for record in dna_sequences_records]

### Perform subsequence mining using GSP algorithm

In [2]:
# Keep only the first 1000 characters of every DNA sequence string
dna_sequences_trunc = [sequence[:1000] for sequence in dna_sequences]

# Perform subsequence mining using GSP
from gsp import GSP

min_support = 0.8
frequent_subsequences = GSP(dna_sequences_trunc).search(min_support)

# Present the data: the entry at position k of the result holds the frequent
# subsequences of length k + 1, keyed by the subsequence itself.
for length, subsequences_of_length in enumerate(frequent_subsequences, start=1):
    print("Frequent Subsequences of length: {}".format(length))
    for subsequence in subsequences_of_length:
        # each key is a sequence of single characters; glue them for display
        print(''.join(subsequence))
    print()

Frequent Subsequences of length: 1
T
G
A
C

Frequent Subsequences of length: 2
GC
CA
GG
AT
TT
GA
TG
TC
AA
TA
AC
CT
CG
AG
GT
CC

Frequent Subsequences of length: 3
TAC
GCA
TCC
GCG
AAC
CTA
CGA
GCC
TTT
AAG
TGT
CGG
ATT
GGC
ACT
GTG
CGC
AGA
GTC
GTA
TTA
CCC
TAG
CCG
GAA
GAC
TAA
ACA
GAG
CTC
AGT
AGG
AGC
CTG
GGA
TCT
TCG
GAT
GTT
ACG
GGG
ATG
CAC
CAA
ACC
AAA
ATC
TAT
CAG
TTC
CAT
GCT
GGT
CCA
TGA
ATA
CGT
TGG
CCT
TTG
AAT
TGC
CTT
TCA

Frequent Subsequences of length: 4
CCTA
CTCA
ACTT
AGCA
CAGC
ACAA
ACAC
CCCT
ATAG
AAAT
TCCA
GCTT
TTTT
TGTG
AAAA
ATCC
ATAA
ATCA
TGCC
AAGT
GACC
GGTG
ACCT
CTTT
CACC
TCAT
TATC
GTAC
CTTG
GAGT
TTGT
TCAG
CCAC
CCAA
GGTC
TCCT
GTTT
GTGT
TACT
AGAA
CTGC
TAGG
TTAG
CTTC
AGTC
GGCA
GATT
GATG
TGGT
CCAG
GAGC
ATTA
GGGA
CAAC
CCCC
CAAG
GTGG
GTTG
TCCC
TCTA
AGAC
GTAG
GGAA
GATC
TTCC
AACA
CAGT
GAGG
GCCA
GCTC
AGGC
AACT
AGCT
GAAC
GGTT
ATCT
AAGG
TACC
ACTA
GGGC
CTAC
GCAC
TCTT
GAGA
GCTG
TTGC
GTCT
AAAG
CAGG
AGTA
TCAC
TTCT
GACT
CACT
GTCC
ACTG
GGGG
CTAG
TGCT
CACA
CCCA
GAAT
TCAA
CCTG
TGTT
ATGA
TAAG
GTAA
ACAT


### Analyzing relationships between occurrences of proteins using Association Rule Mining

#### Generating the proteins in DNA sequences

In [3]:
# Re-open the FASTA file: SeqIO.parse yields a one-shot iterator, so a fresh
# one is needed for this pass over the records.
dna_sequences_records = SeqIO.parse('../datasets/DNASequences.fasta', 'fasta')

# Using the standard table
table_number = 1
# Translated fragments shorter than this many residues are discarded
min_protein_length = 50

# Extract the proteins from all dna_sequences:
# one list of distinct protein strings per DNA record.
dna_sequences_proteins = []

for dna_sequence in dna_sequences_records:
    dna_sequence_proteins = set()

    # Translate both strands (forward and reverse complement) ...
    for nuc in (dna_sequence.seq, dna_sequence.seq.reverse_complement()):
        # ... in each of the three reading frames.
        for frame in range(3):
            # Largest multiple of 3 that fits after skipping `frame` bases,
            # so translate() only ever sees whole codons.
            length = 3 * ((len(nuc) - frame) // 3)
            translated = nuc[frame:frame + length].translate(table_number)
            # '*' separates the translated fragments; keep the long ones.
            for protein in translated.split('*'):
                if len(protein) >= min_protein_length:
                    # str() is the public way to get the text of a Seq; the
                    # private `_data` attribute holds bytes in recent
                    # Biopython releases.
                    dna_sequence_proteins.add(str(protein))

    dna_sequences_proteins.append(list(dna_sequence_proteins))

#### Finding association rules for occurrences of proteins (using FP-Growth)

In [4]:
import pyfpgrowth

# Parameters for FP-growth
# NOTE: this rebinds `min_support`, which the GSP cell set to 0.8; here the
# value 15 is passed as pyfpgrowth's support threshold.
min_support = 15
min_confidence = 0.8

patterns = pyfpgrowth.find_frequent_patterns(dna_sequences_proteins, min_support)
rules = pyfpgrowth.generate_association_rules(patterns, min_confidence)

# Each rule maps an antecedent to a (consequent, confidence) pair.
for antecedent, (consequent, confidence) in rules.items():
    print("{} => {}".format(antecedent, consequent))
    print("confidence = {}\n".format(confidence))

('ALLLIMFIPLIFLPQLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH', 'GWENRPIGRESQCLSETQESSLSPHAQFLLVSLNLSCNLDTNLPRASPPTSSTFTLPHRAVTADFSSGVRCTMVSV', 'IVTDVRGFILLIAATIQLPFCFYFMVGIRLDYSESKLGPFANHVHTSYLPPTAPGQRAGLCAGPSLWQRIHPTSAGCLSESGGWCG', 'SCSYLLSSSHSSWATCWSVCWPITLAKNSPHQCRLPIRKWWLVWLMPWPTSITKLAFLLSNFY') => ('KLDSKKASLVILVGQGISHTSHHFLIGSLHWWGEFFAKVMGQHTDQHVAQELWEEDKRYEHD', 'QPQTDTMVHLTPEEKSAVTALWGKVNVDEVGGEALGRLVSRLQDRFKETNRNWACGDREDSWVSDRH')
confidence = 1.0

('ALLLIMFIPLIFLPQLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH', 'IVTDVRGFILLIAATIQLPFCFYFMVGIRLDYSESKLGPFANHVHTSYLPPTAPGQRAGLCAGPSLWQRIHPTSAGCLSESGGWCG', 'KLDSKKASLVILVGQGISHTSHHFLIGSLHWWGEFFAKVMGQHTDQHVAQELWEEDKRYEHD') => ('QPQTDTMVHLTPEEKSAVTALWGKVNVDEVGGEALGRLVSRLQDRFKETNRNWACGDREDSWVSDRH',)
confidence = 1.0

('ALLLIMFIPLIFLPQLLGNVLVCVLAHHFGKEFTPPVQAAYQKVVAGVANALAHKYH', 'IVTDVRGFILLIAATIQLPFCFYFMVGIRLDYSESKLGPFANHVHTSYLPPTAPGQRAGLCAGPSLWQRIHPTSAGCLSESGGWCG', 'KLDSKKASLVILVGQGISHTSHHFLIGSLHWWGEFFAKVMGQHTDQHVAQELWEEDKRYEHD', 'SCSYLLSSSHS