In [2]:
# --- Google Drive locations (paths under the mounted Colab drive) ---
BASE_PATH = '/content/drive/MyDrive/Health/'
DATASET_PATH = f'{BASE_PATH}dataset/'
VOCAB_PATH = f'{BASE_PATH}vocab.txt'

TEST_SET_PATH = f'{BASE_PATH}test_1000-1099.fas'

# Prefix for Google Drive download links; a file id is appended to it.
URL_DRIVE = 'https://drive.google.com/uc?id='

# BASE DATASET (inactive alternative to the K512 ids below)
# G_TRAIN_FILE = URL_DRIVE + '1-1eXBY8yHHmWDlzr-gZXAX8POor-P1xs'
# G_DEV_FILE   = URL_DRIVE + '1bo_JKi6SbcRH8l-2GhIcdl1AnWvNaOYK'
# G_TEST_FILE  = URL_DRIVE + '1-2zqIzM3FgsfTi4jJmC2wBb9KHjKk_xx'

# K512 DATASET: remote download links
G_TRAIN_FILE = f'{URL_DRIVE}1-ESFbZab0N7npe1Q01549CqEKLfQoCDd'
G_VALID_FILE = f'{URL_DRIVE}1-GSyPyObFCwFqa8amuizDIQbE51sPAKw'
G_TEST_FILE = f'{URL_DRIVE}1-8EQbga_UpaV3wwBxuUc7dzij_LJzQ3a'

# Local destinations for the downloaded dataset files
TRAIN_FILE = 'dataset/train/train.tsv'
VALID_FILE = 'dataset/train/dev.tsv'
TEST_FILE = 'dataset/test/dev.tsv'

# ID_LABELS: remote download links
TRAIN_ID_LABELS = f'{URL_DRIVE}1_D-b0-R4ybQUqzjA8rHxjvNQxBF36V-G'
VALID_ID_LABELS = f'{URL_DRIVE}1-P13Uomv9SeBQondNdqNIKsYgWVkpbrj'
TEST_ID_LABELS = f'{URL_DRIVE}1SRO5FxufHQcjGHFTXHuXy-M_xDAkqUWm'

# Local destinations for the downloaded id/label files
TRAIN_ID_LABELS_FILE = 'dataset/train/train_id_labels.txt'
VALID_ID_LABELS_FILE = 'dataset/train/dev_id_labels.txt'
TEST_ID_LABELS_FILE = 'dataset/test/dev_id_labels.txt'

K = 6
# NOTE(review): the original comment read "512 sequences"; 3584 == 512 * (K + 1),
# so this may mean 512 k-mers plus one separator each. Confirm the intended unit.
SPLIT_SIZE = 3584

In [3]:
!pip install gdown

In [12]:
import numpy as np 
import pandas as pd 
import gdown

import glob
import os

import collections
import matplotlib.pyplot as plt

In [5]:
# Create the local working directories; exist_ok makes the cell safe to re-run.
for dir_name in ('dataset', 'dataset/train', 'dataset/test', 'output'):
    os.makedirs(dir_name, exist_ok=True)

In [6]:
# Fetch the train / validation / test splits from Google Drive to their local paths.
gdown.download(url=G_TRAIN_FILE, output=TRAIN_FILE, quiet=False)
gdown.download(url=G_VALID_FILE, output=VALID_FILE, quiet=False)
gdown.download(url=G_TEST_FILE, output=TEST_FILE, quiet=False)

In [7]:
# Read the tab-separated training file; its first line is the header row.
df = pd.read_csv(TRAIN_FILE, sep='\t', header=0)
# Discard the final row (a single row, selected by its index label).
# NOTE(review): the reason the last row is dropped is not visible here — confirm.
df = df.drop(index=df.index[-1:])
df

In [31]:
def getCounter(seqId, df=df):
    """Count the whitespace-separated tokens (k-mers) of every row with a given id.

    Args:
        seqId: value matched against the ``'id'`` column of ``df``.
        df: DataFrame with ``'id'`` and ``'sequence'`` columns. The default is
            the module-level ``df`` as it existed when this function was
            defined (default arguments are bound at definition time), so
            re-run this cell after reloading ``df``.

    Returns:
        collections.Counter mapping each token to its number of occurrences
        across all matching rows. Empty if no row matches.
    """
    # Select only the needed column instead of materialising whole rows
    # with iterrows(), which is slow and builds a Series per row.
    sequences = df.loc[df['id'] == seqId, 'sequence']

    counts = collections.Counter()
    for sequence in sequences:
        # update() tallies tokens in encounter order, same as counting a flat list.
        counts.update(sequence.split())
    return counts

def printMostFrequent(seqId, N=20, df=df):
    """Print a short summary and plot the N most frequent k-mers for one id.

    Args:
        seqId: value matched against the ``'id'`` column of ``df``.
        N: maximum number of k-mers to show.
        df: DataFrame passed through to ``getCounter``. The default is the
            module-level ``df`` bound when this function was defined.

    Returns:
        None. Prints two lines and shows a horizontal bar chart.
    """
    counter = getCounter(seqId, df)
    top = counter.most_common(N)

    names = [kmer for kmer, _ in top]
    values = [count for _, count in top]

    print("\nSeqID: " + str(seqId))
    print(str(N) +" most common kmers (total " + str(len(counter.keys())) + ")")
    # Position the bars by how many were actually returned: most_common(N)
    # yields fewer than N entries when the id has fewer than N distinct
    # k-mers, and range(N) would then not match the length of `values`.
    plt.barh(range(len(values)), values, tick_label=names)
    plt.show()

In [34]:
N = 20   # how many of the most frequent k-mers to plot per id
X = 293  # first id to inspect (matched against the 'id' column)
Y = 715  # second id to inspect

# Pass N explicitly: previously it was defined but never used, so changing it
# had no effect and the function's own default was always applied.
printMostFrequent(X, N)
printMostFrequent(Y, N)

In [None]:
# NOTE(review): this cell expects 'Release Year' and 'IMDB Rating' columns in
# `df`, whereas the rest of this notebook reads 'id' and 'sequence'. It looks
# carried over from a different analysis — confirm it belongs here.

# Every year between the first and last one present, so gaps show up as zeros.
x_axes = range(df['Release Year'].min(), df['Release Year'].max() + 1)
# Rows: ratings, columns: years. reindex adds all-zero columns for missing
# years (previously this referenced an undefined name `years`).
cross = pd.crosstab(df['IMDB Rating'], df['Release Year']).reindex(columns=x_axes, fill_value=0)

fig, ax = plt.subplots(figsize=(30, 5))
# Drawn with matplotlib (already imported) because seaborn was never imported
# in this notebook. pcolormesh puts row 0 at the bottom, which matches the
# earlier heatmap-plus-invert_yaxis orientation.
mesh = ax.pcolormesh(cross.to_numpy())
fig.colorbar(mesh, ax=ax, label='Count')
# Centre one tick on every cell and label it with the crosstab's own values.
ax.set_xticks(np.arange(len(cross.columns)) + 0.5)
ax.set_xticklabels(cross.columns)
ax.set_yticks(np.arange(len(cross.index)) + 0.5)
ax.set_yticklabels(cross.index)
ax.set_xlabel('Release Year')
ax.set_ylabel('IMDB Rating')
plt.show()

In [16]:
# Fetch the id/label file for the training split to its local path.
gdown.download(url=TRAIN_ID_LABELS, output=TRAIN_ID_LABELS_FILE, quiet=False)

In [17]:
def load_id_labels(path):
  """Read the integer labels from a tab-separated ``id<TAB>label`` file.

  The first line of the file is always skipped (presumably a header row —
  confirm against the file). Reading stops at end of file or at the first
  later line that has fewer than two tab-separated fields, e.g. a blank line.

  Args:
    path: path of the text file to read.

  Returns:
    List of labels as ints, in file order. The id field is not returned.

  Raises:
    ValueError: if the second field of a line is not an integer.
  """
  labels = []
  # `handle` rather than `input`, which would shadow the builtin.
  with open(path, 'r') as handle:
    next(handle, None)  # discard the first line; no-op on an empty file
    for line in handle:
      fields = line.split('\t')
      if len(fields) < 2:
        # Short line: stop reading, ignoring anything after it.
        break
      # int() tolerates the trailing newline on the last field.
      labels.append(int(fields[1]))

  return labels

# Parse the downloaded training labels and report how many were read.
train_labels = load_id_labels(path=TRAIN_ID_LABELS_FILE)
print(len(train_labels))

In [22]:
# Wrap the label list in a one-column frame so it can be filtered and displayed.
labels = pd.DataFrame({'label': train_labels})
labels

In [23]:
# Show only the rows whose label is 0.
labels.loc[labels['label'] == 0]