# Introduction

Brian asked for RNA integrity numbers (RINs) for the long-read libraries and provided a spreadsheet that contains some, but not all, of them.

In [1]:
import pandas
from io import StringIO
from pathlib import Path
import sys
from tqdm import tqdm

In [2]:
# Put the encoded_client directory under ~/proj on the import path (only once),
# so that the import below can be resolved from it.
encoded_client_dir = Path("~/proj/encoded_client").expanduser()
EC = str(encoded_client_dir)
if sys.path.count(EC) == 0:
    sys.path.append(EC)
from encoded_client.encoded import ENCODED

# Client object used for all ENCODE portal requests in the cells that follow.
server = ENCODED("www.encodeproject.org")

In [3]:
# Load the spreadsheet of long read libraries; the path is relative to the
# notebook's working directory.
sheet = pandas.read_excel("RIN hunt long read libraries.xlsx")
# Preview the first rows to see which columns are available.
sheet.head()

Unnamed: 0,index,ENCODE_experiment_id,species,dataset,sample_display,tissue_or_cell_line,document_urls,platform,Unnamed: 8,RIN,...,ENCODE_experiment_id.1,species.1,dataset.1,sample_display.1,age,sex,genotype,tissue_or_cell_line.1,platform.1,RIN.1
0,59,ENCSR834DQL,human,hepg2_2_1,HepG2,cell_line,https://www.encodeproject.org/documents/7ec9d6...,Pacific Biosciences Sequel,,,...,ENCSR172GXL,mouse,f1219_1_1,F121-9,,f,129/sv/cast,cell_line,Pacific Biosciences Sequel II,
1,60,ENCSR834DQL,human,hepg2_2_2,HepG2,cell_line,https://www.encodeproject.org/documents/7ec9d6...,Pacific Biosciences Sequel,,,...,ENCSR172GXL,mouse,f1219_1_2,F121-9,,f,129/sv/cast,cell_line,Pacific Biosciences Sequel II,
2,86,ENCSR589FUJ,human,k562_2_1,K562,cell_line,https://www.encodeproject.org/documents/7ec9d6...,Pacific Biosciences Sequel,,,...,ENCSR172GXL,mouse,f1219_1_3,F121-9,,f,129/sv/cast,cell_line,Pacific Biosciences Sequel II,
3,87,ENCSR589FUJ,human,k562_2_2,K562,cell_line,https://www.encodeproject.org/documents/7ec9d6...,Pacific Biosciences Sequel,,,...,ENCSR418ZYU,mouse,c2c12_myoblast_2_1,C2C12,,f,c3h,cell_line,Pacific Biosciences Sequel,
4,52,ENCSR575LWI,human,heart_left_ventricle_2_1,Heart,tissue,https://www.encodeproject.org/documents/0a54a6...,Pacific Biosciences Sequel II,,,...,ENCSR418ZYU,mouse,c2c12_myoblast_2_2,C2C12,,f,c3h,cell_line,Pacific Biosciences Sequel,


In [4]:
# Fetch a single experiment (the accession in the first spreadsheet row) to
# explore what the returned object looks like.
exp = server.get_experiment("ENCSR834DQL")


In [5]:
# Look up the RIN on the first replicate's library. .get() is used instead of
# indexing, presumably because the key can be absent (no value is displayed
# for this library).
exp.replicates[0].library.get("rna_integrity_number")

In [6]:
# Accession of the biosample attached to the first replicate's library.
exp.replicates[0].library["biosample"]["accession"]

'ENCBS667GKM'

In [7]:
# List the field names available on a replicate record.
list(exp.replicates[0].keys())

['library',
 'submitted_by',
 'technical_replicate_number',
 'experiment',
 'status',
 'schema_version',
 'date_created',
 'aliases',
 'biological_replicate_number',
 '@id',
 '@type',
 'uuid',
 'files']

In [8]:
# Search the portal for the biosample accession; the matching objects are read
# from graph["@graph"] in the next cell.
graph = server.search_jsonld(searchTerm="ENCBS667GKM")

In [9]:
# Show the accession of every object returned by the search, one per line.
for hit in graph["@graph"]:
    print(hit["accession"])

ENCBS667GKM
ENCSR834DQL


In [10]:

# Distinct experiment accessions from the spreadsheet's ENCODE_experiment_id column.
# NOTE: the crawl cell that follows reassigns `accessions` from the same column
# before it starts, so this cell is only needed for interactive inspection.
accessions = set(sheet["ENCODE_experiment_id"].unique())


In [11]:
# Crawl the ENCODE portal outward from the experiments listed in the spreadsheet.
#
# For every experiment popped from `accessions`, record one tuple per replicate
#   (experiment, biosample, bio_rep, tech_rep, library, alias, rin)
# under its biosample accession in `biosamples`, and queue every other
# experiment that a search for that biosample returns. The loop ends when no
# unvisited experiment is left.
#
# experiment_cache: experiment accession -> experiment object already fetched.
# biosamples: biosample accession -> list of the tuples described above.
#
# NOTE(review): the sheet preview also shows an "ENCODE_experiment_id.1" column
# (mouse rows) that is not used as a seed here - confirm that is intended.
experiment_cache = {}
biosamples = {}
# dropna() keeps blank spreadsheet cells (NaN) from being requested as accessions.
accessions = set(sheet["ENCODE_experiment_id"].dropna().unique())
while len(accessions) > 0:
    experiment_id = accessions.pop()
    if experiment_id not in experiment_cache:
        experiment = server.get_experiment(experiment_id)
        experiment_cache[experiment_id] = experiment

        for replicate in experiment.replicates:
            library = replicate.library
            # A library may have no aliases at all; treat that as an empty list
            # instead of failing on the lookup or on [0].
            aliases = library.get("aliases") or []
            if len(aliases) > 1:
                print("{} many aliases handle it: {}".format(experiment_id, aliases))

            library_id = library["accession"]
            # Only the first alias is kept; None when the library has none.
            alias = aliases[0] if aliases else None
            rin = library.get("rna_integrity_number")
            biosample_id = library["biosample"]["accession"]
            bio_rep = replicate["biological_replicate_number"]
            tech_rep = replicate["technical_replicate_number"]

            # add related experiments
            # A biosample already present in `biosamples` was searched before, and
            # every experiment that search returned is either queued or cached,
            # so repeating the network request cannot add anything new.
            if biosample_id not in biosamples:
                graph = server.search_jsonld(searchTerm=biosample_id)
                for row in graph["@graph"]:
                    if row["@type"][0] == "Experiment" and row["accession"] not in experiment_cache:
                        accessions.add(row["accession"])

            biosamples.setdefault(biosample_id, []).append((experiment_id, biosample_id, bio_rep, tech_rep, library_id, alias, rin))
    # Progress: experiments still queued, experiments fetched, biosamples seen.
    print(len(accessions), len(experiment_cache), len(biosamples))

37 1 3
46 2 6
45 3 8
55 4 10
62 5 13
61 6 13
62 7 15
62 8 16
63 9 17
62 10 17
61 11 17
60 12 19
61 13 20
60 14 22
61 15 23
60 16 23
59 17 23
58 18 23
57 19 23
58 20 24
59 21 25
58 22 25
57 23 25
56 24 25
55 25 25
54 26 25
53 27 25
54 28 26
55 29 27
54 30 29
55 31 30
54 32 30
53 33 30
52 34 32
53 35 33
52 36 33
51 37 33
50 38 33
49 39 33
59 40 35
69 41 37
68 42 37
67 43 37
66 44 37
65 45 37
64 46 37
63 47 41
73 48 43
72 49 43
71 50 43
70 51 43
69 52 43
68 53 43
67 54 43
66 55 43
65 56 43
64 57 43
63 58 43
62 59 43
63 60 44
62 61 44
61 62 44
60 63 44
59 64 44
58 65 44
59 66 45
58 67 45
57 68 45
67 69 47
66 70 47
65 71 47
64 72 47
63 73 47
62 74 47
61 75 50
60 76 50
59 77 50
58 78 50
57 79 50
56 80 50
56 81 51
63 82 54
64 83 55
74 84 57
73 85 57
74 86 58
73 87 58
72 88 58
71 89 58
70 90 58
69 91 60
68 92 60
67 93 60
66 94 60
65 95 60
ENCSR309IKK many aliases handle it: ['ali-mortazavi:D210', 'roderic-guigo:D210']
ENCSR309IKK many aliases handle it: ['ali-mortazavi:D211', 'roderic-guigo:D2

In [13]:
# Inspect the collected records: keys are biosample accessions, values are lists of
# (experiment, biosample, bio_rep, tech_rep, library, alias, rin) tuples.
# NOTE: this displays the entire dictionary, which produces a very long output.
biosamples

{'ENCBS434FZQ': [('ENCSR470HYQ',
   'ENCBS434FZQ',
   1,
   1,
   'ENCLB175ASW',
   'ali-mortazavi:PB308_h9_chondrocytes',
   None),
  ('ENCSR153MHB',
   'ENCBS434FZQ',
   1,
   1,
   'ENCLB689NVY',
   'ali-mortazavi:WA09-chondrocytes-miRNAseq-ENC4-209',
   None),
  ('ENCSR774MGO',
   'ENCBS434FZQ',
   1,
   1,
   'ENCLB492DNB',
   'barbara-wold:22576',
   9)],
 'ENCBS524YKH': [('ENCSR470HYQ',
   'ENCBS524YKH',
   2,
   1,
   'ENCLB165ZXK',
   'ali-mortazavi:PB309_h9_chondrocytes',
   None),
  ('ENCSR153MHB',
   'ENCBS524YKH',
   2,
   1,
   'ENCLB183FIS',
   'ali-mortazavi:WA09-chondrocytes-miRNAseq-ENC4-210',
   None),
  ('ENCSR774MGO',
   'ENCBS524YKH',
   2,
   1,
   'ENCLB706JRN',
   'barbara-wold:22577',
   9)],
 'ENCBS668AFC': [('ENCSR470HYQ',
   'ENCBS668AFC',
   3,
   1,
   'ENCLB201KID',
   'ali-mortazavi:PB310_h9_chondrocytes',
   None),
  ('ENCSR153MHB',
   'ENCBS668AFC',
   3,
   1,
   'ENCLB671JIW',
   'ali-mortazavi:WA09-chondrocytes-miRNAseq-ENC4-211',
   None),
  ('ENC

In [17]:
# Flatten the per-biosample record lists into a single table, then reorder the
# columns so the biosample accession comes first.
record_fields = ["experiment", "biosample", "biorep", "techrep", "library", "alias", "rin"]
report_order = ["biosample", "experiment", "library", "biorep", "techrep", "alias", "rin"]
flat_records = [record for records in biosamples.values() for record in records]
results = pandas.DataFrame(flat_records, columns=record_fields)[report_order]

In [18]:
# Write the table to an Excel file, leaving out the DataFrame index column.
# NOTE(review): /dev/shm is an absolute, machine-specific location (presumably used
# as scratch space) - confirm the file gets copied somewhere durable.
results.to_excel('/dev/shm/rins.xlsx', index=False)