In [1]:
import pandas as pd 
import altair as alt 
import os 
from glob import glob

import numpy as np

In [2]:
# Load every per-pool demultiplexing table, attach the barcode annotations,
# and combine everything into a single frame (one row per well and pool).
DEMUX_SUFFIX = ".demux.txt"
BARCODE_SUFFIX = ".barcode.txt"


def _load_pool_results(demux_path):
    """Read one `<pool>.demux.txt` table and join its `<pool>.barcode.txt` annotations.

    Parameters
    ----------
    demux_path : str
        Path to a comma-separated demux table whose first column holds the
        per-well file name (e.g. ``A1.fastq``).

    Returns
    -------
    pandas.DataFrame
        The demux table indexed by well, with `pool`, `pool_name`, `barcode`,
        `statut` and `comment` columns added.
    """
    demux_df = pd.read_csv(demux_path, sep=",")
    # The first column contains the file name but was written without a header.
    demux_df = demux_df.rename(columns={demux_df.columns[0]: "filename"})

    # The well id is everything before the first dot of the file name.
    demux_df["well"] = demux_df["filename"].str.split(".", expand=True)[0]

    # Pool name: the file name without its suffix; `pool_name` drops the
    # optional "-<index>" part ("193-702" -> "193").
    pool = os.path.basename(demux_path).replace(DEMUX_SUFFIX, "")
    demux_df["pool"] = pool
    demux_df["pool_name"] = pool.split("-")[0]

    # Join with the barcode info; the barcode file is tab-separated, has no
    # header, and its first column is the well id.
    barcode_path = demux_path.replace(DEMUX_SUFFIX, BARCODE_SUFFIX)
    barcode_df = pd.read_csv(barcode_path, sep="\t", header=None).set_index(0)
    barcode_df.columns = ["barcode", "statut", "comment"]

    return demux_df.set_index("well").join(barcode_df)


# Sorted so that the row order (and the CSV written below) is reproducible;
# the pattern requires the exact suffix that is stripped/replaced above.
demux_paths = sorted(glob(f"results/*{DEMUX_SUFFIX}"))
if not demux_paths:
    raise FileNotFoundError(f"No 'results/*{DEMUX_SUFFIX}' file found")

all_df = pd.concat([_load_pool_results(path) for path in demux_paths]).reset_index(drop=False)
all_df.to_csv("all_results.txt")

all_df

Unnamed: 0,well,filename,TOTAL,FORWARD_PRIMER_N1,FORWARD_PRIMER_N2,FORWARD_PRIMER_HUMAN,N1,N2,HUMAIN,SPIKE_N1,...,pN2,pH,pSN1,pSN2,pSH,pool,pool_name,barcode,statut,comment
0,A1,A1.fastq,19574,10576,5761,191,11310,4765,16,350,...,0.243435,0.000817,0.017881,0.020231,0.006335,191,191,CCCGTAGTTATTTAT,pos,Adil10
1,B1,B1.fastq,5960,2486,1810,852,1609,811,48,857,...,0.136074,0.008054,0.143792,0.085403,0.104866,191,191,TCGACCACTACTAAT,pos,Adil100
2,C1,C1.fastq,1789,245,925,459,56,165,18,162,...,0.092230,0.010061,0.090553,0.232532,0.200671,191,191,TAGTTTATCTTAAAT,pos,Adil1000
3,D1,D1.fastq,720,154,327,161,36,59,16,103,...,0.081944,0.022222,0.143056,0.198611,0.162500,191,191,GTTAGTATGATCTAT,pos,Adil10000
4,E1,E1.fastq,2022,286,1102,460,59,144,26,193,...,0.071217,0.012859,0.095450,0.266568,0.173096,191,191,TTTAGATGTACTATT,pos,Adil100000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
331,D3,D3.fastq,54420,23637,19975,6785,9467,10709,1163,14174,...,0.196784,0.021371,0.260456,0.168008,0.096564,193-702,193,AAACTATCTAATTAT,pos,A100
332,E3,E3.fastq,44577,6634,28803,6237,2421,16047,1119,4159,...,0.359984,0.025103,0.093299,0.280728,0.107455,193-702,193,TTAATTGATAGTTAT,pos,A100
333,F3,F3.fastq,38055,15439,14992,4881,1979,5189,371,12995,...,0.136355,0.009749,0.341479,0.248929,0.112048,193-702,193,CTCCTATCGATAAAT,pos,B100
334,G3,G3.fastq,32777,8176,19462,3013,1826,10004,302,6231,...,0.305214,0.009214,0.190103,0.283644,0.078744,193-702,193,TTTGATATTTATTAT,pos,B100


In [3]:
# Read the fragment-size histogram (`out.hist`) of every 193 pool directory
# and draw one line chart per pool.
pool_hists = []
for hist_path in glob("193*/out.hist"):
    # Tab-separated, no header; the directory name identifies the pool.
    pool_hist = pd.read_csv(hist_path, sep="\t", header=None).assign(
        name=os.path.dirname(hist_path)
    )
    pool_hist.columns = ["size", "count", "name"]
    pool_hists.append(pool_hist)

size_df = pd.concat(pool_hists)

size_chart = alt.Chart(size_df).mark_line()
size_chart.encode(x="size", y="count", facet="name")

In [4]:
def _log_ratio(numerator, denominator):
    """Return log(numerator / denominator) element-wise.

    Ratios that are zero, negative, infinite or undefined (a zero count on
    either side) become NaN instead of +/-inf, so `np.log` no longer emits a
    RuntimeWarning and no infinities are stored in the frame.
    """
    ratio = numerator / denominator
    return np.log(ratio.where((ratio > 0) & np.isfinite(ratio)))


def _pool_193_scatter(x_field, y_field, x_axis, y_axis):
    """Scatter plot of two `all_df` columns for pool 193, coloured by status."""
    return (
        alt.Chart(all_df)
        .mark_point()
        .encode(
            x=alt.X(x_field, axis=x_axis),
            y=alt.Y(y_field, axis=y_axis),
            color="statut",
            tooltip="comment",
        )
        .transform_filter(alt.datum.pool_name == "193")
    )


# Log ratio of target reads over spike-in reads, for N1 and N2.
all_df["C1"] = _log_ratio(all_df["N1"], all_df["SPIKE_N1"])
all_df["C2"] = _log_ratio(all_df["N2"], all_df["SPIKE_N2"])

# Fraction of all reads that carry the N1 / N2 forward primer.
all_df["F1"] = all_df["FORWARD_PRIMER_N1"] / all_df["TOTAL"]
all_df["F2"] = all_df["FORWARD_PRIMER_N2"] / all_df["TOTAL"]

chart1 = _pool_193_scatter(
    "C1",
    "C2",
    alt.Axis(title="log(N1/SPIKE_N1)"),
    alt.Axis(title="log(N2/SPIKE_N2)"),
)

# F1/F2 are fractions in [0, 1]; the "%" format makes the ticks agree with
# the percent sign in the axis titles.
chart2 = _pool_193_scatter(
    "F1",
    "F2",
    alt.Axis(title="Forward N1 %", format="%"),
    alt.Axis(title="Forward N2 %", format="%"),
)


chart1 | chart2


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [5]:
# Stacked bar chart: total read count per well for pool 193, one colour per
# sequencing pool.
only_pool_193 = alt.datum.pool_name == "193"

chart1 = (
    alt.Chart(all_df)
    .transform_filter(only_pool_193)
    .mark_bar()
    .encode(
        x=alt.X("well:N"),
        y=alt.Y("sum(TOTAL)"),
        color="pool:N",
    )
)

chart1

In [6]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
# `plot_confusion_matrix` was removed in scikit-learn 1.2; `ConfusionMatrixDisplay`
# draws the same figure and is available in old and new releases alike.
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

# Read-count columns used as model inputs.
FEATURE_COLUMNS = [
    "TOTAL",
    "FORWARD_PRIMER_N1",
    "FORWARD_PRIMER_N2",
    "N1",
    "N2",
    "HUMAIN",
    "SPIKE_HUMAN",
    "SPIKE_N1",
    "SPIKE_N2",
]

# Keep only the wells whose pool name contains "193".
df = all_df.query("pool.str.contains('193')", engine="python").reset_index(drop=True).copy()

# "0" and "inconnu" are treated as negatives; anything else that is not
# "pos"/"neg" would silently turn into NaN in the mapping below and only fail
# later inside `fit`, so it is rejected here with an explicit message.
labels = df["statut"].replace({"0": "neg", "inconnu": "neg"})
unknown_labels = labels[~labels.isin(["pos", "neg"])]
if not unknown_labels.empty:
    raise ValueError(f"Unexpected 'statut' values: {unknown_labels.unique().tolist()}")
df["statut"] = labels.map({"pos": 1, "neg": 0})

features = df[FEATURE_COLUMNS].copy()

target = df["statut"].copy()


model = make_pipeline(LogisticRegression(max_iter=1000))

model.fit(features, target)
# NOTE: this score is computed on the training data itself, not on held-out data.
print( "Accurency", model.score(features, target))

predicted = model.predict(features)
predicted_proba = model.predict_proba(features)

ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(target, predicted, labels=model.classes_),
    display_labels=model.classes_,
).plot()

# From here on `features` also carries the labels and predictions (for
# plotting); it must not be passed to `model.fit` / `model.predict` again
# without re-running this cell.
features["observed"] = target
features["predicted"] = predicted
features["predicted_proba"] = predicted_proba[:, 1] * 100  # P(class 1), in percent

features["well"] = df["well"]
features["comment"] = df["comment"]

features["match"] = features["predicted"] == features["observed"]

# Target/spike-in ratios for N1 (x) and N2 (y); colour tells whether the
# prediction agrees with the observed status.
alt.Chart(features).mark_point().encode(
    x="x:Q", y="y:Q", color="match:N", tooltip="comment"
).transform_calculate(
    x='datum.N1/datum.SPIKE_N1',
    y='datum.N2/datum.SPIKE_N2',
).interactive()


Accurency 0.90625


In [8]:
# Load the per-well fragment-size histograms of every 193 pool.
# NOTE: this rebinds `all_df`, which held the demux results in the cells
# above; those cells must be re-run from the top before being used again.
well_hists = []
# Sorted so that the row order of the combined frame is reproducible.
for hist_path in sorted(glob("193*/demux/*.hist")):
    # Raw string: "\s" is an invalid escape sequence in a normal string literal.
    well_hist = pd.read_csv(hist_path, sep=r"\s+", header=None)
    # First path component is the pool directory (independent of the OS separator).
    well_hist["pool"] = os.path.normpath(hist_path).split(os.sep)[0]
    # File name without the ".hist" extension is the well id.
    well_hist["name"] = os.path.splitext(os.path.basename(hist_path))[0]
    well_hist.columns = ["size", "read_count", "pool", "name"]

    well_hists.append(well_hist)

if not well_hists:
    raise FileNotFoundError("No '193*/demux/*.hist' file found")

all_df = pd.concat(well_hists)

all_df

Unnamed: 0,size,read_count,pool,name
0,130,1,193-701,A1
1,131,3,193-701,A1
2,132,4,193-701,A1
3,133,29,193-701,A1
4,134,192,193-701,A1
...,...,...,...,...
60,83,2723,193-702,H3
61,84,23,193-702,H3
62,85,2,193-702,H3
63,88,1,193-702,H3


In [30]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets


def plot_well_size(name):
    """Line chart of read count versus size for one well.

    Rows are taken from the module-level `all_df`, restricted to pools whose
    name contains "701" and to the well called `name`; the chart is titled
    with `name`, sized 500x150 and made interactive.
    """
    in_pool_701 = all_df["pool"].str.contains("701")
    is_requested_well = all_df["name"] == name
    select = all_df[in_pool_701 & is_requested_well]

    chart = alt.Chart(select, title=name).mark_line().encode(x="size", y="read_count")
    return chart.properties(width=500, height=150).interactive()
  
    
# Dropdown listing every well name present in `all_df`; picking one redraws the chart.
well_dropdown = widgets.Dropdown(options=all_df["name"].unique())
interact(plot_well_size, name=well_dropdown)


interactive(children=(Dropdown(description='name', options=('A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'C1', 'C2', 'C…

<function __main__.plot_well_size(name)>