In [1]:
import pandas as pd
import seaborn as sns
import plotly
import plotly.express as px
import plotly.graph_objects as go
plotly.offline.init_notebook_mode(connected=True)
import matplotlib.pyplot as plt
from scipy import stats
from matplotlib_venn import venn2, venn3
import os
import numpy as np

In [2]:
# Global options

# Seaborn theme.
# NOTE(review): `sns.set(..., style="ticks")` also selects a style, so the
# "white" style set just before it presumably does not survive — confirm
# which of the two is intended.
sns.set_style("white")
sns.set(style="ticks", font_scale=2)

# Shared settings; none of them is read in the cells visible here, so they are
# presumably consumed by plotting cells elsewhere in the notebook.
height, aspect = 15, 1
x_jitter, y_jitter = 0, 0

# xlim=(0,14)
# ylim=(-14, 10)
alpha, marker, legend = 1, '.', True

# Datasets

In [3]:
# Identifier list read from a header-less file; its single column is labelled
# "Name" (the column assignment fails if the file has any other column count).
wago_1_ONLY_specific_path = "../../comparisons/list_of_blue_points_IP_WAGO_1_ONLY_specific.txt"
wago_1_ONLY_specific = pd.read_csv(
    filepath_or_buffer=wago_1_ONLY_specific_path,
    header=None,
)
wago_1_ONLY_specific.columns = ["Name"]

In [4]:
# Identifier list read from a header-less file; its single column is labelled
# "Name" (the column assignment fails if the file has any other column count).
wago_1_dpf_3_null_specific_path = "../../comparisons/list_of_red_points_IP_WAGO_1_dpf_3_null_specific.txt"
wago_1_dpf_3_null_specific = pd.read_csv(
    filepath_or_buffer=wago_1_dpf_3_null_specific_path,
    header=None,
)
wago_1_dpf_3_null_specific.columns = ["Name"]

In [5]:
# Identifier list read from a header-less file; its single column is labelled
# "Name" (the column assignment fails if the file has any other column count).
wago_1_path = "../../comparisons/list_of_other_color_IP_WAGO_1_specific.txt"
wago_1 = pd.read_csv(
    filepath_or_buffer=wago_1_path,
    header=None,
)
wago_1.columns = ["Name"]

In [6]:
# Load the edgeR result table (tab-separated, first row is the header) and keep
# only the ids of rows with FDR < 0.05 and a positive logFC.
# NOTE(review): judging by the path this is the csr-1 IP vs input comparison,
# so positive logFC presumably means enriched in the IP — confirm.
csr_1_ip_path = "../../04_csr_1_IP/results/filter/DE_htseq_count__alignment_sorted_filtered_unique_mappers__reverse__csr-1_input__csr-1_IP/DE_edgeR/final_table_FDR_low.tsv"
csr_1_ip = pd.read_csv(csr_1_ip_path, sep="\t", header=0)
csr_1_ip = csr_1_ip.loc[
    (csr_1_ip["FDR"] < 0.05) & (csr_1_ip["logFC"] > 0),
    ["id"],
].copy()
# Same column label as the other identifier lists.
csr_1_ip.columns = ["Name"]

# Count, per gene, the repeats that overlap its exons on the same strand

In [7]:
%%bash

# For every record of the exon GTF, count the repeat intervals that overlap it
# and write the GTF columns plus that count to genes_overlap_with_repeats.tsv.

# Fail fast on command errors, unset variables and failed pipeline stages.
set -euo pipefail

gtf="../../00_annotation/results/annotation/canonical_geneset.exons.gtf"
bed="../../00_annotation/results/annotation/ce_11_repeats.filtered.bed"

# -s  only count overlaps on the same strand
# -c  report, for each -a record, the number of overlapping -b records
# Expansions are quoted so a path containing spaces or glob characters stays
# a single argument.
bedtools intersect \
    -a "$gtf" \
    -b "$bed" \
    -s \
    -c > genes_overlap_with_repeats.tsv

In [8]:
# Table written by the bedtools cell: it has no header row, so name the ten
# columns here; the last one holds the overlap count.
df = pd.read_csv(
    "genes_overlap_with_repeats.tsv",
    sep="\t",
    header=None,
)
df.columns = [
    "seqname",
    "source",
    "feature",
    "start",
    "end",
    "score",
    "strand",
    "frame",
    "attribute",
    "counts",
]

In [9]:
# Pull gene_id and gene_biotype out of the GTF attribute string by key.
# Looking the attributes up by name (instead of by their position after
# splitting on ";") keeps the result correct when records carry a different
# number or order of attributes, and yields the bare value without the
# surrounding quotes or stray whitespace. Rows lacking the key get NaN.
df["gene_id"] = df["attribute"].str.extract(r'gene_id "([^"]+)"', expand=False)
df["gene_biotype"] = df["attribute"].str.extract(r'gene_biotype "([^"]+)"', expand=False)

In [10]:
# Sum the per-record overlap counts within each gene_id (gene_id becomes the index).
df_counts = df.groupby("gene_id")[["counts"]].sum()

In [11]:
# Gene ids whose summed overlap count is positive, in index order.
genes_overlap_with_repeats = [
    gene_id
    for gene_id, n_overlaps in df_counts["counts"].items()
    if n_overlaps > 0
]

In [12]:
# Number of gene ids whose summed overlap count is greater than zero.
len(genes_overlap_with_repeats)

1226

# Count, per repeat, the gene exons that overlap it on the same strand

In [13]:
%%bash

# For every repeat interval of the BED file, count the exon GTF records that
# overlap it and write the BED columns plus that count to
# repeats_overlap_with_genes.tsv.

# Fail fast on command errors, unset variables and failed pipeline stages.
set -euo pipefail

gtf="../../00_annotation/results/annotation/canonical_geneset.exons.gtf"
bed="../../00_annotation/results/annotation/ce_11_repeats.filtered.bed"

# -s  only count overlaps on the same strand
# -c  report, for each -a record, the number of overlapping -b records
# Expansions are quoted so a path containing spaces or glob characters stays
# a single argument.
bedtools intersect \
    -a "$bed" \
    -b "$gtf" \
    -s \
    -c > repeats_overlap_with_genes.tsv

In [14]:
# NOTE: `df` is rebound here; from this cell on it holds the repeat-centric
# table rather than the gene-centric one loaded earlier.
df = pd.read_csv(
    "repeats_overlap_with_genes.tsv",
    sep="\t",
    header=None,
)
df.columns = [
    "seqname",
    "start",
    "end",
    "repeat",
    "score",
    "strand",
    "counts",
]
# Keep only the repeat rows with a positive overlap count.
df_counts_collapsed = df.loc[df["counts"] > 0]

In [15]:
# Build one "seqname:start-end:strand" identifier per overlapping repeat.
# The start is shifted by +1 — presumably converting the 0-based BED start to
# a 1-based coordinate so the ids match the names in the comparison lists;
# confirm against how those lists were generated.
repeats_overlap_with_genes = [
    f"{seqname}:{int(start) + 1}-{end}:{strand}"
    for seqname, start, end, strand in zip(
        df_counts_collapsed["seqname"],
        df_counts_collapsed["start"],
        df_counts_collapsed["end"],
        df_counts_collapsed["strand"],
    )
]

In [16]:
# Number of repeat rows whose overlap count is greater than zero.
len(repeats_overlap_with_genes)

1340

# Annotation statistics for the three WAGO-1 lists

In [17]:
# Label every entry; the first matching rule wins:
#   "gene/repeat"  Name is in either overlap list
#   "repeat"       Name does not start with "WBGene"
#   "gene"         everything else
wago_1_ONLY_specific["annotation"] = np.select(
    [
        wago_1_ONLY_specific["Name"].isin(genes_overlap_with_repeats)
        | wago_1_ONLY_specific["Name"].isin(repeats_overlap_with_genes),
        ~wago_1_ONLY_specific["Name"].str.startswith("WBGene"),
    ],
    ["gene/repeat", "repeat"],
    default="gene",
)

In [18]:
# Number of entries per annotation label.
wago_1_ONLY_specific["annotation"].value_counts()

gene           303
repeat          49
gene/repeat     26
Name: annotation, dtype: int64

In [19]:
# Label every entry; the first matching rule wins:
#   "gene/repeat"  Name is in either overlap list
#   "repeat"       Name does not start with "WBGene"
#   "gene"         everything else
wago_1_dpf_3_null_specific["annotation"] = np.select(
    [
        wago_1_dpf_3_null_specific["Name"].isin(genes_overlap_with_repeats)
        | wago_1_dpf_3_null_specific["Name"].isin(repeats_overlap_with_genes),
        ~wago_1_dpf_3_null_specific["Name"].str.startswith("WBGene"),
    ],
    ["gene/repeat", "repeat"],
    default="gene",
)

In [20]:
# Number of entries per annotation label.
wago_1_dpf_3_null_specific["annotation"].value_counts()

gene           447
gene/repeat     13
repeat           9
Name: annotation, dtype: int64

In [21]:
# Label every entry; the first matching rule wins:
#   "gene/repeat"  Name is in either overlap list
#   "repeat"       Name does not start with "WBGene"
#   "gene"         everything else
wago_1["annotation"] = np.select(
    [
        wago_1["Name"].isin(genes_overlap_with_repeats)
        | wago_1["Name"].isin(repeats_overlap_with_genes),
        ~wago_1["Name"].str.startswith("WBGene"),
    ],
    ["gene/repeat", "repeat"],
    default="gene",
)

In [22]:
# Number of entries per annotation label.
wago_1["annotation"].value_counts()

gene           2742
repeat          329
gene/repeat     205
Name: annotation, dtype: int64

# Annotation statistics for csr-1

In [23]:
# Label every entry; the first matching rule wins:
#   "gene/repeat"  Name is in either overlap list
#   "repeat"       Name does not start with "WBGene"
#   "gene"         everything else
csr_1_ip["annotation"] = np.select(
    [
        csr_1_ip["Name"].isin(genes_overlap_with_repeats)
        | csr_1_ip["Name"].isin(repeats_overlap_with_genes),
        ~csr_1_ip["Name"].str.startswith("WBGene"),
    ],
    ["gene/repeat", "repeat"],
    default="gene",
)

In [24]:
# Number of entries per annotation label.
csr_1_ip["annotation"].value_counts()

gene           3910
gene/repeat     156
repeat           56
Name: annotation, dtype: int64