In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import pybedtools
from itertools import combinations
import statistics
import seaborn as sns
import seaborn as sns
import matplotlib.pyplot as plt
import statistics


In [2]:
# Chromosome lengths (in bases) for human GRCh37, exported from the NCBI GRC page:
# https://www.ncbi.nlm.nih.gov/grc/human/data?asm=GRCh37
chrom_df = pd.read_csv("/Users/eugeniaampofo/Downloads/Downloads/Sprime_files/Jupyter_Vis/Jupyter/Human_GRCH37_Chromosome_bases_lenghts.csv").rename(
    columns={'Chromosome': 'CHROM', "Total length (bp)": "Total_len_bp"}
)
# Focus only on autosomal chromosomes. `.copy()` detaches the filtered rows from
# the original frame so the column assignment below does not write to a slice.
chrom_df = chrom_df.loc[(chrom_df["CHROM"] != "X") & (chrom_df["CHROM"] != "Y")].copy()
# Lengths are stored as strings with thousands separators; strip the commas and
# convert to a fixed 64-bit integer so later sums do not depend on the platform's default int width.
chrom_df["Total_len_bp"] = chrom_df["Total_len_bp"].str.replace(',', '').astype("int64")
chrom_df

FileNotFoundError: [Errno 2] No such file or directory: '/Users/eugeniaampofo/Downloads/Downloads/Sprime_files/Jupyter_Vis/Jupyter/Human_GRCH37_Chromosome_bases_lenghts.csv'

In [None]:
# One percent of the summed chromosome lengths: dividing a length in bases by
# this value yields a genome proportion expressed in percent.
total_len = chrom_df["Total_len_bp"].sum() / 100

In [None]:
sprime_files = "/Users/eugeniaampofo/Downloads/Downloads/Sprime_files/Sprime_res/"  # location of Sprime outputs

# A population name is whatever remains of an archive's file name once the
# "_sprime_results.tar.gz" suffix is removed; collect the distinct names, sorted.
populations = sorted(
    {
        entry.replace("_sprime_results.tar.gz", "")
        for entry in os.listdir(sprime_files)
        if "_sprime_results.tar.gz" in entry
    }
)
populations

In [None]:

# Every unordered pair of populations, each stored as a sorted two-element list.
population_pairs = [sorted(pair) for pair in combinations(populations, 2)]
population_pairs


In [None]:
def sum_seg(df):
    """Return the total length of the intervals in ``df``.

    Each row contributes ``END - START``; the per-row lengths are added up.
    """
    ends = df["END"].to_numpy()
    starts = df["START"].to_numpy()
    return np.sum(ends - starts)

def sum_seg2(df):
    """Return the length of every segment in ``df`` as a 1-D NumPy array.

    Rows are visited grouped by their ``START`` value (in order of first
    appearance) and each row contributes ``END - START``.

    The previous version accumulated the per-group lengths in a list but then
    returned only the array for the last group, and raised UnboundLocalError
    when ``df`` had no rows. An empty frame now yields an empty array.
    """
    lens = []
    for s in df["START"].unique():
        rows = df.loc[df["START"] == s]
        lens.extend(rows["END"].to_numpy() - rows["START"].to_numpy())
    return np.array(lens)

In [None]:
sprime_files = "/Users/eugeniaampofo/Downloads/Downloads/Sprime_files/Sprime_res/mendeley_data/" # directory holding the per-chromosome Sprime output tables
segment_lens = {} # population -> summed length (bases) of merged segments over chromosomes 1-22
adjusted_lens = {} # population -> value shown in the summary table below (total for Papuans, median otherwise)
for pop in populations:
    seg_len = 0 # running total of merged segment length for this population, across all chromosomes
    seg_p = [] # values returned by sum_seg2 for each chromosome file, pooled for the median below
    for i in range(1,23):
        for file in os.listdir(sprime_files):
            # A file is used when its name contains both the population name and "chr<i>.".
            if pop in file and f"chr{i}." in file:
                df= pd.read_csv(sprime_files + file, sep="\t") 
                df["CHROM"] = df["CHROM"].astype(int)
                df["POS"] = df["POS"].astype(int)
                df["SEGMENT"] = df["SEGMENT"].astype(int)
                # NOTE(review): this sort of the unfiltered table is overwritten a few lines
                # below before it is ever read, so it only costs time.
                df_sorted = df.sort_values(by=["CHROM", "POS", "SEGMENT"], ascending=True)
                # Keep sites whose SCORE is at least 150000.
                df = df.loc[df["SCORE"] >= 150000] 
                # Keep sites where at least one of NMATCH / DMATCH is something other than "notcomp".
                df = df.loc[(df["NMATCH"] != "notcomp") | (df["DMATCH"] != "notcomp")]
                df_sorted = df.sort_values(by=["CHROM", "POS", "SEGMENT"], ascending=True)
                df2 = df_sorted[["CHROM", "POS", "SEGMENT"]]
                # Convert DataFrame to BedTool object
                bed_df = pybedtools.BedTool.from_dataframe(df2)
                # Group by column 3 (SEGMENT) and take the min and max of column 2 (POS)
                result = bed_df.groupby(g=3, c=[2], o=['min', 'max'])
                # Convert result to DataFrame
                result_df = result.to_dataframe(names=['SEGMENT', 'START', 'END'])
                # Collapse to exactly one row per SEGMENT (smallest START, largest END).
                # Presumably needed because the bedtools step can emit several rows for one
                # segment when its sites are not adjacent in the input -- TODO confirm.
                result_df = result_df.groupby('SEGMENT').agg({'START': 'min', 'END': 'max'}).reset_index()
                # Attach each segment's START/END to every site row of that segment
                merged_df = pd.merge(df_sorted, result_df, on='SEGMENT', how='left')
                # One row per distinct (CHROM, START, END) interval
                df1 = merged_df.drop_duplicates(subset=['CHROM', 'START', 'END'])
                # Build a BedTool from the intervals and run bedtools merge on them
                df5 = df1[['CHROM', 'START', 'END']]
                df2 = pybedtools.BedTool.from_dataframe(df5)
                df2 = df2.merge()
                df2 = df2.to_dataframe(names=['CHROM', 'START', 'END'])
                f = sum_seg(df2)
                f2 = sum_seg2(df2)
                seg_len += f
                seg_p.extend(f2)
    # NOTE(review): Papuans are reported as a total length while every other population is
    # reported as the median of the pooled sum_seg2 values, yet both end up in the same
    # "Genome covered ..." column below -- confirm that this mix is intended.
    if pop  == "Papuans":  
        adjusted_lens[pop] = int(seg_len)
    else:
        median = statistics.median(seg_p)
        adjusted_lens[pop] = int(median)
    segment_lens[pop] = seg_len
            
# Summary table: one row per population, largest value first.
df = pd.DataFrame.from_dict(adjusted_lens, orient='index', columns=['Genome covered by detected segments in bases'])
# total_len is the summed chromosome length divided by 100, so this ratio is a percentage.
df["Proportion with covered by detected segment (%)"] = df['Genome covered by detected segments in bases']/total_len 
df = df.sort_values(by="Genome covered by detected segments in bases", ascending=False)
df 
                   


In [None]:
 def add_on(df1, df2):
    df1['START'] = None
    df1['END'] = None
    # print(df1)
    # print("dfldjfkj")
    # Iterate over each row in df1
    start_end_list = list(zip(df2['START'], df2['END']))
    match_found = False

    for index1, row1 in df1.iterrows():
        # Iterate over each row in df2
        for start, end in start_end_list:
            # Check if POS falls between START and END
            if start <= row1['POS'] <= end:
                # Set START and END values from df2 to df1
                df1.at[index1, 'START'] = start
                df1.at[index1, 'END'] = end
                break  # Break the inner loop if condition is satisfied
# def add_on2(df1,df2):
#     df1['START'] = None
#     df1['END'] = None
#     # print(df1)
#     # print("dfldjfkj")
#     # Iterate over each row in df1
#     start_end_list = list(zip(df2['START'], df2['END']))
#     match_found = False
#     indices_to_delete = []
#     for index1, row1 in df1.iterrows():
#         # Iterate over each row in df2
#         for start, end in start_end_list:
#             # Check if POS falls between START and END
#             if row2['START'] <= row1['POS'] <= row2['END']:
#                 # Set START and END values from df2 to df1
#                 df1.at[index1, 'START'] = row2['START']
#                 df1.at[index1, 'END'] = row2['END']
#                 match_found = True
#                 break  # Break the inner loop if condition is satisfied
#         if not match_found:
#             indices_to_delete.append(index1)

#     df1 = df1.drop(indices_to_delete)
           


In [None]:
# chromosome number -> {population -> per-site table annotated with merged segment bounds}
pop_chrom_dict = {}
for i in range(1,23):
    pop_chrom_dict[i] = {}
    for pop in populations:
        for file in os.listdir(sprime_files):
            # A file is used when its name contains both the population name and "chr<i>.".
            if pop in file and f"chr{i}." in file:
                df= pd.read_csv(sprime_files + file, sep="\t")
                df["CHROM"] = df["CHROM"].astype(int)
                df["POS"] = df["POS"].astype(int)
                df["SEGMENT"] = df["SEGMENT"].astype(int)
                # Keep sites whose SCORE is at least 150000.
                df = df.loc[df["SCORE"] >= 150000]
                # Keep sites where at least one of NMATCH / DMATCH is something other than "notcomp".
                df = df.loc[(df["NMATCH"] != "notcomp") | (df["DMATCH"] != "notcomp")]
                df_sorted = df.sort_values(by=["CHROM", "POS", "SEGMENT"], ascending=True)
                df2 = df_sorted[["CHROM", "POS", "SEGMENT"]]
                # Convert DataFrame to BedTool object
                bed_df = pybedtools.BedTool.from_dataframe(df2)
                # Group by column 3 (SEGMENT) and take the min and max of column 2 (POS)
                result = bed_df.groupby(g=3, c=[2], o=['min', 'max'])
                # Convert result to DataFrame
                result_df = result.to_dataframe(names=['SEGMENT', 'START', 'END'])
                # Collapse to exactly one row per SEGMENT (smallest START, largest END)
                result_df = result_df.groupby('SEGMENT').agg({'START': 'min', 'END': 'max'}).reset_index()
                # Attach each segment's START/END to every site row of that segment
                merged_df = pd.merge(df_sorted, result_df, on='SEGMENT', how='left')
                # One row per distinct (CHROM, START, END) interval
                df1 = merged_df.drop_duplicates(subset=['CHROM', 'START', 'END'])
                # Build a BedTool from the intervals and run bedtools merge on them
                df5 = df1[['CHROM', 'START', 'END']]
                df2 = pybedtools.BedTool.from_dataframe(df5)
                df2 = df2.merge()
                df2 = df2.to_dataframe(names=['CHROM', 'START', 'END'])
                # Overwrite the per-segment START/END on each site with the bounds of the
                # merged interval that contains the site (add_on edits merged_df in place).
                add_on(merged_df, df2)
                pop_chrom_dict[i][pop] = merged_df
           


In [None]:
# Sanity check: which populations have a table stored for chromosome 1.
pop_chrom_dict[1].keys()

In [None]:
# Inspect the annotated chromosome-1 site table stored for the "Papuans" key.
pop_chrom_dict[1]["Papuans"]

In [None]:
# k =pop_chrom_dict[1]["Papuans"]
# k = k.loc[k["DMATCH"] == "notcomp"]
# k

In [None]:
# population -> one genome-wide site table (all chromosomes stacked, ordered by CHROM then POS)
concatenated_dict = {}

# Every population name that appears under at least one chromosome.
all_names = {pop_name for per_chrom in pop_chrom_dict.values() for pop_name in per_chrom}

for name in all_names:
    # Gather this population's per-chromosome tables, skipping chromosomes where it is absent.
    per_chrom_frames = [per_chrom[name] for per_chrom in pop_chrom_dict.values() if name in per_chrom]
    stacked = pd.concat(per_chrom_frames, ignore_index=True)
    concatenated_dict[name] = stacked.sort_values(by=["CHROM", "POS"], ascending=True)

concatenated_dict["CLM"]

In [None]:
# Inspect the genome-wide table stored under "PUR".
# NOTE(review): print() shows the plain-text repr; ending the cell with `k` would give the rich table.
k = concatenated_dict["PUR"]
print(k)
# print(k.SEGMENT.unique())
# print(sorted(k.START.unique()))
# print(len(set(sorted(k.END.unique()))))

In [None]:
def add_on2(df1,df2):
    """Tag rows of ``df1`` with the containing ``df2`` interval and return the tagged rows.

    ``df1`` is modified in place: ``START`` and ``END`` are reset to ``None`` and
    then filled with the bounds of the first interval of ``df2`` (in ``df2`` row
    order) for which ``START <= POS <= END``. When both frames carry a ``CHROM``
    column, an interval is only considered for rows on the same chromosome
    (compared by string form); otherwise matching is by position alone.

    Fixes over the previous version:
      * the "matched" state is now tracked per row (it used to stay True after
        the first hit, so later unmatched rows were never collected);
      * the filtered frame is returned (the old ``df1 = df1.drop(...)`` only
        rebound a local name and had no effect for the caller);
      * positions are no longer matched against intervals of other chromosomes.

    Parameters:
        df1: DataFrame with ``POS`` (and optionally ``CHROM``); edited in place.
        df2: DataFrame with ``START`` / ``END`` (and optionally ``CHROM``).

    Returns:
        A new DataFrame holding only the rows of ``df1`` that fell inside an
        interval. ``df1`` itself keeps all of its rows.
    """
    df1['START'] = None
    df1['END'] = None

    match_chrom = 'CHROM' in df1.columns and 'CHROM' in df2.columns

    # Bucket the intervals by chromosome so each row is only compared against
    # candidates on its own chromosome (a single None bucket when CHROM is absent).
    intervals_by_chrom = {}
    df2_chroms = df2['CHROM'].astype(str) if match_chrom else [None] * len(df2)
    for chrom, start, end in zip(df2_chroms, df2['START'], df2['END']):
        intervals_by_chrom.setdefault(chrom, []).append((start, end))

    indices_to_delete = []
    df1_chroms = df1['CHROM'].astype(str) if match_chrom else [None] * len(df1)
    for (index1, pos), chrom in zip(df1['POS'].items(), df1_chroms):
        for start, end in intervals_by_chrom.get(chrom, ()):
            if start <= pos <= end:
                df1.at[index1, 'START'] = start
                df1.at[index1, 'END'] = end
                break  # the first containing interval wins
        else:
            # No interval contained this position.
            indices_to_delete.append(index1)

    return df1.drop(indices_to_delete)

In [None]:
# "POPA-POPB" -> DataFrame of genomic intervals covered by segments of both populations
ov_dict = {}
population_pairs_tuples = [tuple(pair) for pair in population_pairs]

# (POPA, POPB) -> extra tables; only filled for pairs drawn entirely from `lst` below
ov_dict_more_info = {k: {} for k in population_pairs_tuples}
# Populations of interest for the detailed per-pair tables.
lst = ["Papuans", "PEL", "MXL", "CLM"]

for pop in populations:
    count = 0  # number of pairs newly processed for this population (not read afterwards)
    # All pairs that include the current population.
    k = [x for x in population_pairs if pop in x]
    df11 = concatenated_dict[pop]
    # Reduce this population's sites to its distinct (CHROM, START, END) intervals.
    df1 = df11.sort_values(by=["CHROM", "START"], ascending=True)
    df1 = df1.drop_duplicates(subset=['CHROM', 'START', 'END'])
    df1 = df1[["CHROM", "START", "END"]]
    df1 = pybedtools.BedTool.from_dataframe(df1)
    for q in k:
        ov_name = "-".join(q)
        # Each pair shows up once per member; compute its overlap only the first time.
        if ov_name not in ov_dict.keys():
            # The other member of the pair.
            pair = q.copy()
            pair.remove(pop)
            df22 = concatenated_dict[pair[0]]
            df2 =df22.drop_duplicates(subset=['CHROM', 'START', 'END'])
            df2 = df2.sort_values(by=["CHROM", "START"], ascending=True)
            df2 = df2[["CHROM", "START", "END"]]
            df2 = pybedtools.BedTool.from_dataframe(df2)
            # Intersect the two interval sets, then sort and merge the result.
            intersected_bed= df1.intersect(df2).sort().merge()
            ov_df = intersected_bed.to_dataframe(names=['CHROM', 'START', 'END'])
            ov_dict[ov_name] = ov_df.sort_values(by=["CHROM", "START"], ascending=True)
            if pop in lst and pair[0] in lst:
                # Stored by reference: these are the same DataFrame objects held in
                # concatenated_dict, so in-place edits made later are visible in both places.
                ov_dict_more_info[tuple(q)][pop] = df11
                ov_dict_more_info[tuple(q)][pair[0]] = df22
                ov_dict_more_info[tuple(q)][ov_name] = ov_df
            count += 1
            del df2 
# Spot check of one pair; raises KeyError if "CHB" or "ITU" is not among the populations.
ov_dict['CHB-ITU']


In [None]:
# All population pairs (as tuples) that have an entry in ov_dict_more_info.
ov_dict_more_info.keys()


In [None]:
# Inspect what is stored for the CLM / Papuans pair.
ov_dict_more_info[("CLM", "Papuans")]

In [None]:
def add_on2(df1,df2):
    """Tag rows of ``df1`` with the containing ``df2`` interval and return the complete rows.

    ``df1`` is modified in place: ``START`` and ``END`` are reset to ``None`` and
    then filled with the bounds of the first interval of ``df2`` (in ``df2`` row
    order) for which ``START <= POS <= END``. When both frames carry a ``CHROM``
    column, an interval is only considered for rows on the same chromosome
    (compared by string form); otherwise matching is by position alone.

    Fixes over the previous version:
      * positions are no longer matched against intervals that lie on a
        different chromosome (the frames passed in are genome-wide);
      * the ``dropna()`` result is returned instead of being assigned to a
        local name and thrown away;
      * the unused ``match_found`` / ``indices_to_delete`` bookkeeping is gone.

    Parameters:
        df1: DataFrame with ``POS`` (and optionally ``CHROM``); edited in place.
        df2: DataFrame with ``START`` / ``END`` (and optionally ``CHROM``).

    Returns:
        ``df1.dropna()`` -- a new DataFrame without the rows that contain a
        missing value in any column. ``df1`` itself keeps all of its rows, so
        existing callers that ignore the return value behave as before.
    """
    df1['START'] = None
    df1['END'] = None

    match_chrom = 'CHROM' in df1.columns and 'CHROM' in df2.columns

    # Bucket the intervals by chromosome so each row is only compared against
    # candidates on its own chromosome (a single None bucket when CHROM is absent).
    intervals_by_chrom = {}
    df2_chroms = df2['CHROM'].astype(str) if match_chrom else [None] * len(df2)
    for chrom, start, end in zip(df2_chroms, df2['START'], df2['END']):
        intervals_by_chrom.setdefault(chrom, []).append((start, end))

    df1_chroms = df1['CHROM'].astype(str) if match_chrom else [None] * len(df1)
    for (index1, pos), chrom in zip(df1['POS'].items(), df1_chroms):
        for start, end in intervals_by_chrom.get(chrom, ()):
            if start <= pos <= end:
                df1.at[index1, 'START'] = start
                df1.at[index1, 'END'] = end
                break  # the first containing interval wins

    return df1.dropna()

In [None]:
# Worked example for one pair: tag the CLM and Papuans site tables with the
# CLM-Papuans overlap intervals.
# NOTE: add_on2 rewrites START/END on the frames it is given, and these frames are
# the same objects stored in ov_dict_more_info, so the stored tables change too.
df1 = ov_dict_more_info[("CLM", "Papuans")]["CLM"]
df2 = ov_dict_more_info[("CLM", "Papuans")]["Papuans"]
ov = ov_dict_more_info[("CLM", "Papuans")]["CLM-Papuans"]
add_on2(df1, ov)
add_on2(df2, ov)
df1

In [None]:
# Drop every row that has a missing value in any column (this includes sites
# that were not assigned a START/END).
df1 = df1.dropna()  
df1

In [None]:
# Same clean-up for the second table: drop rows with a missing value in any column.
df2 = df2.dropna()    
df2

In [None]:
# Pairs (as tuples) in which both members belong to `lst`, the populations of interest.
population_p2 = [tuple(pair) for pair in population_pairs if pair[0] in lst and pair[1] in lst]
population_p2

In [None]:
def addy(k):
    """Build the combined, de-duplicated site table for the population pair ``k``.

    ``k`` is a two-element tuple of population names used as a key into
    ``ov_dict_more_info``. Both populations' site tables are tagged with the
    pair's overlap intervals via ``add_on2`` (which edits them in place), rows
    containing missing values are dropped from each, and the two results are
    stacked with exact duplicate rows removed.
    """
    pair_tables = ov_dict_more_info[k]
    first_sites = pair_tables[k[0]]
    second_sites = pair_tables[k[1]]
    overlap_intervals = pair_tables['-'.join(k)]

    add_on2(first_sites, overlap_intervals)
    add_on2(second_sites, overlap_intervals)

    first_tagged = first_sites.dropna()
    second_tagged = second_sites.dropna()
    return pd.concat([first_tagged, second_tagged]).drop_duplicates()

# pair tuple -> combined table of both populations' sites that fall in the pair's overlaps
concati = {pair: addy(pair) for pair in population_p2}

In [None]:
def calc(h):
    """Return the pooled (Neanderthal, Denisovan) match proportions for pair ``h``.

    Over all rows of ``concati[h]``, each proportion is the number of sites
    labelled 'match' divided by the number labelled 'match' or 'mismatch' in
    the NMATCH and DMATCH columns respectively.
    """
    sites = concati[h]
    n_calls = sites['NMATCH']
    d_calls = sites['DMATCH']

    n_hits, d_hits = (n_calls == 'match').sum(), (d_calls == 'match').sum()
    n_misses, d_misses = (n_calls == 'mismatch').sum(), (d_calls == 'mismatch').sum()

    return n_hits / (n_hits + n_misses), d_hits / (d_hits + d_misses)

def calc2(h):
    """Return per-segment match proportions for pair ``h`` as two parallel lists.

    Rows of ``concati[h]`` are split by CHROM and, within each chromosome, by
    START. For every such group the Neanderthal and Denisovan proportions are
    'match' / ('match' + 'mismatch') on NMATCH and DMATCH. Groups are visited in
    order of first appearance of the chromosome, then of the START value.
    """
    sites = concati[h]
    Nean, Deni = [], []
    for chrom in sites.CHROM.unique():
        on_chrom = sites.loc[sites["CHROM"] == chrom]
        for seg_start in on_chrom["START"].unique():
            segment = on_chrom.loc[on_chrom["START"] == seg_start]
            n_hits = segment['NMATCH'].eq('match').sum()
            d_hits = segment['DMATCH'].eq('match').sum()
            n_misses = segment['NMATCH'].eq('mismatch').sum()
            d_misses = segment['DMATCH'].eq('mismatch').sum()
            Nean.append(n_hits / (n_hits + n_misses))
            Deni.append(d_hits / (d_hits + d_misses))
    return Nean, Deni

        
# Evaluate each pair exactly once; the earlier version called calc() and calc2()
# twice per pair just to pick one element of the returned tuple each time.
pooled = [calc(pair) for pair in population_p2]
nean = [n_prop for n_prop, _ in pooled]  # pooled Neanderthal proportion per pair
deni = [d_prop for _, d_prop in pooled]  # pooled Denisovan proportion per pair

per_segment = {pair: calc2(pair) for pair in population_p2}
nean_d = {pair: props[0] for pair, props in per_segment.items()}  # pair -> per-segment Neanderthal proportions
deni_d = {pair: props[1] for pair, props in per_segment.items()}  # pair -> per-segment Denisovan proportions
population_p2


In [None]:
# Pooled Neanderthal match proportion for each pair, in population_p2 order.
nean

In [None]:
# Pooled Denisovan match proportion for each pair, in population_p2 order.
deni

In [None]:
# Scratch helper: densify a list of tick positions by inserting the midpoint
# between every pair of neighbouring values.
original_list = [0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29000000000000004, 0.3, 0.31, 0.32]

doubled_list = [original_list[0]]
for lower, upper in zip(original_list, original_list[1:]):
    # lower + half the gap is the midpoint; the upper neighbour follows it
    doubled_list.extend([lower + (upper - lower) / 2, upper])

print(doubled_list)



In [None]:

# Scatter of pooled Neanderthal vs Denisovan proportions, one point per pair of interest.
names = population_p2
print(names)

plt.figure(figsize=(8, 5))
plt.scatter(nean, deni)

# Label every point with its population pair.
for pair_label, nean_prop, deni_prop in zip(names, nean, deni):
    plt.annotate(pair_label, (nean_prop, deni_prop), textcoords="offset points", xytext=(2, 2), ha='center')

# Start from the ticks matplotlib picked automatically and append one extra
# tick on each axis (0.02 past 0.7 on x, 0.01 past 0.31 on y).
current_xticks = plt.xticks()[0]
current_yticks = plt.yticks()[0]
extra_xtick = 0.02 * 1 + 0.7
extra_ytick = 0.01 * 1 + 0.31
plt.xticks(list(current_xticks) + [extra_xtick])
plt.yticks(list(current_yticks) + [extra_ytick])

plt.grid(True)
plt.title('Neanderthal vs Denisovan Proportions with amongst population pairs of interest')
plt.show()


In [None]:


# One 2-D KDE panel per population pair: per-segment Neanderthal proportion (x)
# against per-segment Denisovan proportion (y).
fig, axs = plt.subplots(2, 3, figsize=(15, 10))

# Pair each axis with a population pair. zip() stops at the shorter sequence, so
# having fewer than six pairs leaves the spare panels empty instead of raising
# IndexError (the earlier version indexed population_p2 by axis number).
for ax, pair in zip(axs.flatten(), population_p2):
    sns.kdeplot(x=nean_d[pair], y=deni_d[pair], cbar=True, cmap='Blues', n_levels=22, fill=True, thresh=0, ax=ax)
    ax.set_title(f'{"-".join(pair)} proportion')
    ax.set_xlabel('Neanderthal proportion')
    ax.set_ylabel('Denisovan proportion')

plt.tight_layout()
plt.show()


In [None]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Sample data for neanderthal and denisovan
# nean = [0.1, 0.3, 0.4, 0.5, 0.6]
# deni = [0.2, 0.3, 0.4, 0.5, 0.7]

# plt.figure(figsize=(8, 5))
# sns.set_style("white")
# sns.set(rc={"lines.linewidth": 0.2})
# plt.ylim(0, 1)
# plt.xlim(0, 1)
# plt.ylabel("Denisovan", fontsize=14)
# plt.xlabel("Neanderthal", fontsize=14)
# plt.title("Neanderthal Denisovan Introgressed \nRegions for overlaps", fontsize=16, weight='bold')

# sns.kdeplot(x=nean, y=deni, linestyles=":", n_levels=22, alpha=0.6)
# sns.kdeplot(x=nean, y=deni, cbar=True, cmap='Blues', n_levels=3, shade=True, thresh=0, cbar_kws={'label': 'Density', 'orientation': 'vertical'})

# plt.show()


In [None]:
# Styling experiment for the 2-D KDE, drawn from made-up proportions.
# The toy values get their own names: this cell used to assign them to `nean` and
# `deni`, silently replacing the real per-pair results computed earlier.
# (seaborn and matplotlib are already imported in the first cell.)
sample_nean = [0.1, 0.3, 0.4, 0.5, 0.6]
sample_deni = [0.2, 0.3, 0.4, 0.5, 0.7]

plt.figure(figsize=(8, 5))
sns.set_style("white")
# NOTE(review): sns.set() applies seaborn's default theme as well as the rc override,
# which likely undoes the "white" style chosen on the previous line -- confirm which look is wanted.
sns.set(rc={"lines.linewidth": 0.2})
plt.ylim(0, 1)
plt.xlim(0, 1)
plt.ylabel("Denisovan", fontsize=14)
plt.xlabel("Neanderthal", fontsize=14)
plt.title("Neanderthal Denisovan Introgressed \nRegions for overlaps", fontsize=16, weight='bold')

# Dotted, semi-transparent contour lines in the 'Blues' colormap (no colour bar).
kde = sns.kdeplot(x=sample_nean, y=sample_deni, linestyles=":",  alpha=0.6, cmap='Blues')

plt.show()


In [None]:
# Pairwise sharing matrices, indexed [population, other population]:
#   ov_chrom_df          -> shared length divided by the row population's own segment length
#   ov_chrom_df_overlaps -> shared length in bases
ov_chrom_df = pd.DataFrame(columns=populations, index=populations)
ov_chrom_df_overlaps = pd.DataFrame(columns=populations, index=populations)
for pop in populations:
    ov_chrom_df.at[pop,pop] = 1  # a population shares all of its segments with itself
    k = [list(x) for x in population_pairs if pop in x]
    sum_pop = segment_lens[pop]
    for ele in k:
        ov_name = "-".join(ele)
        # The other member of the pair.
        pair = ele.copy()
        pair.remove(pop)
        df_both = ov_dict[ov_name]
        df_both = df_both.drop_duplicates(subset=['CHROM', 'START', 'END'])
        sum_b = sum_seg(df_both)
        ov_chrom_df.at[pop, pair[0]] = sum_b/sum_pop
        # Use .at to write a single cell. The earlier plain-bracket assignment
        # `frame[pop, other] = value` added a brand-new column named by the tuple
        # instead of filling the [pop, other] cell.
        ov_chrom_df_overlaps.at[pop, pair[0]] = sum_b

In [None]:
# Work on a copy of the sharing-proportion matrix so the original stays untouched.
# NOTE(review): this reuses the name `df`, which earlier cells bind to other tables.
df = ov_chrom_df.copy()
df

In [None]:
# Flatten the matrix: (row population, column population) -> sharing proportion.
result_dict = {
    (row, col): df.at[row, col]
    for row in df.index
    for col in df.columns
}

In [None]:
# Remove entries whose proportion is exactly 1 (the self-pairs on the diagonal).
keys_to_remove = [key for key, value in result_dict.items() if value == 1]
for key in keys_to_remove:
    del result_dict[key]

x_values = []
y_values = []
labels = []

# Keep only ordered pairs whose members are both populations of interest.
lst = ["Papuans", "PEL", "MXL", "CLM"]
for key, value in result_dict.items():
    if key[0] in lst and key[1] in lst:
        # The same proportion is used on both axes, so points lie on the diagonal.
        x_values.append(value)
        y_values.append(value)
        labels.append(f"{key[0][:3]}-{key[1][:3]}")

plt.figure(figsize=(16, 12))

# One scatter call per point so each pair gets its own colour, plus a text label.
for labele, x, y in zip(labels, x_values, y_values):
    plt.scatter(x, y)
    plt.annotate(labele, (x, y), textcoords="offset points", xytext=(10, 10), ha='center')

# No plt.legend(): none of the points carries a label, so the call only produced a
# warning. The argument-less plt.xticks()/plt.yticks() calls changed nothing and
# were dropped, as was the repeated matplotlib import.
plt.xlabel('Sharing proportion')
plt.ylabel('Sharing proportion')

plt.grid(True)
plt.title('Plot showcasing pairwise sharing proportion in regards to putatively introgressed segments')
plt.show()


In [None]:
# Remove entries whose proportion is exactly 1 (the self-pairs on the diagonal).
keys_to_remove = [key for key, value in result_dict.items() if value == 1]
for key in keys_to_remove:
    del result_dict[key]

x_values = []
y_values = []
labels = []

# Keep only ordered pairs whose members are both populations of interest.
lst = ["Papuans", "PEL", "MXL", "CLM"]
for key, value in result_dict.items():
    if key[0] in lst and key[1] in lst:
        x_values.append(key[0])  # categorical x axis: the row population
        y_values.append(value)
        labels.append(f"{key[0][:3]}-{key[1][:3]}")

plt.figure(figsize=(16, 12))

# One scatter call per point so each pair gets its own colour, plus a text label.
for labele, x, y in zip(labels, x_values, y_values):
    plt.scatter(x, y)
    plt.annotate(labele, (x, y), textcoords="offset points", xytext=(10, 10), ha='center')

# No plt.legend(): none of the points carries a label, so the call only produced a
# warning. The argument-less plt.xticks() call changed nothing and was dropped,
# as was the repeated matplotlib import.
plt.xlabel('Populations')
plt.ylabel('Sharing proportion')

# Replace the automatic y ticks with twice as many, evenly spaced between the
# smallest and largest plotted value (both shifted up by 0.0005).
current_yticks = plt.yticks()[0]
min_value = np.min(y_values) + 0.0005
max_value = np.max(y_values)+0.0005
new_list = np.linspace(min_value, max_value, len(current_yticks) * 2)
plt.yticks(new_list)

plt.grid(True)
plt.title('Plot showcasing pairwise sharing proportion in regards to putatively introgressed segments')
plt.show()


In [None]:
# Remove entries whose proportion is exactly 1 (the self-pairs on the diagonal).
keys_to_remove = [key for key, value in result_dict.items() if value == 1]
for key in keys_to_remove:
    del result_dict[key]

x_values = []
y_values = []
labels = []

# Keep ordered pairs in which at least one member is a population of interest.
lst = ["Papuans", "PEL", "MXL", "CLM"]
for key, value in result_dict.items():
    if key[0] in lst or key[1] in lst:
        # The same proportion is used on both axes, so points lie on the diagonal.
        x_values.append(value)
        y_values.append(value)
        labels.append(f"{key[0][:3]}-{key[1][:3]}")

plt.figure(figsize=(16, 12))

# One scatter call per point so each pair gets its own colour, plus a text label.
for labele, x, y in zip(labels, x_values, y_values):
    plt.scatter(x, y)
    plt.annotate(labele, (x, y), textcoords="offset points", xytext=(10, 10), ha='center')

# No plt.legend(): none of the points carries a label, so the call only produced a
# warning. The repeated matplotlib import and the debug prints were dropped.
plt.xlabel('Sharing proportion')
plt.ylabel('Sharing proportion')

# Denser x ticks: evenly spaced from the smallest plotted value to the largest
# automatic tick. The tick count used to be len(original_list) * 2, borrowing the
# length (11) of an unrelated scratch list from another cell; it is now stated here.
N_XTICKS = 22
current_xticks = plt.xticks()[0]
min_value = np.min(x_values)
max_value = np.max(current_xticks)
new_xticks = np.linspace(min_value, max_value, N_XTICKS)
# NOTE(review): a rounded copy of these ticks used to be stored under the misspelt
# name `new_xtics` and was never applied; the unrounded ticks are what get drawn.
plt.xticks(ticks=new_xticks)

plt.grid(True)
plt.title('Plot showcasing pairwise sharing proportion in regards to putatively introgressed segments')
plt.show()


In [None]:
# Remove entries whose proportion is exactly 1 (the self-pairs on the diagonal).
keys_to_remove = [key for key, value in result_dict.items() if value == 1]
for key in keys_to_remove:
    del result_dict[key]

x_values = []
y_values = []
labels = []

# Every remaining ordered pair is plotted (no restriction to particular populations).
for key, value in result_dict.items():
    x_values.append(key[0])  # categorical x axis: the row population
    y_values.append(value)
    labels.append(f"{key[0][:3]}-{key[1][:3]}")

plt.figure(figsize=(16, 12))

# One scatter call per point so each pair gets its own colour, plus a text label.
for labele, x, y in zip(labels, x_values, y_values):
    plt.scatter(x, y)
    plt.annotate(labele, (x, y), textcoords="offset points", xytext=(10, 10), ha='center')

# No plt.legend(): none of the points carries a label, so the call only produced a
# warning. Also dropped: the repeated matplotlib import, a tick list that was
# computed but never applied, and argument-less plt.xticks()/plt.yticks() calls.
plt.xlabel('Populations')
plt.ylabel('Sharing proportion')

plt.grid(True)
plt.title('Plot showcasing pairwise sharing proportion in regards to putatively introgressed segments')
plt.show()


In [None]:
# NOTE(review): this cell repeats the earlier per-population plot for the four
# populations of interest; consider keeping only one of the two.

# Remove entries whose proportion is exactly 1 (the self-pairs on the diagonal).
keys_to_remove = [key for key, value in result_dict.items() if value == 1]
for key in keys_to_remove:
    del result_dict[key]

x_values = []
y_values = []
labels = []

# Keep only ordered pairs whose members are both populations of interest.
lst = ["Papuans", "PEL", "MXL", "CLM"]
for key, value in result_dict.items():
    if key[0] in lst and key[1] in lst:
        x_values.append(key[0])  # categorical x axis: the row population
        y_values.append(value)
        labels.append(f"{key[0][:3]}-{key[1][:3]}")

plt.figure(figsize=(16, 12))

# One scatter call per point so each pair gets its own colour, plus a text label.
for labele, x, y in zip(labels, x_values, y_values):
    plt.scatter(x, y)
    plt.annotate(labele, (x, y), textcoords="offset points", xytext=(10, 10), ha='center')

# No plt.legend(): none of the points carries a label, so the call only produced a
# warning. The argument-less plt.xticks() call changed nothing and was dropped,
# as was the repeated matplotlib import.
plt.xlabel('Populations')
plt.ylabel('Sharing proportion')

# Replace the automatic y ticks with twice as many, evenly spaced between the
# smallest and largest plotted value (both shifted up by 0.0005).
current_yticks = plt.yticks()[0]
min_value = np.min(y_values) + 0.0005
max_value = np.max(y_values)+0.0005
new_list = np.linspace(min_value, max_value, len(current_yticks) * 2)
plt.yticks(new_list)

plt.grid(True)
plt.title('Plot showcasing pairwise sharing proportion in regards to putatively introgressed segments')
plt.show()
