In [73]:
import os
import pandas as pd
from io import StringIO

In [74]:
def read_csv_with_errors(file_path):
    """Load a header-less CSV into a DataFrame, tolerating invalid UTF-8.

    The file is decoded as UTF-8 with ``errors='replace'``, so undecodable
    bytes become the Unicode replacement character instead of raising.
    The decoded text is then parsed by pandas with ``header=None``, meaning
    the first line is treated as data and columns are labelled 0, 1, ...

    Parameters
    ----------
    file_path : path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame parsed from the decoded file contents.
    """
    with open(file_path, 'r', encoding='utf-8', errors='replace') as handle:
        decoded_text = handle.read()

    # Parse from an in-memory buffer so pandas never sees the raw bytes.
    text_buffer = StringIO(decoded_text)
    return pd.read_csv(text_buffer, header=None)

def _unique_names(frame):
    """Return the distinct, non-empty, whitespace-stripped names in column 0."""
    # Drop missing cells first so NaN is never counted as a name, then cast to
    # str: `.str.strip()` silently turns non-string cells (e.g. a numeric-looking
    # name parsed as int) into NaN and fails outright on an all-numeric column.
    names = frame[0].dropna().astype(str).str.strip()
    return set(names[names != ""])


def calculate_overlap(file_path_1, file_path_2):
    """Compute the Jaccard similarity between the name lists of two CSV files.

    Names are taken from the first column of each file (loaded through
    ``read_csv_with_errors``), stripped of surrounding whitespace and
    de-duplicated. Missing and empty cells are ignored.

    Parameters
    ----------
    file_path_1, file_path_2 : paths of the two CSV files to compare.

    Returns
    -------
    tuple ``(percent_overlap, overlap_size, union_size)`` where
    ``overlap_size`` is |A ∩ B|, ``union_size`` is |A ∪ B| and
    ``percent_overlap`` is ``100 * |A ∩ B| / |A ∪ B|`` rounded to 5 decimals.
    When neither file contains a usable name the result is ``(0.0, 0, 0)``
    instead of a ZeroDivisionError.
    """
    names_1 = _unique_names(read_csv_with_errors(file_path_1))
    names_2 = _unique_names(read_csv_with_errors(file_path_2))

    # Jaccard similarity: |A ∩ B| / |A ∪ B|
    overlap_size = len(names_1 & names_2)
    union_size = len(names_1 | names_2)
    if union_size == 0:
        return 0.0, 0, 0

    normalized_overlap = overlap_size / union_size
    percent_overlap = round(normalized_overlap * 100, 5)

    return percent_overlap, overlap_size, union_size

In [75]:
def generate_df(folder_path):
    """Build a square table of pairwise overlaps for every CSV file in a folder.

    Parameters
    ----------
    folder_path : directory whose ``*.csv`` files are compared pairwise.

    Returns
    -------
    pandas.DataFrame whose index and columns are the file names without the
    ``.csv`` extension. Each off-diagonal cell holds the tuple returned by
    ``calculate_overlap`` for that pair; diagonal cells are set to ``None``.
    """
    suffix = ".csv"
    # os.listdir returns entries in arbitrary order; sort (case-insensitively,
    # with the raw name as tie-breaker) so the table layout is reproducible.
    file_names = sorted(
        (f for f in os.listdir(folder_path) if f.endswith(suffix)),
        key=lambda name: (name.lower(), name),
    )
    overlap_df = pd.DataFrame(index=file_names, columns=file_names)

    for i, file_1 in enumerate(file_names):
        file_path_1 = os.path.join(folder_path, file_1)
        overlap_df.at[file_1, file_1] = None

        # The overlap is symmetric, so compute each unordered pair once and
        # mirror it instead of reading both files a second time.
        for file_2 in file_names[i + 1:]:
            file_path_2 = os.path.join(folder_path, file_2)
            overlap = calculate_overlap(file_path_1, file_path_2)
            overlap_df.at[file_1, file_2] = overlap
            overlap_df.at[file_2, file_1] = overlap

    # Strip only the trailing extension; str.replace would also delete any
    # ".csv" that appears in the middle of a name.
    labels = [name[:-len(suffix)] for name in file_names]
    overlap_df.index = labels
    overlap_df.columns = labels

    return overlap_df

In [76]:
# Build the pairwise overlap table for every .csv file in the "Channels" folder
# (path is relative to the notebook's working directory).
df1 = generate_df("Channels")

In [77]:
# Display the table; each cell is a (percent_overlap, overlap_size, union_size) tuple.
df1

Unnamed: 0,AltonBrown,AvoidingthePuddle,CalebCity,CGPGrey,IHincognitoMode,illusorywall,InternetHistorian,kiwami-japan,Malleo,NileRed,RedLetterMedia,sanago,SecondWindGroup,TechnologyConnections,theslowmoguys,tracklib,TYLER1LOL,videogamedunkey,Vsauce,ZullietheWitch
AltonBrown,,"(0.0, 0, 5288)","(0.03606, 15, 41597)","(0.12375, 83, 67072)","(0.05398, 46, 85213)","(0.04985, 8, 16048)","(0.04521, 50, 110591)","(0.03312, 38, 114717)","(0.02733, 2, 7317)","(0.04462, 51, 114304)","(0.0666, 144, 216219)","(0.01255, 4, 31884)","(0.12686, 8, 6306)","(0.27676, 197, 71180)","(0.17527, 86, 49067)","(0.0197, 1, 5077)","(0.01362, 1, 7340)","(0.05348, 32, 59836)","(0.05553, 33, 59427)","(0.07448, 14, 18798)"
AvoidingthePuddle,"(0.0, 0, 5288)",,"(0.07662, 30, 39152)","(0.00927, 6, 64719)","(0.05073, 42, 82787)","(0.18381, 25, 13601)","(0.03051, 33, 108178)","(0.00801, 9, 112316)","(0.04092, 2, 4887)","(0.00625, 7, 111918)","(0.07251, 155, 213778)","(0.00339, 1, 29457)","(0.0773, 3, 3881)","(0.02031, 14, 68933)","(0.01498, 7, 46716)","(0.0, 0, 2648)","(0.12232, 6, 4905)","(0.15519, 89, 57349)","(0.01929, 11, 57019)","(0.28159, 46, 16336)"
CalebCity,"(0.03606, 15, 41597)","(0.07662, 30, 39152)",,"(0.41638, 419, 100630)","(0.69296, 820, 118333)","(0.38385, 191, 49759)","(0.62519, 898, 143637)","(0.27793, 412, 148237)","(0.10201, 42, 41171)","(0.6757, 995, 147254)","(0.20742, 518, 249739)","(0.1111, 73, 65709)","(0.0946, 38, 40170)","(0.31446, 330, 104941)","(0.45238, 374, 82673)","(0.04878, 19, 38953)","(0.12383, 51, 41184)","(1.1882, 1101, 92661)","(1.02372, 946, 92408)","(0.64543, 338, 52368)"
CGPGrey,"(0.12375, 83, 67072)","(0.00927, 6, 64719)","(0.41638, 419, 100630)",,"(2.01641, 2860, 141836)","(0.37495, 282, 75211)","(1.78643, 2985, 167093)","(0.43242, 750, 173442)","(0.20113, 134, 66622)","(0.88584, 1526, 172266)","(0.72641, 1989, 273811)","(0.06355, 58, 91267)","(0.21032, 138, 65613)","(2.62095, 3341, 127473)","(1.04967, 1128, 107462)","(0.0093, 6, 64509)","(0.02696, 18, 66760)","(0.70737, 838, 118467)","(1.02214, 1203, 117694)","(0.29866, 233, 78016)"
IHincognitoMode,"(0.05398, 46, 85213)","(0.05073, 42, 82787)","(0.69296, 820, 118333)","(2.01641, 2860, 141836)",,"(0.77956, 724, 92873)","(11.24761, 19026, 169156)","(0.72863, 1391, 190905)","(0.16998, 144, 84716)","(0.82649, 1573, 190323)","(2.54671, 7299, 286605)","(0.09879, 108, 109321)","(0.19955, 167, 83688)","(1.31303, 1930, 146988)","(0.77554, 975, 125719)","(0.01816, 15, 82604)","(0.07664, 65, 84817)","(2.22135, 2986, 134423)","(0.93716, 1272, 135729)","(0.76236, 729, 95624)"
illusorywall,"(0.04985, 8, 16048)","(0.18381, 25, 13601)","(0.38385, 191, 49759)","(0.37495, 282, 75211)","(0.77956, 724, 92873)",,"(0.62755, 742, 118237)","(0.14807, 182, 122911)","(0.37825, 59, 15598)","(0.26068, 319, 122374)","(0.44119, 987, 223714)","(0.0373, 15, 40211)","(0.43183, 63, 14589)","(0.52079, 413, 79302)","(0.31058, 178, 57313)","(0.02237, 3, 13413)","(0.04467, 7, 15672)","(0.886, 599, 67607)","(0.35079, 237, 67561)","(5.81083, 1491, 25659)"
InternetHistorian,"(0.04521, 50, 110591)","(0.03051, 33, 108178)","(0.62519, 898, 143637)","(1.78643, 2985, 167093)","(11.24761, 19026, 169156)","(0.62755, 742, 118237)",,"(0.747, 1614, 216064)","(0.14353, 158, 110084)","(0.94544, 2035, 215243)","(2.43704, 7596, 311690)","(0.10024, 135, 134676)","(0.14026, 153, 109084)","(1.28951, 2219, 172081)","(0.73126, 1104, 150972)","(0.01852, 20, 107981)","(0.06806, 75, 110189)","(2.10493, 3356, 159435)","(0.88407, 1423, 160960)","(0.65402, 791, 120944)"
kiwami-japan,"(0.03312, 38, 114717)","(0.00801, 9, 112316)","(0.27793, 412, 148237)","(0.43242, 750, 173442)","(0.72863, 1391, 190905)","(0.14807, 182, 122911)","(0.747, 1614, 216064)",,"(0.056, 64, 114292)","(0.67162, 1477, 219915)","(0.26881, 867, 322533)","(0.22581, 313, 138612)","(0.02206, 25, 113326)","(0.42949, 763, 177651)","(0.42371, 659, 155531)","(0.00446, 5, 112110)","(0.01224, 14, 114364)","(0.27456, 457, 166448)","(0.3653, 606, 165891)","(0.19905, 250, 125599)"
Malleo,"(0.02733, 2, 7317)","(0.04092, 2, 4887)","(0.10201, 42, 41171)","(0.20113, 134, 66622)","(0.16998, 144, 84716)","(0.37825, 59, 15598)","(0.14353, 158, 110084)","(0.056, 64, 114292)",,"(0.07465, 85, 113871)","(0.04772, 103, 215861)","(0.0, 0, 31489)","(0.0846, 5, 5910)","(0.20471, 145, 70833)","(0.08006, 39, 48715)","(0.04276, 2, 4677)","(0.04323, 3, 6939)","(0.16, 95, 59374)","(0.11018, 65, 58996)","(0.13051, 24, 18389)"
NileRed,"(0.04462, 51, 114304)","(0.00625, 7, 111918)","(0.6757, 995, 147254)","(0.88584, 1526, 172266)","(0.82649, 1573, 190323)","(0.26068, 319, 122374)","(0.94544, 2035, 215243)","(0.67162, 1477, 219915)","(0.07465, 85, 113871)",,"(0.36261, 1167, 321833)","(0.28306, 391, 138134)","(0.02834, 32, 112919)","(1.32451, 2327, 175687)","(1.49252, 2291, 153499)","(0.01253, 14, 111701)","(0.01667, 19, 113959)","(0.48703, 807, 165698)","(2.41838, 3922, 162175)","(0.25173, 315, 125134)"


In [83]:
def get_sorted_values(df):
    """Flatten a pairwise table into a descending list of unique pairs.

    Cells are visited row by row. A cell is kept when its value is not
    missing and neither (row label, column label) nor the reversed pair has
    been kept before, so a symmetric table contributes each pair only once.

    Parameters
    ----------
    df : pandas.DataFrame holding one value per (index label, column label).

    Returns
    -------
    list of ``(value, row_label, column_label)`` tuples ordered by ``value``
    from largest to smallest.
    """
    n_rows, n_cols = df.shape
    row_labels = [df.index[r] for r in range(n_rows)]
    col_labels = [df.columns[c] for c in range(n_cols)]

    seen_pairs = set()
    collected = []

    for r, row_label in enumerate(row_labels):
        for c, col_label in enumerate(col_labels):
            pair = (row_label, col_label)
            mirrored = (col_label, row_label)
            # Skip B+A when A+B has already been recorded.
            if pair in seen_pairs or mirrored in seen_pairs:
                continue

            cell = df.iat[r, c]
            if not pd.notna(cell):
                continue

            collected.append((cell, row_label, col_label))
            seen_pairs.add(pair)

    # Largest value first.
    collected.sort(key=lambda entry: entry[0], reverse=True)
    return collected

In [85]:
# Flatten the overlap table into one entry per channel pair, highest overlap first,
# then show the ten most-overlapping pairs.
df1_sorted = get_sorted_values(df1)
df1_sorted[:10]

[((11.24761, 19026, 169156), 'IHincognitoMode', 'InternetHistorian'),
 ((5.81083, 1491, 25659), 'illusorywall', 'ZullietheWitch'),
 ((2.65715, 2920, 109892), 'TechnologyConnections', 'theslowmoguys'),
 ((2.62095, 3341, 127473), 'CGPGrey', 'TechnologyConnections'),
 ((2.54671, 7299, 286605), 'IHincognitoMode', 'RedLetterMedia'),
 ((2.43704, 7596, 311690), 'InternetHistorian', 'RedLetterMedia'),
 ((2.41838, 3922, 162175), 'NileRed', 'Vsauce'),
 ((2.22135, 2986, 134423), 'IHincognitoMode', 'videogamedunkey'),
 ((2.10493, 3356, 159435), 'InternetHistorian', 'videogamedunkey'),
 ((2.01641, 2860, 141836), 'CGPGrey', 'IHincognitoMode')]

Jaccard Similarity\
J(A, B) = ∣A∩B∣ / ∣A∪B∣\
∣A∩B∣ = number of common names\
∣A∪B∣ = total number of unique names across both lists\