In [1]:
import pandas as pd
import requests
import pyarrow.feather as feather

In [2]:
articles_df = pd.read_feather('Data/dataframes/article_dataframe_embedding_distance.feather')
l_paths_df = pd.read_feather('Data/dataframes/length_filt_finished_paths.feather')
t_paths_df = pd.read_feather('Data/dataframes/time_filt_finished_paths.feather')

In [3]:
articles_df['embeddings'].shape

(4604,)

In [4]:
# check that all articles have embeddings
articles_df['embeddings'].value_counts().sum() == articles_df.shape[0]

np.True_

In [5]:
l_paths_df

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating,finished,failure_reason,start_article,target_article,identifier,full_path,simplified_path,distance,full_path_length,simplified_path_length
0,0935d79e398bbc10,1323114272,203,Theatre;India;Bay_of_Bengal;Indian_Ocean;Afric...,,True,,Theatre,Zebra,38072,"[Theatre, India, Bay_of_Bengal, Indian_Ocean, ...","[Theatre, India, Bay_of_Bengal, Indian_Ocean, ...",3.0,8,7
1,60c910147d51bc1a,1351315069,187,Brain;Computer_science;Internet;World_Wide_Web...,3.0,True,,Brain,Telephone,6500,"[Brain, Computer_science, Internet, World_Wide...","[Brain, Computer_science, Internet, World_Wide...",3.0,4,4
2,3a11233c52f824f8,1336283594,178,Family;United_States;Judaism;Olive_oil;Tree;Ap...,,True,,Family,Cider,13911,"[Family, United_States, Judaism, Olive_oil, Tr...","[Family, United_States, Judaism, Olive_oil, Tr...",3.0,6,6
3,71773902171ac6b7,1256870474,25,Luminiferous_aether;Special_relativity;Time;Clock,1.0,True,,Luminiferous_aether,Clock,23616,"[Luminiferous_aether, Special_relativity, Time...","[Luminiferous_aether, Special_relativity, Time...",3.0,3,3
4,77d69c7906320c92,1322615091,41,Corn_oil;Maize;Southern_Africa;Mayotte,2.0,True,,Corn_oil,Mayotte,10144,"[Corn_oil, Maize, Southern_Africa, Mayotte]","[Corn_oil, Maize, Southern_Africa, Mayotte]",3.0,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21641,5e5b3e43164d9fb9,1366728083,423,Fusarium;Cornea;Human;United_Nations;United_St...,3.0,True,,Fusarium,Tropical_Storm_Delta_(2005),15281,"[Fusarium, Cornea, Human, United_Nations, Unit...","[Fusarium, Cornea, Human, United_Nations, Unit...",6.0,8,8
21642,470fc8241582c7eb,1249681545,525,Scheme_programming_language;Functional_program...,5.0,True,,Scheme_programming_language,Tasmanian_Devil,33670,"[Scheme_programming_language, Functional_progr...","[Scheme_programming_language, Functional_progr...",6.0,13,10
21643,1e68f2ea6fb133d3,1357686675,186,"Horatio_Nelson,_1st_Viscount_Nelson;England;St...",3.0,True,,"Horatio_Nelson,_1st_Viscount_Nelson",Upsilon_Andromedae_d,18649,"[Horatio_Nelson,_1st_Viscount_Nelson, England,...","[Horatio_Nelson,_1st_Viscount_Nelson, England,...",6.0,6,6
21644,473d6ac602c2b198,1318968881,110,Scheme_programming_language;Functional_program...,,True,,Scheme_programming_language,Effects_of_global_warming,33669,"[Scheme_programming_language, Functional_progr...","[Scheme_programming_language, Functional_progr...",6.0,6,6


In [6]:
articles_df.head(1)

Unnamed: 0,article,article_unrendered_unicode,category,linkSource,linkTarget,distances,plain_text,incoming_links,num_hyperlinks,num_characters,hyperlink_density,embeddings,average_outgoing_article_title_cosine_distance
0,Áedán_mac_Gabráin,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,[subject.History.British_History.British_Histo...,Áedán_mac_Gabráin,"[Bede, Columba, Dál_Riata, Great_Britain, Irel...","{'10th_century': 3.0, '11th_century': 3.0, '12...",#copyright\n\nÁedán mac Gabráin\n\n2007 Sch...,0,11,11612,0.000947,"[-0.1292391121387481, 0.0236219353973865, -0.0...",0.785123


In [7]:
# 1. extract the simplified path and start article => new df of 2 columns 
# 2. a function that iterates over the simplified paths and extracts the corresponding vector embeddings from the articles_df
# 3. apply the function
# 4. compute the cosine similarty between all the articles and the target article (add a new column to the df (list of cosine similarities))
# 5. then iterate over the df to average the cosine similarities for each article

# what this can tell us? are the articles that get high scores in cosine similarity to the target on average the ones that have high scores?
# if so this might indicate theat these articles tend to appear in specific paths, or that the target articles favor certain topics...

In [8]:
# 1. 

df = l_paths_df[['simplified_path']]

df


Unnamed: 0,simplified_path
0,"[Theatre, India, Bay_of_Bengal, Indian_Ocean, ..."
1,"[Brain, Computer_science, Internet, World_Wide..."
2,"[Family, United_States, Judaism, Olive_oil, Tr..."
3,"[Luminiferous_aether, Special_relativity, Time..."
4,"[Corn_oil, Maize, Southern_Africa, Mayotte]"
...,...
21641,"[Fusarium, Cornea, Human, United_Nations, Unit..."
21642,"[Scheme_programming_language, Functional_progr..."
21643,"[Horatio_Nelson,_1st_Viscount_Nelson, England,..."
21644,"[Scheme_programming_language, Functional_progr..."


In [9]:
# 2. a function that iterates over the simplified paths and extracts the corresponding vector embeddings from the articles_df

def get_embeddings(simplified_path, articles_df):
    embeddings = articles_df[articles_df['article'].isin(simplified_path)]['embeddings'].tolist()
    
    # Only return embeddings if the number of embeddings matches the length of the simplified_path
    if len(embeddings) == len(simplified_path):
        return embeddings
    else:
        return None  # If the lengths don't match, return None (indicating invalid row)

# Apply the function to the DataFrame
df['embeddings'] = df['simplified_path'].apply(lambda x: get_embeddings(x, articles_df))

# Drop rows where embeddings are None (invalid rows)
df = df.dropna(subset=['embeddings'])

# Reset index after dropping rows (optional, depending on your preference)
df.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['embeddings'] = df['simplified_path'].apply(lambda x: get_embeddings(x, articles_df))


In [10]:
df.shape

(21535, 2)

In [11]:
l_paths_df.shape

(21646, 15)

In [12]:
# the target article is the last element in the simplified path
df = df.copy()
# Extract the target article and target embeddings
df['target_article'] = df['simplified_path'].apply(lambda x: x[-1])
df['target_embedding'] = df['embeddings'].apply(lambda x: x[-1])

# Remove the last element from simplified_path and embeddings
df['simplified_path'] = df['simplified_path'].apply(lambda x: x[1:-1])
df['embeddings'] = df['embeddings'].apply(lambda x: x[1:-1])

In [13]:
# get the shape of the first element of the embeddings column

df.head(5)

Unnamed: 0,simplified_path,embeddings,target_article,target_embedding
0,"[India, Bay_of_Bengal, Indian_Ocean, Africa, G...","[[-0.0125789958983659, 0.0561332367360591, -0....",Zebra,"[-0.1285928189754486, 0.0847383961081504, -0.0..."
1,"[Computer_science, Internet, World_Wide_Web]","[[-0.0637916252017021, -0.0175681598484516, -0...",Telephone,"[-0.0529951564967632, -0.06702421605587, -0.01..."
2,"[United_States, Judaism, Olive_oil, Tree, Apple]","[[-0.0554433651268482, 0.0141023732721805, -0....",Cider,"[0.0643062964081764, -0.0023302482441067, 0.00..."
3,"[Special_relativity, Time]","[[-0.0204615797847509, -0.0378306880593299, 0....",Clock,"[-0.074506863951683, 0.054837942123413, -0.021..."
4,"[Maize, Southern_Africa]","[[-0.0457388870418071, 0.0483880490064621, -0....",Mayotte,"[-0.0358180664479732, 0.0611802525818347, -0.0..."


In [14]:
df[~df.apply(lambda row: len(row['embeddings']) == len(row['simplified_path']), axis=1)]

Unnamed: 0,simplified_path,embeddings,target_article,target_embedding


In [15]:
# 3. compute the cosine similarty between all the articles and the target article (add a new column to the df (list of cosine similarities))

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

df2 = df.copy()

def get_cosine_similarities(target_embedding, embeddings):
    cos_dist_list = []
    for embedding in embeddings:
        cos_dist_list.append(cosine_similarity(np.array(target_embedding).reshape(1, -1), np.array(embedding).reshape(1, -1)))
    return cos_dist_list

df2['cosine_similarities'] = df2.apply(lambda x: get_cosine_similarities(x['target_embedding'], x['embeddings']), axis=1)

In [16]:
# 5. then iterate over the df to average the cosine similarities for each article
def calculate_avg_cosine_dist(df, count_cutoff=30, scaling=None):

    # Initialize an empty DataFrame to store results
    avg_cosine_dist_df = pd.DataFrame(columns=['article', 'n_appearances', 'avg_cosine_dist'])
    avg_cosine_dist_df.set_index('article', inplace=True)

    # Iterate through each row to calculate weights
    for _, row in df.iterrows():
        cosine_similarities = row['cosine_similarities']
        simplified_path = row['simplified_path']

        for idx, article in enumerate(simplified_path):
            if article not in avg_cosine_dist_df.index:
                avg_cosine_dist_df.loc[article] = [0, 0.0]

            # Update counts and weighted sums
            avg_cosine_dist_df.at[article, 'n_appearances'] += 1
            avg_cosine_dist_df.at[article, 'avg_cosine_dist'] += cosine_similarities[idx]

    # Calculate the weighted average by dividing weighted sum by counts
    avg_cosine_dist_df['avg_cosine_dist'] = avg_cosine_dist_df['avg_cosine_dist'] / avg_cosine_dist_df['n_appearances']

    # Filter out articles that appear less than the cutoff
    avg_cosine_dist_df = avg_cosine_dist_df[avg_cosine_dist_df['n_appearances'] >= count_cutoff]

    print(f"Number of unique articles after weighting: {avg_cosine_dist_df.shape[0]}")

    return avg_cosine_dist_df#.reset_index()

avg_cosine_dist_df = calculate_avg_cosine_dist(df2, count_cutoff=30)

Number of unique articles after weighting: 496


In [17]:
avg_cosine_dist_df.sort_values('avg_cosine_dist', ascending=False)

Unnamed: 0_level_0,n_appearances,avg_cosine_dist
article,Unnamed: 1_level_1,Unnamed: 2_level_1
Whale_shark,78.0,0.561491
Shark,94.0,0.523529
Whale,66.0,0.462966
Western_Roman_Empire,30.0,0.459886
Sputnik_1,42.0,0.452991
...,...,...
Space_Shuttle_Columbia,41.0,0.214021
Calcium,41.0,0.212621
17th_century,52.0,0.211912
Yellowstone_National_Park,33.0,0.210304


In [18]:
# suposedly faster alternative

def calculate_avg_cosine_dist(df, count_cutoff=30, scaling=None):
    from collections import defaultdict

    # Use dictionaries to aggregate counts and cosine sums
    article_counts = defaultdict(int)
    article_cosine_sums = defaultdict(float)

    # Iterate through each row to calculate weights
    for _, row in df.iterrows():
        cosine_similarities = row['cosine_similarities']
        simplified_path = row['simplified_path']

        for idx, article in enumerate(simplified_path):
            article_counts[article] += 1
            article_cosine_sums[article] += cosine_similarities[idx]

    # Create a DataFrame from the aggregated data
    avg_cosine_dist_df = pd.DataFrame({
        'article': list(article_counts.keys()),
        'n_appearances': list(article_counts.values()),
        'avg_cosine_dist': [article_cosine_sums[article] / article_counts[article] 
                            for article in article_counts.keys()]
    })

    # Filter out articles that appear less than the cutoff
    avg_cosine_dist_df = avg_cosine_dist_df[avg_cosine_dist_df['n_appearances'] >= count_cutoff]

    print(f"Number of unique articles after weighting: {avg_cosine_dist_df.shape[0]}")

    return avg_cosine_dist_df.reset_index(drop=True)


In [19]:
# a function that from an article name iterates over the paths_df and return all rows with the article name in the simplified path
def get_article_paths(article_name, paths_df):
    return paths_df[paths_df['simplified_path'].apply(lambda x: article_name in x)]

In [20]:
get_article_paths('Whale_shark', paths_df=l_paths_df).head(50)

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating,finished,failure_reason,start_article,target_article,identifier,full_path,simplified_path,distance,full_path_length,simplified_path_length
18,098c807976ebd156,1306774721,41,Blacktip_reef_shark;Shark;Whale_shark;Mexico;I...,,True,,Blacktip_reef_shark,Economics,5887,"[Blacktip_reef_shark, Shark, Whale_shark, Mexi...","[Blacktip_reef_shark, Shark, Whale_shark, Mexi...",3.0,5,5
95,0c013a70550e6011,1257357145,199,Computer;Great_Britain;Ice_age;Greenland;Arcti...,3.0,True,,Computer,Whale,9798,"[Computer, Great_Britain, Ice_age, Greenland, ...","[Computer, Great_Britain, Ice_age, Greenland, ...",3.0,9,9
104,5f85cf827a29ac5a,1352760115,161,Great_Salt_Lake;Fish;Whale_shark;Shark;Blue_shark,,True,,Great_Salt_Lake,Blue_shark,16899,"[Great_Salt_Lake, Fish, Whale_shark, Shark, Bl...","[Great_Salt_Lake, Fish, Whale_shark, Shark, Bl...",3.0,4,4
235,3c4bf61c4a447176,1345664543,10,Bird;Fish;Whale_shark;Basking_shark;Great_whit...,,True,,Bird,Great_white_shark,5634,"[Bird, Fish, Whale_shark, Basking_shark, Great...","[Bird, Fish, Whale_shark, Basking_shark, Great...",3.0,4,4
254,25ad32b66245e04b,1388605778,46,Flood;Sea;Ocean;Krill;Whale_shark;Shark;Jaws_(...,,True,,Flood,Jaws_(film),14534,"[Flood, Sea, Ocean, Krill, Whale_shark, Shark,...","[Flood, Sea, Ocean, Krill, Whale_shark, Shark,...",3.0,6,6
683,567951c61a5d5c91,1254226509,148,Pope_Pius_XII;Europe;Asia;Japan;Pacific_Ocean;...,,True,,Pope_Pius_XII,Whale,30739,"[Pope_Pius_XII, Europe, Asia, Japan, Pacific_O...","[Pope_Pius_XII, Europe, Asia, Japan, Fishing, ...",3.0,8,7
721,5164c8e415d73ec0,1349900154,72,Bird;Vertebrate;Whale;Dolphin;Fish;Whale_shark...,,True,,Bird,Great_white_shark,5634,"[Bird, Vertebrate, Whale, Dolphin, Fish, Whale...","[Bird, Vertebrate, Whale, Dolphin, Fish, Whale...",3.0,7,7
854,49ff80356d88b407,1351112498,109,Bird;Fish;Whale_shark;Shark;Great_white_shark,2.0,True,,Bird,Great_white_shark,5634,"[Bird, Fish, Whale_shark, Shark, Great_white_s...","[Bird, Fish, Whale_shark, Shark, Great_white_s...",3.0,4,4
1168,398239e42f686267,1248803537,88,Lake_Victoria;River;Ocean;Fish;Whale_shark;Sha...,1.0,True,,Lake_Victoria,Basking_shark,22249,"[Lake_Victoria, River, Ocean, Fish, Whale_shar...","[Lake_Victoria, River, Ocean, Fish, Whale_shar...",3.0,6,6
1186,4e5bc9b510f05358,1249094043,95,Styracosaurus;Fossil;Vertebrate;Fish;Whale_sha...,2.0,True,,Styracosaurus,Cookiecutter_shark,36260,"[Styracosaurus, Fossil, Vertebrate, Fish, Whal...","[Styracosaurus, Fossil, Vertebrate, Fish, Whal...",3.0,6,6
