In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import urllib.parse

In [4]:
import os, sys

# HTLM files URLs in a list 
wp_path = "/Users/ginevralarroux/Desktop/EPFL courses/Applied data analysis/ADA project/data/wpcd/wp/" # to adapt
directories = os.listdir(wp_path)
html_list = []
for dir in directories:
    local_path = wp_path + dir
    local_dir = os.listdir(local_path)
    for link in local_dir:
        if link[-3:]=='htm':
            html_list.append(dir+'/'+link)
html_list[:10]

['r/Royal_Marines.htm',
 'r/Recycling.htm',
 'r/Retreat_of_glaciers_since_1850.htm',
 'r/Remember_Girls_Like_Rajpura.htm',
 'r/Rutherfordium.htm',
 'r/Rule_of_St_Benedict.htm',
 'r/Red_dwarf.htm',
 'r/Romeo_and_Juliet.htm',
 'r/Rudyard_Kipling.htm',
 'r/Race.htm']

The aim of this specific game strategy analysis is to test whether Wikispeedia players have clikability preferences on the basis of the hyperlinks' positioning throughout the article text. 
In particular:
1. we wil find the ordinal number of the paragraph each hyperlink in an article shows up in and divide it by the total number of paragraphs of the article, to determine its positioning; 
2. we will determine if each hyperlink in an article shows up in an image captions.
We will then investigate if there is a correlation between the clickability of the hyperlinks and their location, using the proper statistical tests. 

It is therefore essential to extract from the data the number of clicks for each hyperlink.

In [5]:
# read files 
folder='/Users/ginevralarroux/Desktop/EPFL courses/Applied data analysis/ADA project/data/wikispeedia_paths-and-graph/'

paths_finished='paths_finished.tsv'
paths_unfinished='paths_unfinished.tsv'
articles='articles.tsv'

paths_finished=pd.read_csv(folder+paths_finished, sep='\t', skiprows=15, header=None, names=['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'rating'])
paths_unfinished=pd.read_csv(folder+paths_unfinished, sep='\t', skiprows=16, header=None, names=['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'target', 'type'])
articles=pd.read_csv(folder+articles, sep='\t', skiprows=12, header=None, names=['article'])

The dataframe `paths_finished` contains the paths chosen by the players who could reach the target article.

In [6]:
paths_finished.head()

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,
1,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0
2,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,
3,64dd5cd342e3780c,1265613925,37,14th_century;Renaissance;Ancient_Greece;Greece,
4,015245d773376aab,1366730828,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0


The dataframe `paths_unfinished` contains the paths chosen by the players who could not reach the target article.

In [7]:
paths_unfinished.head()

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,target,type
0,2426091a53125110,1297054935,1804,Obi-Wan_Kenobi,Microsoft,timeout
1,26141fd878806294,1297055651,1805,Julius_Caesar,Caracas,timeout
2,2b015fb8181c48f2,1297090819,1818,Malawi;Democracy;Alexander_the_Great,First_Crusade,timeout
3,53a53bc244e08a6a,1297094761,49,Paraguay,Mount_St._Helens,restart
4,53a53bc244e08a6a,1297099105,1808,Paraguay;Bolivia,Mount_St._Helens,timeout


The dataframe `articles` lists all the clickable articles/links.

In [8]:
# encode and filter link names
def filter_link_list(links_list):
    new_links_list=[]
    for link in links_list:
        link=urllib.parse.unquote(link)
        link=link.lower()
        link=link.replace('_', ' ')
        link=link.strip()
        new_links_list.append(link)
    return new_links_list

articles.article=filter_link_list(articles.article)
articles.head(10)

Unnamed: 0,article
0,áedán mac gabráin
1,åland
2,édouard manet
3,éire
4,óengus i of the picts
5,€2 commemorative coins
6,10th century
7,11th century
8,12th century
9,13th century


In [9]:
print('The total number of articles comprising the Wikispeedia library is {0}.'.format(len(articles)))

The total number of articles comprising the Wikispeedia library is 4604.


The dataframe `clicked_links_filtered` contains the clicked links troughout the paths.

In [11]:
clicked_links=[]
for x in paths_finished['path']: 
    path=(x.split(';')) # get list of links along the path string 
    clicked_links.append(path)

clicked_links_filtered=[] # consistent with other dfs
for path in clicked_links:
    path_filtered=filter_link_list(path)
    if '<' in path: 
        path.remove('<') # remove backclicks
    clicked_links_filtered.append(path_filtered)

clicked_links_filtered[:5]

[['14th century',
  '15th century',
  '16th century',
  'pacific ocean',
  'atlantic ocean',
  'accra',
  'africa',
  'atlantic slave trade',
  'african slave trade'],
 ['14th century',
  'europe',
  'africa',
  'atlantic slave trade',
  'african slave trade'],
 ['14th century',
  'niger',
  'nigeria',
  'british empire',
  'slavery',
  'africa',
  'atlantic slave trade',
  'african slave trade'],
 ['14th century', 'renaissance', 'ancient greece', 'greece'],
 ['14th century',
  'italy',
  'roman catholic church',
  'hiv',
  'ronald reagan',
  'president of the united states',
  'john f. kennedy']]

The dataframe `links_freq_df` contains the links' frequency of clicks.

In [14]:
links_name=[]
links_freq=[]

i=0
for article in articles['article']: # count the number of times each article shows up in paths
    count=0
    links_name.append(article)
    for i in range(len(clicked_links_filtered)):
        if article in clicked_links_filtered[i]:
            count+=1 
    links_freq.append(count)

links_name=pd.Series(links_name)
links_freq=pd.Series(links_freq)
links_freq_df=pd.concat([links_name, links_freq], axis=1, names=['link', '#_clicks'])
links_freq_df.columns=['link', '#_clicks']
links_freq_df.loc[links_freq_df['#_clicks']!=0].head()

Unnamed: 0,link,#_clicks
1,åland,2
2,édouard manet,2
3,éire,3
5,€2 commemorative coins,1
6,10th century,109


In [36]:
clicks_tot=links_freq_df['#_clicks'].sum()
print('The total number of clicked hyperlinks is {0}.'.format(clicks_tot))

The total number of clicked hyperlinks is 321717.


Next step of the descriptive analysis process is the HTML files parsing aimed at determining the location of hyperlinks. 

In [17]:
# parse html files 

from bs4 import BeautifulSoup

soups=[]
failed_html_list = []
for html in html_list[:]:
    f=open(wp_path+html, 'r')
    try:
        soup = BeautifulSoup(f, 'lxml')
        soups.append(soup)
    except Exception:
        failed_html_list.append(html)

In [18]:
print('{0} files were not parsable. However, they are not reachable nor it is possible to click \
any hyperlink in those html pages.'.format(len(failed_html_list)))

53 files were not parsable. However, they are not reachable nor it is possible to click any hyperlink in those html pages.


`link_positioning_dfs` is a list of dataframes (`link_positioning_df`) containing hyperlinks showing up in one html file, the ordinal number of the paragraph they belong to and their relative positioning troughout the article text. 

In [22]:
# HTML tag for hyperlinks is <a href= >
# the function finds all <a href= > tags throughout the article html file

def find_links(soup): 
    paragraphs = soup.find_all('p') 
    links=[]
    i=0
    for p in paragraphs:
        i+=1
        all_links=p.find_all('a')
        for link in all_links:
            if 'href' in link.attrs:
                link_title=link.text
                link_paragraph=i
                links.append([link_title, link_paragraph])
                
    link_positioning_df=pd.DataFrame(links, columns=['link', '#_paragraph']).drop_duplicates()
    link_positioning_df['positioning']=round(link_positioning_df['#_paragraph']/len(paragraphs), 2)
    link_positioning_df['link']=filter_link_list(link_positioning_df['link'])
    return link_positioning_df

In [54]:
# apply the function to all html files

link_positioning_dfs=[]
for soup in soups:
    link_positioning_dfs.append(find_links(soup))

link_positioning_dfs[:5]

[                              link  #_paragraph  positioning
 0                       royal navy            1         0.01
 1                   united kingdom            1         0.01
 2                         cold war           18         0.19
 3                     sierra leone           20         0.21
 4                         dartmoor           24         0.25
 5                     world war ii           29         0.30
 6                         plymouth           34         0.35
 7                         scotland           34         0.35
 8                        gibraltar           43         0.44
 9                             cuba           44         0.45
 10               george washington           44         0.45
 11                      royal navy           45         0.46
 13                      portsmouth           46         0.47
 14                        plymouth           46         0.47
 15      american revolutionary war           46         0.47
 16     

In [23]:
# example with 2 files

URL_1 = wp_path+'a/A_cappella.htm'
URL_2 = wp_path+'m/Music.htm'

f1=open(URL_1, 'r')
soup1 = BeautifulSoup(f1, 'lxml')

f2=open(URL_2, 'r')
soup2 = BeautifulSoup(f2, 'lxml')

link_positioning_df_1=find_links(soup1).drop(labels='#_paragraph', axis=1)
link_positioning_df_2=find_links(soup2).drop(labels='#_paragraph', axis=1)

# merging the 2 dataframes

merged_dfs=link_positioning_df_1.merge(link_positioning_df_2, how='outer', left_on='link', right_on='link', )
merged_dfs

Unnamed: 0,link,positioning_x,positioning_y
0,music,0.04,
1,instrumental,0.04,
2,renaissance,0.04,
3,renaissance,0.12,
4,gregorian chant,0.12,
5,amish,0.21,
6,eastern orthodox,0.21,
7,jewish,0.33,
8,beatles,0.75,
9,hindi,0.88,


In [None]:
# merge all the link_positioning_df in link_positioning_dfs list by hyperlink in order to apply a function 
# that determines the "average" positioning of the hyperlinks

for df in link_positioning_dfs:
    try:
        df.drop(labels='#_paragraph', axis=1, inplace=True)  # remove not needed column
    except:
        pass    

from functools import reduce # perform multiple merge
links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
links_pos_dfs_merged.head()

  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  

  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  

  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  

  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  

  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  

  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  left,right: pd.merge(left,right,on=['link'],how='outer'), link_positioning_dfs)
  links_pos_dfs_merged = reduce(lambda  

In [27]:
# function that determines the most frequent positioning of each hyperlink in the set of all links

def find_positioning(array):
    bin_labels=['top', 'center', 'bottom'] # positioning is classified as top, center or bottom of the article
    bins=[0, 0.33, 0.66, 1] 
    cut_positioning=pd.cut(array, bins, labels=bin_labels)
    positioning_frequency=pd.value_counts(cut_positioning) # frequency of the bins
    most=positioning_frequency.apply('max') # bin with highest frequency
    if len(positioning_frequency[positioning_frequency==most])>1:
        return np.random.choice(positioning_frequency[positioning_frequency==most].index.values.astype(str)) #account for same frequency
    else:
        return positioning_frequency[positioning_frequency==most].index.values.astype(str).item()

The dataframe `most_freq_positioning_df` contains the hyperlinks "average" positioning.

In [32]:
# example with 2 files

most_freq_positioning_2=merged_dfs.groupby(by=merged_dfs['link']).apply(lambda x: find_positioning(np.array(x.values).flatten()))
most_freq_positioning_df_2=pd.DataFrame(most_freq_positioning, columns=['most_freq_positioning'])
most_freq_positioning_df_2=most_freq_positioning_df_2.reset_index()
most_freq_positioning_df_2

Unnamed: 0,link,most_freq_positioning
0,aesthetic,top
1,amish,top
2,anthropology,bottom
3,art,top
4,artistic,top
5,beatles,bottom
6,beethoven,top
7,bluegrass,top
8,communicative,top
9,culture,top


In [29]:
# all html files

most_freq_positioning=links_pos_dfs_merged.set_index("link").groupby("link").apply(lambda x: find_positioning(x.values.flatten()))
most_freq_positioning_df=pd.DataFrame(most_freq_positioning, columns=['most_freq_positioning'])
most_freq_positioning_df

NameError: name 'links_pos_dfs_merged' is not defined

The dataframe `links_freq_pos` contains both the hyperlinks' most frequent positioning and their number of clicks.

In [34]:
# example with 2 files

links_freq_pos_2=links_freq_df.merge(most_freq_positioning_df_2, left_on='link', right_on='link')
links_freq_pos_2

Unnamed: 0,link,#_clicks,most_freq_positioning
0,amish,19,top
1,anthropology,142,bottom
2,art,589,top
3,culture,583,top
4,folk music,155,top
5,gregorian chant,34,top
6,hindi,61,bottom
7,india,1216,top
8,jazz,281,top
9,johann wolfgang goethe,16,top


In [None]:
# all html files

links_freq_pos=links_freq_df.merge(most_freq_positioning_df, left_on='link', right_on='link')
links_freq_pos

The dataframe `links_freq_pos_grouped_df` groups the hyperlinks click frequency by their positioning.

In [36]:
# example with 2 files

links_freq_pos_grouped_2=links_freq_pos['#_clicks'].groupby(links_freq_pos['most_freq_positioning']).sum()
links_freq_pos_grouped_df_2=pd.DataFrame(links_freq_pos_grouped_2)
links_freq_pos_grouped_df_2

Unnamed: 0_level_0,#_clicks
most_freq_positioning,Unnamed: 1_level_1
bottom,2575
center,180
top,4551


In [None]:
# all html files

links_freq_pos_grouped=links_freq_pos['#_clicks'].groupby(links_freq_pos['most_freq_positioning']).sum()
links_freq_pos_grouped_df=pd.DataFrame(links_freq_pos_grouped)
links_freq_pos_grouped_df

In [None]:
# visualization of the Distribution of the clicked hyperlinks' positioning

links_freq_pos.sort_values(by='#_clicks', ascending=False, inplace=True)
#links_freq_pos=links_freq_pos[:1000] # may be interesting to 1000 most clicked articles 
#links_freq_pos=links_freq_pos[:-1000] # 1000 least clicked articles 

ax = links_freq_pos_grouped_df.plot.bar()
ax.set_title('Distribution of the clicked hyperlinks positioning')
ax.set_xlabel('Positioning')

In [None]:
# plot distribution of hyperlinks in the text

In [None]:
def find_positioning_dis(array):
    bin_labels=['top', 'center', 'bottom']
    bins=[0, 0.33, 0.66, 1]
    cut_positioning=pd.cut(array, bins, labels=bin_labels)
    positioning_frequency=pd.value_counts(cut_positioning)
    return positioning_frequency

In [None]:
positioning_freq=links_pos_dfs_merged.set_index("link").groupby("link").apply(lambda x: find_positioning_dis(x.values.flatten()))
positioning_freq_df=pd.DataFrame(positioning_freq, columns=['positioning_freq'])
positioning_freq_df.reset_index(level=1, inplace=True)
positioning_freq_df.rename(columns={'level_1':'positioning'}, inplace=True)
positioning_freq_df.head(20)

In [None]:
positioning_freq_df_grouped=positioning_freq_df.groupby('positioning').sum()
positioning_freq_df_grouped
#positioning_freq_df_grouped.loc[positioning_freq_df_grouped['positioning_freq']>1]

In [None]:
ax = positioning_freq_df_grouped.plot.bar()
ax = links_freq_pos_grouped_df.plot.bar()
ax.set_title('Distribution of hyperlinks positioning')
ax.set_xlabel('Positioning')

In [None]:
# code for finding hyperlinks in images' caption

In [None]:
# example with 1 file

In [None]:
URL = '/Users/ginevralarroux/Desktop/EPFL courses/Applied data analysis/ADA project/data/wpcd/wp/a/Aachen.htm'
f2=open(URL, 'r')
soup2 = BeautifulSoup(f2, 'lxml')

In [None]:
# in an HTML file the images can be either find under tables or div tags 

In [None]:
# images of hyperlinks in tables' captions

def find_link_in_table(soup):
    
    tables = soup.find_all('table')

    links=[]
    for table in tables:
        if table.find('img'):
            all_links=table.find_all('a')
            for link in all_links:
                if 'class' not in link.attrs:
                    link_title=link.text
                    links.append(link_title)

    if links:
        links_in_table_df=pd.DataFrame(links).drop_duplicates()
        links_in_table_df.columns=['link']
        links_in_table_df=links_in_table_df.apply(filter_article_name) 
    else: 
        return None
    return links_in_table_df

find_link_in_table(soup2)

In [None]:
links_in_tables=[]
for soup in soups:
    links_in_tables.append(find_link_in_table(soup))

links_in_tables=pd.concat(links_in_tables, axis=0)
links_in_tables

In [None]:
# images under div tags
def find_links_in_img(soup):
    div = soup.find_all('div')

    links=[]

    for d in div:
        if (d.find('a', class_="internal")):
            if (d.find('img', class_='thumbimage')):
                caption=d.find('div', class_="thumbcaption")
                if (caption.find('a')):
                    all_links=caption.find_all('a')
                    for link in all_links:
                        if 'class' not in link.attrs:
                            link_title=link.text
                            links.append(link_title)
    if links:
        links_in_image_df=pd.DataFrame(links).drop_duplicates()
        links_in_image_df.columns=['link']
        links_in_image_df=links_in_image_df.apply(filter_article_name) 
    else: 
        return None
        links_in_image_df=pd.DataFrame()
    return links_in_image_df


find_links_in_img(soup2)

In [None]:
find_links_in_img(soups[0])

In [None]:
links_in_images=[]
for soup in soups:
    #if get_hrefs_pics(soup):
    links_in_images.append(find_links_in_img(soup))

links_in_images=pd.concat(links_in_images, axis=0)
links_in_images

In [None]:
len(soups)

In [None]:
# function for all html files

def find_links_in_div(soups):
    dfs=[]
    for soup in soups:
        div = soup.find_all('div')
        links=[]
        for d in div:
            if (d.find('a', class_="internal")):
                print('yes')
                if d.find('div', class_="thumbcaption"):
                    print('yes, yes')
                    caption=d.find('div', class_="thumbcaption")
                    if (caption.find('a')):
                        all_links=caption.find_all('a')
                        print(all_links)
                    
                        if len(all_links)>1:
                            print(len(all_links))
                            for link in all_links:
                                if 'class' not in link.attrs:
                                
                                    link_title=link.text
                                    links.append(link_title)
                            
        if link:
            links_in_image_df=pd.DataFrame(links).drop_duplicates()
            links_in_image_df.columns=['link_in_image']
            links_in_image_df=links_in_image_df.apply(filter_article_name)     
    return dfs

links_in_image_dfs=find_links_in_div(soups)
links_in_image_dfs

In [None]:
# code for frequency of clicks for link in images 

In [None]:
# example with 2 files

In [None]:
links_in_image_dfs=[links_in_image_df, links_in_table_df] #not needed for dataframes of all articles
link_in_image_concatenated=pd.concat(links_in_image_dfs, axis=0).reset_index(drop=True).drop_duplicates()
link_in_image_concatenated

In [None]:
flag=[]
links_freq_img=links_freq_df.copy()

for link in links_freq_df['link']:
    if link in list(link_in_image_concatenated['link']):
        flag.append(1)
    else:
        flag.append(0)
    
links_freq_img['in_image']=flag
links_freq_img[links_freq_img['in_image']==1]

In [None]:
links_freq_img_grouped_df=pd.DataFrame(links_freq_img['#_clicks'].groupby(links_freq_img['in_image']).sum())
links_freq_img_grouped_df

In [None]:
# visualization of img  

ax = links_freq_img_grouped_df.plot.bar()
ax.set_title('Distribution of the clicked hyperlinks in images captions')
ax.set_xlabel('In image caption')

In [None]:
# assumptions: 
# we don't know for the hyperlinks that show up both in the article text and in the image caption, 
# which the player actually clicked 
# for the hyperlinks which show up multiple times in the article ...