In [1]:
import pandas as pd
import numpy as np

In [2]:
# code for reading all HTLM files in the directory

The aim of this specific game strategy analysis is to test whether Wikispeedia players have clikability preferences on the positioning of the hyperlinks in the article text. Firstly we will address their postioning in the text, evaluated on the basis of the ordinal number of the paragraph they show up in; secondly we will test if there is a correlation between the clickthrough rate of the hyperlinks and their location in image captions. 

It is therefore essential to extract from the data the clicktrhough rate for each hyperlink.

In [3]:
# code for extracting the clickthrough rate of hyperlinks

In [4]:
folder='/Users/ginevralarroux/Desktop/EPFL courses/Applied data analysis/ADA project/data/wikispeedia_paths-and-graph/'

paths_finished='paths_finished.tsv'
articles='articles.tsv'

paths_finished=pd.read_csv(folder+paths_finished, sep='\t', skiprows=15, header=None, names=['hashedIpAddress', 'timestamp', 'durationInSec', 'path', 'rating'])
articles=pd.read_csv(folder+articles, sep='\t', skiprows=12, header=None, names=['article'])

This dataframe contains the paths chosen by the players who could reach the target article.

In [5]:
paths_finished.head()

Unnamed: 0,hashedIpAddress,timestamp,durationInSec,path,rating
0,6a3701d319fc3754,1297740409,166,14th_century;15th_century;16th_century;Pacific...,
1,3824310e536af032,1344753412,88,14th_century;Europe;Africa;Atlantic_slave_trad...,3.0
2,415612e93584d30e,1349298640,138,14th_century;Niger;Nigeria;British_Empire;Slav...,
3,64dd5cd342e3780c,1265613925,37,14th_century;Renaissance;Ancient_Greece;Greece,
4,015245d773376aab,1366730828,175,14th_century;Italy;Roman_Catholic_Church;HIV;R...,3.0


This dataframe lists all the clickable articles.

In [6]:
articles.head()

Unnamed: 0,article
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in
1,%C3%85land
2,%C3%89douard_Manet
3,%C3%89ire
4,%C3%93engus_I_of_the_Picts


In [7]:
# this is the list articles in the lists of paths

clicked_links=[]
for x in paths_finished['path']:
    path=(x.split(';'))
    clicked_links.append(path)

clicked_links[:10]

[['14th_century',
  '15th_century',
  '16th_century',
  'Pacific_Ocean',
  'Atlantic_Ocean',
  'Accra',
  'Africa',
  'Atlantic_slave_trade',
  'African_slave_trade'],
 ['14th_century',
  'Europe',
  'Africa',
  'Atlantic_slave_trade',
  'African_slave_trade'],
 ['14th_century',
  'Niger',
  'Nigeria',
  'British_Empire',
  'Slavery',
  'Africa',
  'Atlantic_slave_trade',
  'African_slave_trade'],
 ['14th_century', 'Renaissance', 'Ancient_Greece', 'Greece'],
 ['14th_century',
  'Italy',
  'Roman_Catholic_Church',
  'HIV',
  'Ronald_Reagan',
  'President_of_the_United_States',
  'John_F._Kennedy'],
 ['14th_century',
  'Europe',
  'North_America',
  'United_States',
  'President_of_the_United_States',
  'John_F._Kennedy'],
 ['14th_century', 'China', 'Gunpowder', 'Fire'],
 ['14th_century', 'Time', 'Isaac_Newton', 'Light', 'Color', 'Rainbow'],
 ['14th_century', 'Time', 'Light', 'Rainbow'],
 ['14th_century',
  '15th_century',
  'Plato',
  'Nature',
  'Ultraviolet',
  'Color',
  'Rainbow']]

In [8]:
# this is the dataframe containing the hyperlinks and their clicktrhough rates

links_name=[]
links_freq=[]
count=0
i=0
for article in articles['article']:
    links_name.append(article)
    for i in range(len(clicked_links)):
        if article in clicked_links[i]:
            count+=1 
    links_freq.append(count)

links_name=pd.Series(links_name)
links_freq=pd.Series(links_freq)
links_freq_df=pd.concat([links_name, links_freq], axis=1, names=['link', 'clickthrough_rate'])
links_freq_df.columns=['link', 'clickthrough_rate']
links_freq_df.head(10)

Unnamed: 0,link,clickthrough_rate
0,%C3%81ed%C3%A1n_mac_Gabr%C3%A1in,0
1,%C3%85land,2
2,%C3%89douard_Manet,4
3,%C3%89ire,7
4,%C3%93engus_I_of_the_Picts,7
5,%E2%82%AC2_commemorative_coins,8
6,10th_century,117
7,11th_century,232
8,12th_century,369
9,13th_century,497


In [9]:
# code for extracting the positioning of the hyperlinks in the article text

In [10]:
from bs4 import BeautifulSoup

In [11]:
URL = '/Users/ginevralarroux/Desktop/EPFL courses/Applied data analysis/ADA project/data/wpcd/wp/a/A_cappella.htm'

f=open(URL, 'r')
soup = BeautifulSoup(f, 'lxml')

In [12]:
# Andres' code

#for i, p in enumerate(soup.find_all("p")):
#    as_in_p = p.find_all("a")
#    for a in as_in_p:
#        print(f"{i+1:<10} {a.text:<20} {a.get('href')}")

In [13]:
paragraphs = soup.find_all('p')
print('The total number of paragraphs is {0}.'.format(len(paragraphs)))

The total number of paragraphs is 24.


In [14]:
# HTML tag for hyperlinks is <a href= >
# this is the dataframe of the hyperlinks contained in an article, with the number of paragraph they show up in 
# and its relative postion in the article

links=[]
i=0

for p in paragraphs:
    i+=1
    all_links=p.find_all('a')
    for link in all_links:
        if 'href' in link.attrs:
            link_title=link.text
            link_paragraph=i
            links.append([link_title, link_paragraph])

link_positioning_df=pd.DataFrame(links, columns=['link', '#_paragraph'])
link_positioning_df['positioning']=round(link_positioning_df['#_paragraph']/len(paragraphs), 2)
link_positioning_df

Unnamed: 0,link,#_paragraph,positioning
0,music,1,0.04
1,instrumental,1,0.04
2,Renaissance,1,0.04
3,Gregorian chant,3,0.12
4,Renaissance,3,0.12
5,Amish,5,0.21
6,Eastern Orthodox,5,0.21
7,Jewish,8,0.33
8,Beatles,18,0.75
9,Hindi,21,0.88


In [15]:
# code for finding hyperlinks in images' caption

In [16]:
URL = '/Users/ginevralarroux/Desktop/EPFL courses/Applied data analysis/ADA project/data/wpcd/wp/a/Aachen.htm'
f=open(URL, 'r')
soup2 = BeautifulSoup(f, 'lxml')

In [17]:
# in an HTML file the images can be either find under tables or div tags 

In [18]:
tables = soup2.find_all('table')
print('The total number of tables is {0}.'.format(len(tables)))

The total number of tables is 2.


In [19]:
img = soup2.find_all('img')
print('The total number of images is {0}.'.format(len(img)))

The total number of images is 24.


In [20]:
# images under table tags
# dataframe of hyperlinks in tables' captions

links=[]

for table in tables:
    if (table.img):
        all_links= table.find_all('a')
        for link in all_links:
            if 'class' not in link.attrs:
                link_title=link.text
                links.append(link_title)
        
links_in_table_df=pd.DataFrame(links)
links_in_table_df.columns=['link_in_table']
links_in_table_df

Unnamed: 0,link_in_table
0,Country
1,Germany


In [21]:
# images under div tags
# dataframe of hyperlinks in tables' captions

div = soup2.find_all('div')

links=[]

for d in div:
    if (d.find('a', class_="internal")):
        if (d.find('img', class_='thumbimage')):
            caption=d.find('div', class_="thumbcaption")
            if (caption.find('a')):
                all_links=caption.find_all('a')
                for link in all_links:
                    if 'class' not in link.attrs:
                        link_title=link.text
                        links.append(link_title)
                            
links_in_image_df=pd.DataFrame(links).drop_duplicates() 
links_in_image_df.columns=['link_in_image']
links_in_image_df    

Unnamed: 0,link_in_image
0,German
1,Dutch
2,Belgian
6,20th century


In [22]:
# when we'll have all the articles' dataframes we will merge them by link and get the average of their positioning
# we will sort the articles based on their clickthrough rate, ascending=False
# we will check if there is a correlation between the positioning and clickthrough rate

In [23]:
# when we'll have all the articles' dataframes we will label with a flag the hyperlinks located in images' captions
# we will check if there is a correlation between the location in an image caption and clickthrough rate

In [24]:
# assumptions: 
# we don't know for the hyperlinks that show up both in the article text and in the image caption, 
# which the player actually clicked 
# for the hyperlinks which show up multiple times in the article ...
