In [43]:
import requests
import json
import pandas as pd
from datetime import datetime
from SPARQLWrapper import SPARQLWrapper, JSON

In [None]:
'''
This collects all people with occupation 'Singer' (Q177220) 'Songwriter' (Q753110), 'Rapper' (Q2252262), 'Guitarist' (Q855091)
and also all entities of category 'Musical Group' (Q215380)
It only selects unique entries, with a specified 'work period (start)' that starts from 1960
and also who have received at least an award.

Total number of entries is 5367, which is manageable.
'''
# Define the SPARQL query for fetching entity labels
query = """
SELECT DISTINCT ?entity ?entityLabel WHERE {
  {
    VALUES ?occupation {wd:Q177220 wd:Q753110 wd:Q2252262 wd:Q855091}
    ?entity wdt:P106 ?occupation;
            wdt:P166 ?award;
            wdt:P2031 ?workStart.
    FILTER(YEAR(?workStart) >= 1960)
  } UNION {
    ?entity wdt:P31 wd:Q215380;
            wdt:P166 ?award;
            wdt:P2031 ?groupWorkStart.
    FILTER(YEAR(?groupWorkStart) >= 1960)
  }
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
LIMIT 5400
"""

# Set the endpoint URL and headers
url = "https://query.wikidata.org/sparql"
headers = {"Accept": "application/sparql-results+json"}

# Make the request
response = requests.get(url, headers=headers, params={'query': query})
data = response.json()

# Initialize an empty list to store names
people_names = []

# Iterate through the results and add names to the list
for result in data['results']['bindings']:
    people_names.append(result['entityLabel']['value'])

# Optionally, print the names or the total number of names collected
print(f"Collected a total of {len(people_names)} names.")

In [3]:
people_names

['Bruce Springsteen',
 'Alexandra Stan',
 'Neil Young',
 'Madonna',
 'Renaud',
 'Bob Marley',
 'Janis Joplin',
 'Kate Bush',
 'Patti Smith',
 'Cyndi Lauper',
 'Stevie Wonder',
 'Bruno Mars',
 'Bob Dylan',
 'Brigitte Fontaine',
 'Bruce Willis',
 'Mike Oldfield',
 'Stefanie Heinzmann',
 'Michael Jackson',
 'Barbra Streisand',
 'Céline Dion',
 'Elton John',
 'Miley Cyrus',
 'Sasha Grey',
 'Daniel Balavoine',
 'Miranda Cosgrove',
 'Eminem',
 'Snoop Dogg',
 'David Bowie',
 'Salman Khan',
 'Beck',
 'Prince',
 'Heino',
 'Aaliyah',
 'Britney Spears',
 'Tupac Shakur',
 'Lena Meyer-Landrut',
 'Amitabh Bachchan',
 'Whoopi Goldberg',
 'Kana Asumi',
 'Eleni Tzoka',
 'Alain Souchon',
 'Haruka Tomatsu',
 'Thomas Fersen',
 'Minako Kotobuki',
 'Molly Sandén',
 'Tim Curry',
 'Krzysztof Grabowski',
 'Françoise Hardy',
 'Emiri Katō',
 'Jane Zhang',
 'Vanessa Paradis',
 'Kanae Itō',
 'Jolin Tsai',
 'Carly Rae Jepsen',
 'Ray William Johnson',
 'Yui Horie',
 'Marisa Monte',
 'Pedro Almodóvar',
 'Patrick Sway

In [9]:
# Initialization
base_url = "https://en.wikipedia.org/w/api.php"
session = requests.Session()
headers = {'User-Agent': 'YourAppName/1.0 (your-email@example.com)'}
names_2023_visits = []
article_found_counter = 0

# Loop through each name in the people_names list
for name in people_names:
    # Encode the name for URL
    page_title_formatted = requests.utils.quote(name.replace(' ', '_'))
    
    # Initialize total views for 2023
    total_views_2023 = 0
    
    # Construct the URL for fetching pageviews
    pageviews_url = f"https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/user/{page_title_formatted}/monthly/20230101/20231231"
    
    # Make the request
    response_pageviews = requests.get(pageviews_url, headers=headers)
    if response_pageviews.status_code == 200:
        pageviews_data = response_pageviews.json().get('items', [])
        
        # Sum up views for 2023
        for item in pageviews_data:
            total_views_2023 += item.get('views', 0)
        
        # Check if any views were found and append to the data list
        if total_views_2023 > 0:
            names_2023_visits.append({"name": name, "2023_visits": total_views_2023})
            article_found_counter += 1
        else:
            print(f"No views data found for {name}")
    else:
        print(f"Failed to fetch pageviews for {name}")

# After collecting data, create a DataFrame
names_visits_df = pd.DataFrame(names_2023_visits)

# Print the number of articles found and the total number of names processed
print(f"{article_found_counter} articles found with views data, out of the {len(people_names)} in the list")

Failed to fetch pageviews for AC/DC
Failed to fetch pageviews for Anastasiia Prykhodko
Failed to fetch pageviews for Bloodsucking Zombies From Outer Space
Failed to fetch pageviews for Lee Jeong-hyeon
Failed to fetch pageviews for Polad Bulbuloghlu
Failed to fetch pageviews for Band ohne Namen
No views data found for deLillos
Failed to fetch pageviews for Ewa Bem
Failed to fetch pageviews for Lyna bonasera
Failed to fetch pageviews for Tammin Pamela Sursok
Failed to fetch pageviews for María Ostiz
No views data found for k-os
Failed to fetch pageviews for Darius & Finlay
Failed to fetch pageviews for Françis Bebey
No views data found for Ryan Conner
Failed to fetch pageviews for Marie-Paule Belle
Failed to fetch pageviews for Jang Dong-geon
Failed to fetch pageviews for Petr Kotvald
Failed to fetch pageviews for Allain Leprest
Failed to fetch pageviews for Fruzsina Kovácsovics
Failed to fetch pageviews for Clowns & Helden
Failed to fetch pageviews for Galandum Galundaina
Failed to fetc

Failed to fetch pageviews for Sandra Kolstad
No views data found for Pop Design
Failed to fetch pageviews for Amy Search
No views data found for Čuki
Failed to fetch pageviews for Seikō Itō
Failed to fetch pageviews for Isao Bitō
Failed to fetch pageviews for Jazz Band Ball Orchestra
Failed to fetch pageviews for Krystyna Prońko
Failed to fetch pageviews for Ushirokara Haiyoritai
Failed to fetch pageviews for Leszek Wójtowicz
Failed to fetch pageviews for Djevel
Failed to fetch pageviews for Nataliya Shelepnytska
Failed to fetch pageviews for Krystyna Świątecka
Failed to fetch pageviews for Jan Wojdak
Failed to fetch pageviews for Montée
No views data found for Asamisimasa
Failed to fetch pageviews for Miyako Ōtsuki
Failed to fetch pageviews for Karyudo
Failed to fetch pageviews for Lech Czerkas
Failed to fetch pageviews for Anna Roig i L'ombre de ton chien
Failed to fetch pageviews for Kumiko Mori
Failed to fetch pageviews for Akiko Kanazawa
Failed to fetch pageviews for Nacho Mañó
Fa

Failed to fetch pageviews for MC Cabelinho
Failed to fetch pageviews for Sasha Popova
Failed to fetch pageviews for Natalya Gerasimova
Failed to fetch pageviews for Lucía Muñoz Maldonado
Failed to fetch pageviews for Belén Aguilera
Failed to fetch pageviews for Cris MJ
Failed to fetch pageviews for Rayne Almeida
Failed to fetch pageviews for Alenka Star Be
Failed to fetch pageviews for Ayaz Babayev
Failed to fetch pageviews for Sandra Guida
Failed to fetch pageviews for Froukje
Failed to fetch pageviews for Xamdam Sobirov
Failed to fetch pageviews for Kamila Cree
Failed to fetch pageviews for Liza Myalik
Failed to fetch pageviews for Sara Fajira
Failed to fetch pageviews for Mushkaa
Failed to fetch pageviews for Marcianeke
Failed to fetch pageviews for Fathia Izzati
Failed to fetch pageviews for Kristina Ramazanova
Failed to fetch pageviews for Röstäm Mälikef
Failed to fetch pageviews for ADONXS
Failed to fetch pageviews for Xäbib
Failed to fetch pageviews for Färidä Safina
Failed to f

In [29]:
# How to handle AC/DC?
name='AC/DC'
total_views_2023 = 0

headers = {'User-Agent': 'YourAppName/1.0 (your-email@example.com)'}

# %2F is the url encoding for the character '/'
pageviews_url = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/en.wikipedia/all-access/user/AC%2FDC/monthly/20230101/20231231"

response_pageviews = requests.get(pageviews_url, headers=headers)
if response_pageviews.status_code == 200:
    pageviews_data = response_pageviews.json().get('items', [])

    for item in pageviews_data:
        total_views_2023 += item.get('views', 0)

    if total_views_2023 > 0:
        #names_2023_visits.append({"name": name, "2023_visits": total_views_2023})
        print(f'{name}, {total_views_2023}')
    else:
        print(f"No views data found for {name}")
else:
    print(f"Failed to fetch pageviews for {name}") 

AC/DC, 2565212


In [57]:
names_visits_df=pd.concat([names_visits_df, pd.DataFrame({'name': 'AC/DC', '2023_visits': 2565212}, index=[len(names_visits_df)+2])])

In [58]:
names_visits_df[names_visits_df['name']=='AC/DC']

Unnamed: 0,name,2023_visits
4921,AC/DC,2565212


In [59]:
names_visits_df.sort_values(by='2023_visits', ascending=False, inplace=True)
names_visits_df.head(10)

Unnamed: 0,name,2023_visits
0,Taylor Swift,22030018
1,XXXTentacion,20104479
2,Alia Bhatt,12669342
3,Tina Turner,11590403
4,Keanu Reeves,9916477
5,Rihanna,9885357
6,Bruce Willis,9468530
7,Michael Jackson,9129164
8,Ryan Gosling,8766941
9,Selena Gomez,7550367


In [17]:
names_visits_df.drop_duplicates(inplace=True)

In [18]:
top_2023_visits_df=names_visits_df[0:1000]
top_2023_visits_df

Unnamed: 0,name,2023_visits
0,Taylor Swift,22030018
1,XXXTentacion,20104479
2,Alia Bhatt,12669342
3,Tina Turner,11590403
4,Keanu Reeves,9916477
...,...,...
995,Crowded House,348751
996,Hwasa,348549
997,CeCe Winans,348064
998,Devi Sri Prasad,347937


In [19]:
top_2023_visits_df.to_csv('top_visited_2023.csv', index=False)