In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON
from geopy.extra.rate_limiter import RateLimiter
from geopy.geocoders import Nominatim
from google.colab import files
import pandas as pd
import requests
import time
import wikipediaapi
pip install SPARQLWrapper
pip install geopy
pip install wikipedia-api

### Installation and Import of SPARQLWrapper Library
This section installs `SPARQLWrapper` to enable querying RDF databases using SPARQL directly from Python.

In [None]:
pip install SPARQLWrapper

Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Collecting rdflib>=6.1.1 (from SPARQLWrapper)
  Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m531.9/531.9 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting isodate<0.7.0,>=0.6.0 (from rdflib>=6.1.1->SPARQLWrapper)
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate, rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-2.0.0 isodate-0.6.1 rdflib-7.0.0


### Fetching Data Using SPARQL
This function queries the Wikidata endpoint to retrieve birthplaces of singers using SPARQL, demonstrating how to interact with semantic web data.

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON

def get_singers_birthplaces():
    """Query Wikidata for singers and their places of birth.

    Sends one fixed SPARQL query (capped at 100 rows by its LIMIT clause) to
    the public Wikidata endpoint.

    Returns:
        A list of strings of the form "<person label>: Born in <birthplace label>",
        one per result row, in the order the endpoint returned them.
    """
    endpoint = SPARQLWrapper("https://query.wikidata.org/sparql")
    endpoint.setQuery("""
    SELECT ?personLabel ?birthplaceLabel WHERE {
      ?person wdt:P106 wd:Q177220;       # Occupation singer
              wdt:P19 ?birthplace.      # Place of birth
      SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
    }
    LIMIT 100
    """)
    endpoint.setReturnFormat(JSON)
    rows = endpoint.query().convert()["results"]["bindings"]

    # Each binding maps a variable name to a dict whose "value" key holds the text.
    return [
        f"{row['personLabel']['value']}: Born in {row['birthplaceLabel']['value']}"
        for row in rows
    ]

# Retrieve the formatted "name: Born in place" lines and print one per line
singers_list = get_singers_birthplaces()
if singers_list:
    print("\n".join(singers_list))


Eric Saade: Born in Kattarp
Eldar Qasımov: Born in Baku
Nigar Jamal: Born in Baku
Inna: Born in Mangalia
Beth Ditto: Born in Searcy
Willy Moon: Born in Wellington
Moby: Born in Harlem
Hank Mizell: Born in Daytona Beach
Pharrell Williams: Born in Virginia Beach
Phoebe Cates-Kline: Born in Manhattan
Mac DeMarco: Born in Duncan
Sharon Van Etten: Born in Belleville
Terry Callier: Born in Chicago
Mats Levén: Born in Gothenburg
Ville Kantee: Born in Joutseno
Lil Wayne: Born in New Orleans
Freddie Mercury: Born in Zanzibar City
Brian May: Born in Hampton
Roger Taylor: Born in Dersingham
Amy Macdonald: Born in Bishopbriggs
Kanye West: Born in Atlanta
William Shatner: Born in Notre-Dame-de-Grâce
Clive Sarstedt: Born in Ajmer
Robert Mitchum: Born in Bridgeport
Andy Lau: Born in Tai Po
Jacky Cheung: Born in British Hong Kong
Daniel Boemle: Born in Bern
Leon Lai: Born in Beijing
Matti Nykänen: Born in Jyväskylä
Aaron Kwok: Born in Hong Kong
Vasco Rossi: Born in Zocca
Claudio Sanchez: Born in Suffe

### Direct HTTP Requests for Artist Information
Using direct HTTP requests to retrieve artist birthplaces from Wikidata, this shows a manual approach to sending queries and processing responses.

In [None]:
import requests

def fetch_artist_birthplaces(artist_names, limit=10):
    """Look up artists' birthplaces with direct HTTP requests to the Wikidata SPARQL endpoint.

    One query is sent per artist. It matches entities whose occupation is
    singer (P106 = Q177220) and whose English label equals the artist name.

    Args:
        artist_names: Sliceable sequence of English artist labels.
        limit: Only the first ``limit`` names are queried (default 10).

    Returns:
        A list of ``(artist_name, birthplace)`` tuples in input order. The
        birthplace is "Birthplace not found" when the query returns no row,
        "Unknown" when a row lacks ``birthplaceLabel``, and "Request failed"
        when the HTTP request fails or the response cannot be decoded.
    """
    wikidata_endpoint = "https://query.wikidata.org/sparql"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept': 'application/sparql-results+json'
    }
    artist_birthplaces = []

    for artist_name in artist_names[:limit]:  # Limiting to a few artists for demonstration
        # Escape backslashes and double quotes so a name cannot terminate the
        # SPARQL string literal early and break (or alter) the query.
        safe_name = str(artist_name).replace("\\", "\\\\").replace('"', '\\"')
        query = f"""
        SELECT ?artistLabel ?birthplaceLabel WHERE {{
          ?artist wdt:P106 wd:Q177220;  # Occupation singer
                  rdfs:label "{safe_name}"@en;
                  wdt:P19 ?birthplace.
          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
        }}
        LIMIT 1
        """
        try:
            response = requests.get(
                wikidata_endpoint,
                headers=headers,
                params={'query': query, 'format': 'json'},
                timeout=60,
            )
            response.raise_for_status()  # e.g. HTTP 429 when the endpoint throttles us
            bindings = response.json()["results"]["bindings"]
        except (requests.RequestException, ValueError, KeyError) as exc:
            # Record the failure and keep going instead of aborting the whole batch.
            print(f"Request failed for artist {artist_name}: {exc}")
            artist_birthplaces.append((artist_name, "Request failed"))
            continue

        if bindings:
            for result in bindings:
                birthplace = result["birthplaceLabel"]["value"] if "birthplaceLabel" in result else "Unknown"
                artist_birthplaces.append((artist_name, birthplace))
        else:
            artist_birthplaces.append((artist_name, "Birthplace not found"))

    return artist_birthplaces

# Smoke test: look up birthplaces for a handful of well-known artists
sample_artists = ['Taylor Swift', 'Michael Jackson', 'Rihanna']  # Example artist names
artist_birthplaces = fetch_artist_birthplaces(sample_artists)
artist_birthplaces


[('Taylor Swift', 'West Reading'),
 ('Michael Jackson', 'Gary'),
 ('Rihanna', 'Saint Michael')]

### Loading the Daily Page-View Data
Loads the per-article daily page-view table (columns `article`, `date`, `views`) from Google Drive into a DataFrame.

In [None]:
import pandas as pd
# Load the daily page-view table (the output below shows columns article, date, views).
# NOTE(review): the path assumes Google Drive is already mounted at /content/drive;
# no mount call is visible in this notebook — confirm.
df = pd.read_csv("/content/drive/MyDrive/NUMBER_of_views")
df

Unnamed: 0.1,Unnamed: 0,article,date,views
0,0,Taylor Swift,2023-01-01,29705
1,1,Taylor Swift,2023-01-02,25766
2,2,Taylor Swift,2023-01-03,24682
3,3,Taylor Swift,2023-01-04,21214
4,4,Taylor Swift,2023-01-05,22873
...,...,...,...,...
356922,356922,Devi Sri Prasad,2023-12-27,588
356923,356923,Devi Sri Prasad,2023-12-28,636
356924,356924,Devi Sri Prasad,2023-12-29,706
356925,356925,Devi Sri Prasad,2023-12-30,686


### Fetching Birthplaces for All Artists
Applies the same direct-HTTP approach to every unique artist in the page-view table, sending one Wikidata query per artist.

In [None]:
import requests
import pandas as pd

# The 'article' column is assumed to hold the artist names
artists_data = df["article"]
unique_artists = artists_data.unique()  # distinct names, in order of first appearance

def fetch_artist_birthplaces(artist_names, limit=None):
    """Look up artists' birthplaces with direct HTTP requests to the Wikidata SPARQL endpoint.

    One query is sent per artist. It matches entities whose occupation is
    singer (P106 = Q177220) and whose English label equals the artist name.

    Args:
        artist_names: Sliceable sequence of English artist labels.
        limit: Optional cap on how many names (from the start) are queried.
            ``None`` (the default) processes every name. The parameter keeps
            this definition call-compatible with the earlier demo version of
            the same function, which this definition replaces.

    Returns:
        A list of ``(artist_name, birthplace)`` tuples in input order. The
        birthplace is "Birthplace not found" when the query returns no row,
        "Unknown" when a row lacks ``birthplaceLabel``, and "Request failed"
        when the HTTP request fails or the response cannot be decoded.
    """
    wikidata_endpoint = "https://query.wikidata.org/sparql"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept': 'application/sparql-results+json'
    }
    artist_birthplaces = []

    for artist_name in artist_names[:limit]:  # limit=None slices the whole sequence
        # Escape backslashes and double quotes so a name cannot terminate the
        # SPARQL string literal early and break (or alter) the query.
        safe_name = str(artist_name).replace("\\", "\\\\").replace('"', '\\"')
        query = f"""
        SELECT ?artistLabel ?birthplaceLabel WHERE {{
          ?artist wdt:P106 wd:Q177220;  # Occupation singer
                  rdfs:label "{safe_name}"@en;
                  wdt:P19 ?birthplace.
          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
        }}
        LIMIT 1
        """
        try:
            response = requests.get(
                wikidata_endpoint,
                headers=headers,
                params={'query': query, 'format': 'json'},
                timeout=60,
            )
            response.raise_for_status()  # e.g. HTTP 429 when the endpoint throttles us
            bindings = response.json()["results"]["bindings"]
        except (requests.RequestException, ValueError, KeyError) as exc:
            # Record the failure and keep going instead of aborting the whole batch.
            print(f"Request failed for artist {artist_name}: {exc}")
            artist_birthplaces.append((artist_name, "Request failed"))
            continue

        if bindings:
            for result in bindings:
                birthplace = result["birthplaceLabel"]["value"] if "birthplaceLabel" in result else "Unknown"
                artist_birthplaces.append((artist_name, birthplace))
        else:
            artist_birthplaces.append((artist_name, "Birthplace not found"))

    return artist_birthplaces

# Fetch birthplaces for every unique artist (one HTTP request per artist, so this can take a while)
artist_birthplaces = fetch_artist_birthplaces(unique_artists)
artist_birthplaces


[('Taylor Swift', 'West Reading'),
 ('XXXTentacion', 'Plantation'),
 ('Alia Bhatt', 'India'),
 ('Tina Turner', 'Brownsville'),
 ('Keanu Reeves', 'Beirut'),
 ('Rihanna', 'Saint Michael'),
 ('Bruce Willis', 'Idar-Oberstein'),
 ('Michael Jackson', 'Gary'),
 ('Ryan Gosling', 'London'),
 ('Selena Gomez', 'Grand Prairie'),
 ('Whitney Houston', 'Newark'),
 ('Miley Cyrus', 'Nashville'),
 ('Scarlett Johansson', 'Manhattan'),
 ('Hannah Waddingham', 'Wandsworth'),
 ('Eminem', 'St. Joseph'),
 ('Kanye West', 'Atlanta'),
 ('Florence Pugh', 'Oxford'),
 ('Priyanka Chopra', 'Jamshedpur'),
 ('Tupac Shakur', 'Birthplace not found'),
 ('Freddie Mercury', 'Zanzibar City'),
 ('George Michael', 'East Finchley'),
 ('Zendaya', 'Oakland'),
 ('Johnny Depp', 'Birthplace not found'),
 ('Madonna', 'Bay City'),
 ('Cher', 'El Centro'),
 ('Adam Sandler', 'Brooklyn'),
 ('Dua Lipa', 'Westminster'),
 ('Salma Hayek', 'Coatzacoalcos'),
 ('Robert Downey Jr.', 'Manhattan'),
 ("Shaquille O'Neal", 'Newark'),
 ('Bradley Cooper'

### Data Export and Verification
Exports the processed DataFrame to a CSV file and reloads it to verify the contents, ensuring data is saved correctly.

In [None]:
# Destination file for the (artist, birthplace) pairs
output_csv_path = 'artist_birthplaces.csv'

# Tabulate the tuples and write them without the index column
birthplaces_df = pd.DataFrame(artist_birthplaces, columns=['Artist', 'Birthplace'])
birthplaces_df.to_csv(output_csv_path, index=False)

print(f"Data saved to {output_csv_path}. You can now download this file.")

Data saved to artist_birthplaces.csv. You can now download this file.


### Downloading the Birthplace CSV
Downloads `artist_birthplaces.csv` from the Colab runtime to the local machine.

In [None]:
from google.colab import files

# Colab helper: send the CSV written in the previous cell to the browser as a download
files.download("artist_birthplaces.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Installation of the geopy Library
This section installs `geopy`, which is used below to geocode birthplaces into latitude/longitude coordinates.

In [None]:
pip install geopy




### Geocoding Birthplaces
Reloads the exported birthplace CSV, geocodes each birthplace with Nominatim (rate-limited), and saves the table with latitude and longitude columns to a new CSV file.

In [None]:
import pandas as pd
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import time

# Load the (Artist, Birthplace) table written by the export cell
df = pd.read_csv('artist_birthplaces.csv')

# Initialize the geocoder with a user-agent (replace 'my_geocoder' with your application name)
# NOTE(review): "my_geocoder" is still the placeholder value — set a name that identifies this project.
geolocator = Nominatim(user_agent="my_geocoder")

# Wrap the geocode call so consecutive requests are spaced at least one second apart
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

# Define a function to get coordinates
def get_coordinates(city):
    """Geocode a place name into a ``(latitude, longitude)`` pair.

    Args:
        city: Place name to look up with the module-level ``geocode`` callable.

    Returns:
        ``(latitude, longitude)`` of the match, or ``(None, None)`` when
        ``city`` is not a non-empty string (e.g. NaN from a CSV), when no
        match is found, or when the lookup raises an error.
    """
    # Missing values read from CSV arrive as float NaN; never send those to the geocoder.
    if not isinstance(city, str) or not city.strip():
        return None, None
    try:
        location = geocode(city)
        if location:
            return location.latitude, location.longitude
        return None, None
    except Exception as exc:
        # Best effort: report the failure and continue with the next place.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt/SystemExit stop the run.
        print(f"Geocoding failed for {city!r}: {exc}")
        return None, None

# Geocode each distinct birthplace only once: many artists share a birthplace and
# the rate limiter spaces requests one second apart, so repeated lookups are costly.
coordinates_by_place = {
    place: get_coordinates(place)
    for place in df['Birthplace'].dropna().unique()
}

# Map every row to its cached coordinates; missing birthplaces get (None, None)
df['coordinates'] = df['Birthplace'].apply(
    lambda place: coordinates_by_place.get(place, (None, None))
)

# Optional: split the coordinates into two columns for latitude and longitude
df[['latitude', 'longitude']] = pd.DataFrame(df['coordinates'].tolist(), index=df.index)

# Save the updated DataFrame to a new CSV file
df.to_csv('artist_birthplaces_with_coordinates.csv', index=False)

print(df.head())




         Artist    Birthplace                coordinates   latitude  longitude
0  Taylor Swift  West Reading  (40.3337038, -75.9474322)  40.333704 -75.947432
1  XXXTentacion    Plantation  (27.0637262, -82.3651631)  27.063726 -82.365163
2    Alia Bhatt         India   (22.3511148, 78.6677428)  22.351115  78.667743
3   Tina Turner   Brownsville  (25.9024289, -97.4981698)  25.902429 -97.498170
4  Keanu Reeves        Beirut     (33.8959203, 35.47843)  33.895920  35.478430


### Downloading the Geocoded CSV
Downloads `artist_birthplaces_with_coordinates.csv` from the Colab runtime to the local machine.

In [None]:
# Colab helper: send the geocoded CSV to the browser as a download
files.download("artist_birthplaces_with_coordinates.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Loading the 2023 Visit Totals
Loads the per-artist 2023 visit totals from an Excel file; each row stores the name and the count together in one comma-separated column.

In [None]:
import pandas as pd

# Load the visit totals; the output below shows a single combined 'name,2023_visits' column
top_visited = pd.read_excel("/content/names_2023_visits.xlsx")
top_visited

Unnamed: 0,"name,2023_visits"
0,"Taylor Swift,22030018"
1,"XXXTentacion,20104479"
2,"Alia Bhatt,12669342"
3,"Tina Turner,11590403"
4,"Keanu Reeves,9916477"
...,...
4914,"Martine Bijl,1"
4915,"k.d. lang,1"
4916,"apl.de.ap,1"
4917,"Noncho Vodenicharov,1"


### Splitting the Combined Column
Splits the combined `name,2023_visits` column into a separate name column and visit-count column.

In [None]:
# Split on the LAST comma only: the visit count is always the final field, so
# artist names that themselves contain commas stay intact and the result has
# exactly two columns (name, visits).
split_data = top_visited['name,2023_visits'].str.rsplit(',', n=1, expand=True)
if split_data.shape[1] == 2:  # Make sure the split produced exactly 2 columns
    top_visited[['name', '2023views']] = split_data

top_visited.head()

Unnamed: 0,"name,2023_visits"
0,"Taylor Swift,22030018"
1,"XXXTentacion,20104479"
2,"Alia Bhatt,12669342"
3,"Tina Turner,11590403"
4,"Keanu Reeves,9916477"


### Renaming the Split Columns
Keeps the fully populated columns of the split result and renames them to `Artist` and `views`.

In [None]:
# Drop every column that contains a missing value, then label the remaining two columns
df = split_data.dropna(axis=1).rename(columns={0: "Artist", 1: "views"})
print(df.head())

         Artist     views
0  Taylor Swift  22030018
1  XXXTentacion  20104479
2    Alia Bhatt  12669342
3   Tina Turner  11590403
4  Keanu Reeves   9916477


### Restricting to the Top Artists
Keeps only the first 996 rows of the visits table.

In [None]:
# Keep the first 996 rows.
# NOTE(review): 996 presumably matches the number of artists in the page-view data — confirm.
df = df.iloc[:996]

### Reloading the Geocoded Birthplaces
Reloads the CSV of birthplaces with coordinates produced by the geocoding step.

In [None]:
# Reload the geocoded birthplace table (Artist, Birthplace, coordinates, latitude, longitude)
data2 = pd.read_csv("/content/artist_birthplaces_with_coordinates.csv")
data2

Unnamed: 0,Artist,Birthplace,coordinates,latitude,longitude
0,Taylor Swift,West Reading,"(40.3337038, -75.9474322)",40.333704,-75.947432
1,XXXTentacion,Plantation,"(27.0637262, -82.3651631)",27.063726,-82.365163
2,Alia Bhatt,India,"(22.3511148, 78.6677428)",22.351115,78.667743
3,Tina Turner,Brownsville,"(25.9024289, -97.4981698)",25.902429,-97.498170
4,Keanu Reeves,Beirut,"(33.8959203, 35.47843)",33.895920,35.478430
...,...,...,...,...,...
991,Heather Headley,Barataria,"(29.704083500000003, -90.11145959181923)",29.704084,-90.111460
992,Crowded House,Birthplace not found,"(None, None)",,
993,Hwasa,Jeonju,"(35.8237631, 127.1472805)",35.823763,127.147280
994,CeCe Winans,Detroit,"(42.3315509, -83.0466403)",42.331551,-83.046640


### Merging Views with Birthplace Coordinates
Left-joins the 2023 view counts onto the geocoded birthplace table by artist name.

In [None]:
# Single-column frame of the view counts (not used by the merge below)
views2023_df = df["views"].to_frame('views')

# Left join: keep every geocoded artist and attach its 2023 view count where the name matches
data = data2.merge(df[["Artist", 'views']], on='Artist', how='left')

### Exporting the Map Data
Saves the merged table to `mapviews.csv` and downloads it from the Colab runtime.

In [None]:
from google.colab import files
# Write the merged table without the index column, then download it via the Colab helper
data.to_csv("mapviews.csv", index = False)
files.download("mapviews.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Loading the Daily Top-3 Artists
Loads the table of the three most-viewed artists per day from Google Drive.

In [None]:
import pandas as pd
# Load the per-day top-3 table (columns date, top_artists, top_1, top_2, top_3 in the output below).
# Note: this rebinds `data`, which previously held the merged map table.
data = pd.read_csv("/content/drive/MyDrive/top_visits_per_day")
data

Unnamed: 0.1,Unnamed: 0,date,top_artists,top_1,top_2,top_3
0,0,2023-01-01,"['David Byrne', 'Miley Cyrus', 'The Pointer Si...",David Byrne,Miley Cyrus,The Pointer Sisters
1,1,2023-01-02,"['Tanya Tucker', 'The Pointer Sisters', 'Tammy...",Tanya Tucker,The Pointer Sisters,Tammy Wynette
2,2,2023-01-03,"['Whitney Houston', 'Tammy Wynette', 'XXXTenta...",Whitney Houston,Tammy Wynette,XXXTentacion
3,3,2023-01-04,"['XXXTentacion', 'Whitney Houston', 'Tammy Wyn...",XXXTentacion,Whitney Houston,Tammy Wynette
4,4,2023-01-05,"['Shania Twain', 'XXXTentacion', 'Whitney Hous...",Shania Twain,XXXTentacion,Whitney Houston
...,...,...,...,...,...,...
360,360,2023-12-27,"['XXXTentacion', 'Salman Khan', 'Alia Bhatt']",XXXTentacion,Salman Khan,Alia Bhatt
361,361,2023-12-28,"['XXXTentacion', 'Cher', 'Queen Latifah']",XXXTentacion,Cher,Queen Latifah
362,362,2023-12-29,"['XXXTentacion', 'Taylor Swift', 'Cher']",XXXTentacion,Taylor Swift,Cher
363,363,2023-12-30,"['XXXTentacion', 'Tina Turner', 'Taylor Swift']",XXXTentacion,Tina Turner,Taylor Swift


### Unique Daily Number-One Artists
Collects the distinct artists that ranked first on at least one day.

In [None]:
# Distinct artists that appear in the top_1 column
unique_top_1 = data["top_1"].unique()

### Unique Artists Across the Daily Top 3
Collects the distinct artists that appear in any of the `top_1`, `top_2`, or `top_3` columns.

In [None]:
# Flatten the three rank columns into one array and de-duplicate it.
# NOTE(review): ravel('K') flattens in memory-layout order; the output below starts
# with the top_1 names, so the order looks column-wise — confirm if order matters.
unique_artists = pd.unique(data[['top_1', 'top_2', 'top_3']].values.ravel('K'))
unique_artists

array(['David Byrne', 'Tanya Tucker', 'Whitney Houston', 'XXXTentacion',
       'Shania Twain', 'Rod Stewart', 'David Bowie', 'Tammy Wynette',
       'Anna Kendrick', 'Jeff Beck', 'Michael Jackson', 'Shakira',
       'Miley Cyrus', 'David Crosby', 'Paris Hilton', 'Anita Baker',
       'Penny Marshall', 'Austin Butler', 'Vani Jairam', 'Kim Petras',
       'Salma Hayek', 'Rihanna', 'Raquel Welch', 'Bruce Willis',
       'Hayden Panettiere', 'Lewis Capaldi', 'Linda Ronstadt',
       'Olivia Newton-John', 'Sally Field', 'Steven Seagal', 'Ed Sheeran',
       'Selena Gomez', 'Lynyrd Skynyrd', 'Niall Horan', 'Michael Bolton',
       'Stevie Nicks', 'Sarah Polley', 'Taylor Swift', 'Toni Collette',
       'Keanu Reeves', 'Gwyneth Paltrow', 'Adam Sandler', 'Lainey Wilson',
       'Luis Enrique', 'S Club 7', 'Joe Alwyn', 'The Beach Boys',
       'Jon Bon Jovi', 'Steven Yeun', 'Jamie Foxx', 'Frank Ocean',
       'Avicii', 'Mark Selby', 'Jared Leto', 'Carrie Fisher',
       'Katy Perry', 'Alanis Mo

### Installation and Import of SPARQLWrapper Library
This section installs `SPARQLWrapper` to enable querying RDF databases using SPARQL directly from Python.

In [None]:
pip install SPARQLWrapper

Collecting SPARQLWrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Collecting rdflib>=6.1.1 (from SPARQLWrapper)
  Downloading rdflib-7.0.0-py3-none-any.whl (531 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m531.9/531.9 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting isodate<0.7.0,>=0.6.0 (from rdflib>=6.1.1->SPARQLWrapper)
  Downloading isodate-0.6.1-py2.py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.7/41.7 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: isodate, rdflib, SPARQLWrapper
Successfully installed SPARQLWrapper-2.0.0 isodate-0.6.1 rdflib-7.0.0


### Fetching Artist Images from Wikidata
This function uses `SPARQLWrapper` to query Wikidata for the image (property P18) of each artist in a list.

In [None]:
from SPARQLWrapper import SPARQLWrapper, JSON

def get_artist_images(artist_list):
    """Fetch image URLs (Wikidata property P18) for artists matched by English label.

    Args:
        artist_list: Iterable of English artist labels.

    Returns:
        A list of ``(artist_name, image_url)`` tuples, one per result row.
        The query's LIMIT caps the result at 100 rows. An empty input returns
        an empty list without contacting the endpoint.
    """
    names = [str(artist) for artist in artist_list]
    if not names:
        return []

    sparql = SPARQLWrapper("https://query.wikidata.org/sparql")

    # Wikidata labels are language-tagged literals, so every VALUES entry needs the
    # @en tag: a plain "name" literal is a different RDF term and never matches.
    # Backslashes and double quotes are escaped so a name cannot break the literal.
    values_string = " ".join(
        '"{}"@en'.format(name.replace("\\", "\\\\").replace('"', '\\"'))
        for name in names
    )

    # ?name is used instead of ?artistLabel to avoid clashing with the variable
    # the label service would generate for ?artist.
    query = f"""
    SELECT ?artist ?name ?image WHERE {{
      VALUES ?name {{ {values_string} }}
      ?artist rdfs:label ?name;
              wdt:P18 ?image.
    }}
    LIMIT 100
    """
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    images = []
    for result in results["results"]["bindings"]:
        artist_name = result["name"]["value"]
        image_url = result["image"]["value"]
        images.append((artist_name, image_url))
    return images

# Look up images for the unique artists collected above and print each (name, URL) pair
artist_images = get_artist_images(unique_artists)  # Replace with your actual list of unique artists
for artist, url in artist_images:
    print(artist, url)


Debug - SPARQL VALUES Clause: "David Byrne" "Tanya Tucker" "Whitney Houston" "XXXTentacion" "Shania Twain" "Rod Stewart" "David Bowie" "Tammy Wynette" "Anna Kendrick" "Jeff Beck" "Michael Jackson" "Shakira" "Miley Cyrus" "David Crosby" "Paris Hilton" "Anita Baker" "Penny Marshall" "Austin Butler" "Vani Jairam" "Kim Petras" "Salma Hayek" "Rihanna" "Raquel Welch" "Bruce Willis" "Hayden Panettiere" "Lewis Capaldi" "Linda Ronstadt" "Olivia Newton-John" "Sally Field" "Steven Seagal" "Ed Sheeran" "Selena Gomez" "Lynyrd Skynyrd" "Niall Horan" "Michael Bolton" "Stevie Nicks" "Sarah Polley" "Taylor Swift" "Toni Collette" "Keanu Reeves" "Gwyneth Paltrow" "Adam Sandler" "Lainey Wilson" "Luis Enrique" "S Club 7" "Joe Alwyn" "The Beach Boys" "Jon Bon Jovi" "Steven Yeun" "Jamie Foxx" "Frank Ocean" "Avicii" "Mark Selby" "Jared Leto" "Carrie Fisher" "Katy Perry" "Alanis Morissette" "Hannah Waddingham" "Nick Cannon" "Donna Summer" "Tina Turner" "Astrud Gilberto" "Barry Manilow" "BTS" "Ezra Miller" "Gol

### Fetching Artist Images from the Wikipedia API
This function requests the main page image (as a thumbnail URL) of each artist's article from the English Wikipedia API.

In [None]:
import requests

def fetch_main_image_for_artists(artist_list):
    """Fetch the main page image of each artist's English Wikipedia article.

    One ``action=query&prop=pageimages`` request is sent per artist, asking
    for a thumbnail up to 500 px wide.

    Args:
        artist_list: Iterable of article titles (artist names).

    Returns:
        A list with exactly one ``(artist, image_url)`` tuple per input name,
        in input order. ``image_url`` is "No image available" when the page
        has no thumbnail, when the response contains no pages, or when the
        request fails.
    """
    base_url = "https://en.wikipedia.org/w/api.php"
    images = []

    for artist in artist_list:
        params = {
            "action": "query",
            "format": "json",
            "titles": artist,
            "prop": "pageimages",  # Property to fetch images
            "pithumbsize": 500     # Thumbnail size
        }

        try:
            response = requests.get(base_url, params=params, timeout=30)
            response.raise_for_status()
            # Navigate through the JSON to find the pages of the result
            pages = response.json().get('query', {}).get('pages', {})
        except (requests.RequestException, ValueError) as exc:
            # Report the failure but still emit a row for this artist below.
            print(f"Request failed for artist {artist}: {exc}")
            pages = {}

        # Default covers both "page has no thumbnail" and "no pages returned",
        # so every artist yields exactly one row.
        image_url = "No image available"
        for page in pages.values():
            thumbnail = page.get('thumbnail', {}).get('source')
            if thumbnail:
                image_url = thumbnail
                break
        images.append((artist, image_url))

    return images


# Fetch the main images (rebinds artist_images from the Wikidata attempt above) and print each pair
artist_images = fetch_main_image_for_artists(unique_artists)
for artist, image_url in artist_images:
    print(artist, image_url)


David Byrne https://upload.wikimedia.org/wikipedia/commons/thumb/5/54/David_Byrne_San_Diego.jpg/500px-David_Byrne_San_Diego.jpg
Tanya Tucker https://upload.wikimedia.org/wikipedia/commons/thumb/3/3f/TanyaTuckerGraceland02202020.jpeg/500px-TanyaTuckerGraceland02202020.jpeg
Whitney Houston https://upload.wikimedia.org/wikipedia/commons/thumb/5/52/Whitney_Houston_%28cropped3%29.JPEG/500px-Whitney_Houston_%28cropped3%29.JPEG
XXXTentacion https://upload.wikimedia.org/wikipedia/commons/thumb/e/ee/Xxxtentacion_%28cropped%29.jpg/500px-Xxxtentacion_%28cropped%29.jpg
Shania Twain https://upload.wikimedia.org/wikipedia/commons/thumb/7/71/Shania_Twain_March_2020.png/500px-Shania_Twain_March_2020.png
Rod Stewart https://upload.wikimedia.org/wikipedia/commons/c/c0/Rod_Stewart_at_Xcel_Center_DSC_0470_%2814905955253%29_cropped.jpg
David Bowie https://upload.wikimedia.org/wikipedia/commons/e/e8/David-Bowie_Chicago_2002-08-08_photoby_Adam-Bielawski-cropped.jpg
Tammy Wynette https://upload.wikimedia.org/

### Exporting Artist Images
Builds a DataFrame of artists and image URLs, saves it to `artist_images.csv`, and previews the first rows.

In [None]:
# Tabulate the (artist, image URL) pairs; note this rebinds the shared name `df`
df = pd.DataFrame(artist_images, columns=['Artist', 'Image_URL'])

# Persist without the index column
df.to_csv('artist_images.csv', index=False)

df.head()

Unnamed: 0,Artist,Image_URL
0,David Byrne,https://upload.wikimedia.org/wikipedia/commons...
1,Tanya Tucker,https://upload.wikimedia.org/wikipedia/commons...
2,Whitney Houston,https://upload.wikimedia.org/wikipedia/commons...
3,XXXTentacion,https://upload.wikimedia.org/wikipedia/commons...
4,Shania Twain,https://upload.wikimedia.org/wikipedia/commons...


### Reloading the Daily Page-View Data
Reloads the per-article daily page-view table from Google Drive so the full artist list is available again.

In [None]:
import pandas as pd
# Reload the daily page-view table (same file as loaded earlier); `df` was rebound in between
df = pd.read_csv("/content/drive/MyDrive/NUMBER_of_views")
df

Unnamed: 0.1,Unnamed: 0,article,date,views
0,0,Taylor Swift,2023-01-01,29705
1,1,Taylor Swift,2023-01-02,25766
2,2,Taylor Swift,2023-01-03,24682
3,3,Taylor Swift,2023-01-04,21214
4,4,Taylor Swift,2023-01-05,22873
...,...,...,...,...
356922,356922,Devi Sri Prasad,2023-12-27,588
356923,356923,Devi Sri Prasad,2023-12-28,636
356924,356924,Devi Sri Prasad,2023-12-29,706
356925,356925,Devi Sri Prasad,2023-12-30,686


### Fetching Artist Genres Using Requests
This segment queries Wikidata for the music genres of artists, incorporating error handling for robust data fetching.

In [None]:
import requests


# Distinct article titles from the page-view table; these are the artist names to look up
artists_data = df["article"]
unique_artists = artists_data.unique()

def fetch_artist_genre(artist_names):
    """Fetch one genre per artist with direct HTTP requests to the Wikidata SPARQL endpoint.

    One query is sent per name. It matches any entity that has a genre (P136)
    and whose English label equals the name, and takes the first row returned.

    Args:
        artist_names: Iterable of English artist labels.

    Returns:
        A list of ``(artist_name, genre)`` tuples in input order. The genre is
        "Genre not found" when the query returns no usable row and
        "Request failed" when the HTTP request fails or the response cannot
        be decoded.
    """
    wikidata_endpoint = "https://query.wikidata.org/sparql"
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
        'Accept': 'application/sparql-results+json'
    }
    artist_genre_list = []  # This will hold the tuples of (artist_name, genre)

    for artist_name in artist_names:
        # Escape backslashes and double quotes so a name cannot terminate the
        # SPARQL string literal early and break (or alter) the query.
        safe_name = str(artist_name).replace("\\", "\\\\").replace('"', '\\"')
        query = f"""
        SELECT ?artist ?genreLabel WHERE {{
          ?artist wdt:P136 ?genre;  # Genre
                  rdfs:label "{safe_name}"@en.
          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
        }}
        LIMIT 1
        """
        try:
            response = requests.get(
                wikidata_endpoint,
                headers=headers,
                params={'query': query, 'format': 'json'},
                timeout=60,
            )
            response.raise_for_status()  # Raises a HTTPError for bad responses
            bindings = response.json()["results"]["bindings"]
        except (requests.RequestException, ValueError, KeyError) as e:
            # ValueError/KeyError cover bodies that are not the expected SPARQL JSON.
            print(f"Request failed for artist {artist_name}: {e}")
            artist_genre_list.append((artist_name, "Request failed"))
            continue

        if bindings:
            genre = bindings[0].get("genreLabel", {}).get("value", "Genre not found")
            artist_genre_list.append((artist_name, genre))
        else:
            artist_genre_list.append((artist_name, "Genre not found"))

    return artist_genre_list

# Fetch a genre for every unique artist (one HTTP request per artist)
artist_genres_data = fetch_artist_genre(unique_artists)

# Tabulate the (artist, genre) tuples
artist_genres_df = pd.DataFrame(artist_genres_data, columns=['Artist', 'Genre'])
print(artist_genres_df)


              Artist             Genre
0       Taylor Swift         pop music
1       XXXTentacion         punk rock
2         Alia Bhatt   Genre not found
3        Tina Turner     rock and roll
4       Keanu Reeves     hip hop music
..               ...               ...
991  Heather Headley        soul music
992    Crowded House  alternative rock
993            Hwasa              jazz
994      CeCe Winans      gospel music
995  Devi Sri Prasad       filmi music

[996 rows x 2 columns]


### Handling Missing Data
Counts and handles missing genre information by replacing not found entries with a default label, enhancing data integrity.

In [None]:
# Count the artists whose lookup produced the 'Genre not found' sentinel
genre_not_found_count = artist_genres_df['Genre'].eq('Genre not found').sum()

print(f"Number of 'Genre not found' entries: {genre_not_found_count}")

Number of 'Genre not found' entries: 263


### Handling Missing Data
Counts and handles missing genre information by replacing not found entries with a default label, enhancing data integrity.

In [None]:
# Replace the 'Genre not found' sentinel with the label 'Unspecified' (exact-match replacement)
artist_genres_df['Genre'] = artist_genres_df['Genre'].replace('Genre not found', 'Unspecified')

# Displaying the updated DataFrame to verify the change
print(artist_genres_df)

              Artist             Genre
0       Taylor Swift         pop music
1       XXXTentacion         punk rock
2         Alia Bhatt       Unspecified
3        Tina Turner     rock and roll
4       Keanu Reeves     hip hop music
..               ...               ...
991  Heather Headley        soul music
992    Crowded House  alternative rock
993            Hwasa              jazz
994      CeCe Winans      gospel music
995  Devi Sri Prasad       filmi music

[996 rows x 2 columns]


### Data Export and Verification
Exports the processed DataFrame to a CSV file and reloads it to verify the contents, ensuring data is saved correctly.

In [None]:
# Write the genre table as CSV without the index column (the file name has no .csv extension)
artist_genres_df.to_csv("artists_genres", index=False)

### Data Export and Verification
Exports the processed DataFrame to a CSV file and reloads it to verify the contents, ensuring data is saved correctly.

In [None]:
# Reload the file just written to verify its contents; this rebinds `df` to the genre table
df = pd.read_csv("/content/artists_genres")
df

Unnamed: 0,Artist,Genre
0,Taylor Swift,pop music
1,XXXTentacion,punk rock
2,Alia Bhatt,Unspecified
3,Tina Turner,rock and roll
4,Keanu Reeves,hip hop music
...,...,...
991,Heather Headley,soul music
992,Crowded House,alternative rock
993,Hwasa,jazz
994,CeCe Winans,gospel music


### Data Import
This section imports necessary libraries and loads data from an Excel file.

In [None]:
import pandas as pd


### Data Import
This section imports necessary libraries and loads data from an Excel file.

In [None]:
# Load the per-artist 2023 visit totals (one combined 'name,2023_visits' column)
top_visited = pd.read_excel("/content/top_visited_2023.xlsx")

# Split on the LAST comma only: the visit count is always the final field, so
# artist names that themselves contain commas stay intact and the result has
# exactly two columns (name, visits).
split_data = top_visited['name,2023_visits'].str.rsplit(',', n=1, expand=True)
if split_data.shape[1] == 2:  # Make sure the split produced exactly 2 columns
    top_visited[['name', '2023views']] = split_data

top_visited.head()

Unnamed: 0,"name,2023_visits"
0,"Taylor Swift,22030018"
1,"XXXTentacion,20104479"
2,"Alia Bhatt,12669342"
3,"Tina Turner,11590403"
4,"Keanu Reeves,9916477"


### Dropping Incomplete Columns
Keeps only the columns of the split result that contain no missing values.

In [None]:
# Drop every column of the split result that contains a missing value
df = split_data.dropna(axis=1)
print(df.head())

              0         1
0  Taylor Swift  22030018
1  XXXTentacion  20104479
2    Alia Bhatt  12669342
3   Tina Turner  11590403
4  Keanu Reeves   9916477


### Renaming the Columns
Renames the two remaining columns to `name` and `views2023`.

In [None]:
# Give the positional columns 0 and 1 descriptive names
df = df.rename(columns={0 : "name",
                   1: "views2023"})

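### Fetching Daily Page Views from the Wikimedia API
For every artist name, this section requests the daily 2023 page views of the corresponding English Wikipedia article from the Wikimedia REST API and collects them in one DataFrame.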

In [None]:
import requests
import pandas as pd
from urllib.parse import quote

# Base request parameters
domain = 'en.wikipedia'
access = 'all-access'
agent = 'user'
start_date = '202301'  # YYYYMM; the day "01" is appended when the URL is built
end_date = '202312'  # YYYYMM; the day "31" is appended when the URL is built
headers = {
    'User-Agent': 'MyDataVisualizationProject/1.0 (slykku2318@gmail.com)'
}

# The articles to query
names = df["name"]

# One frame per article; they are concatenated once after the loop instead of
# growing a DataFrame on every iteration (which copies all rows each time).
article_frames = []

# Build the URL for each name and send the request
for article in names:
    # The title is part of the URL path: use underscores for spaces and
    # percent-encode everything else (including "/"), so titles such as
    # "AC/DC" or names with non-ASCII characters resolve to the right article.
    title = quote(str(article).replace(' ', '_'), safe='')
    url = f'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/{domain}/{access}/{agent}/{title}/daily/{start_date}01/{end_date}31'

    try:
        response = requests.get(url, headers=headers, timeout=30)
        data = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Network failure or a non-JSON body: report it and move on to the next article.
        print(f"Data not found or error in request for article: {article} ({exc})")
        continue

    # Process the data for this article, if available
    if 'items' in data:
        views_data = [{'article': article, 'date': item['timestamp'], 'views': item['views']} for item in data['items']]
        # Use a dedicated name so the `df` that supplies the article names is not overwritten.
        article_frames.append(pd.DataFrame(views_data))
    else:
        print(f"Data not found or error in request for article: {article}")

if article_frames:
    all_views_data = pd.concat(article_frames, ignore_index=True)
    # Convert the YYYYMMDDHH timestamps into proper datetimes (once, for all rows)
    all_views_data['date'] = pd.to_datetime(all_views_data['date'], format='%Y%m%d%H')
else:
    all_views_data = pd.DataFrame()

# Show the first collected rows
print(all_views_data.head())


Data not found or error in request for article: AC/DC
Data not found or error in request for article: Michaela JaÃ© Rodriguez
Data not found or error in request for article: KÃ¤Ã¤rijÃ¤
        article       date  views
0  Taylor Swift 2023-01-01  29705
1  Taylor Swift 2023-01-02  25766
2  Taylor Swift 2023-01-03  24682
3  Taylor Swift 2023-01-04  21214
4  Taylor Swift 2023-01-05  22873


### Data Analysis
This function identifies the top 3 articles by views for each day and formats the results.

In [None]:
def top_3_artists(group):
    """Return the titles of the three most-viewed articles in ``group``.

    ``group`` is a DataFrame with 'views' and 'article' columns. Rows are
    ordered by 'views' from highest to lowest and the first three 'article'
    values are returned as a plain list (shorter when the group has fewer
    than three rows).
    """
    ranked = group.sort_values(by='views', ascending=False)
    return ranked['article'].iloc[:3].tolist()

# For every date, collect the three most-viewed articles as a list
top_artists_by_day = (
    all_views_data
    .groupby('date')
    .apply(top_3_artists)
    .reset_index(name='top_artists')
)

# Spread each per-day list over three rank columns, aligned on the row index
top_artists_by_day[['top_1', 'top_2', 'top_3']] = pd.DataFrame(
    list(top_artists_by_day['top_artists']),
    index=top_artists_by_day.index,
)

# Show the daily ranking
print(top_artists_by_day.loc[:, ['date', 'top_1', 'top_2', 'top_3']])

          date            top_1                top_2                top_3
0   2023-01-01      David Byrne          Miley Cyrus  The Pointer Sisters
1   2023-01-02     Tanya Tucker  The Pointer Sisters        Tammy Wynette
2   2023-01-03  Whitney Houston        Tammy Wynette         XXXTentacion
3   2023-01-04     XXXTentacion      Whitney Houston        Tammy Wynette
4   2023-01-05     Shania Twain         XXXTentacion      Whitney Houston
..         ...              ...                  ...                  ...
360 2023-12-27     XXXTentacion          Salman Khan           Alia Bhatt
361 2023-12-28     XXXTentacion                 Cher        Queen Latifah
362 2023-12-29     XXXTentacion         Taylor Swift                 Cher
363 2023-12-30     XXXTentacion          Tina Turner         Taylor Swift
364 2023-12-31     XXXTentacion          Tina Turner         Taylor Swift

[365 rows x 4 columns]


### Output and Export
Here, the processed data is saved to CSV files and prepared for download, suitable for offline analysis or sharing.

In [None]:
# Write the date and the three rank columns (plus the row index) to "daily_visits"
top_artists_by_day.loc[:, ['date', 'top_1', 'top_2', 'top_3']].to_csv(path_or_buf="daily_visits")

### Output and Export
Here, the processed data is saved to CSV files and prepared for download, suitable for offline analysis or sharing.

In [None]:
# Keep only the date and the three rank columns for export
daily_views = top_artists_by_day.loc[:, ['date', 'top_1', 'top_2', 'top_3']]

### Saving the Daily Ranking to CSV
This cell writes the daily top-3 table to `daily_views.csv` (without the index column) so that it can be downloaded.

In [None]:
from google.colab import files
# Save the daily ranking without the row index
daily_views.to_csv(path_or_buf='daily_views.csv', index=False)

### Output and Export
Here, the processed data is saved to CSV files and prepared for download, suitable for offline analysis or sharing.

In [None]:
# Hand the saved CSV to Colab's `files` helper so it is offered as a download
files.download("daily_views.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### Output and Export
Here, the processed data is saved to CSV files and prepared for download, suitable for offline analysis or sharing.

In [None]:
# Full daily ranking, including the list column and the row index
top_artists_by_day.to_csv(path_or_buf="top_visits_per_day")

# Raw per-article daily view counts
all_views_data.to_csv(path_or_buf="NUMBER_of_views")

### Additional Wikipedia Data Retrieval
This section uses the Wikipedia API to fetch introductory paragraphs for top articles, adding context to the analysis.

In [None]:
pip install wikipedia-api


Collecting wikipedia-api
  Downloading Wikipedia_API-0.6.0-py3-none-any.whl (14 kB)
Installing collected packages: wikipedia-api
Successfully installed wikipedia-api-0.6.0


### Fetching Wikipedia Introductions
This cell loads the saved daily ranking, fetches the Wikipedia summary of each day's top artist, and writes the result to `artists_introduction.csv`.

In [None]:
import wikipediaapi
import pandas as pd

# Load your dataset
# Read the daily ranking that was exported earlier from its Drive location
df = pd.read_csv(filepath_or_buffer="/content/drive/MyDrive/top_visits_per_day")

# English-language Wikipedia client used by the lookups below
wiki_wiki = wikipediaapi.Wikipedia(language='en', user_agent='YourAppName/1.0 (your-email@example.com)')

def get_wikipedia_intro(page_title):
    """Return the summary of the Wikipedia page called ``page_title``.

    The page is looked up through the module-level ``wiki_wiki`` client.
    When the page does not exist, the literal string "Page not found" is
    returned instead of a summary.
    """
    article = wiki_wiki.page(page_title)
    return article.summary if article.exists() else "Page not found"

# Apply the function to each artist in the "top_1" column
df['Introduction'] = df['top_1'].apply(get_wikipedia_intro)

# Save the updated dataframe with introductions
df.to_csv("artists_introduction.csv", index=False)

# If you want to display the first few rows to verify
print(df.head())


   Unnamed: 0        date                                        top_artists  \
0           0  2023-01-01  ['David Byrne', 'Miley Cyrus', 'The Pointer Si...   
1           1  2023-01-02  ['Tanya Tucker', 'The Pointer Sisters', 'Tammy...   
2           2  2023-01-03  ['Whitney Houston', 'Tammy Wynette', 'XXXTenta...   
3           3  2023-01-04  ['XXXTentacion', 'Whitney Houston', 'Tammy Wyn...   
4           4  2023-01-05  ['Shania Twain', 'XXXTentacion', 'Whitney Hous...   

             top_1                top_2                top_3  \
0      David Byrne          Miley Cyrus  The Pointer Sisters   
1     Tanya Tucker  The Pointer Sisters        Tammy Wynette   
2  Whitney Houston        Tammy Wynette         XXXTentacion   
3     XXXTentacion      Whitney Houston        Tammy Wynette   
4     Shania Twain         XXXTentacion      Whitney Houston   

                                        Introduction  
0  David Byrne (; born 14 May 1952) is a Scottish...  
1  Tanya Denise Tucker (