<img src="Maslow's_Hierarchy_of_Needs.svg.png" height = 600px>

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import urllib.parse # combine URL components into URL string
import wikipediaapi # query wikipedia through api

from pytrends.request import TrendReq # Google Trends API

from statsmodels.tsa.seasonal import STL # seasonal decompositions
import statsmodels.tsa.stattools as smt


import pickle #  to serialize and deserialize objects in Python
from requests.exceptions import RequestException
import requests
from json.decoder import JSONDecodeError
from scipy import signal
import warnings
from urllib import request
from bs4 import BeautifulSoup
import json
import pytrends
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import wikipediaapi
import pprint as pp

## Create a dataframe of pageviews per topic from a .txt using Wikipedia API

In [176]:
def create_dataframe(name_file):
    """
    Load a list of topics from a text file.

    The file is read as tab-separated text without a header row; its content
    ends up in a single column named 'Topics'.

    param: name_file: path of the text file to read
    return: dataframe with one 'Topics' column, duplicate rows removed
            (the original row labels of the kept rows are preserved)
    """
    topics = pd.read_csv(name_file, sep="\t", header=None, names=['Topics'])
    return topics.drop_duplicates()

def remove_space(df):
    """
    Turn every topic into a Wikipedia-style title token.

    Leading/trailing whitespace is stripped and each remaining space is
    replaced by an underscore. The input frame is not modified.

    param: df: dataframe with a 'Topics' column of strings
    return: new Series with the normalised topics
    """
    def _as_title_token(topic):
        trimmed = topic.strip()
        return trimmed.replace(' ', '_')

    return df['Topics'].apply(_as_title_token)

def parse_topics_into_df(df, lan, start_time, end_time):
    """
    Add a 'url' column holding the monthly pageviews request URL of each topic.

    The topic is percent-encoded before it is inserted into the URL path, so
    titles containing non-ASCII letters, '/', quotes, etc. still produce a
    URL that can be requested. Titles made only of unreserved characters
    (letters, digits, '_', '-', '.', '~') are left unchanged.
    NOTE: topics that are already percent-encoded would be encoded twice.

    param: df: dataframe with a 'Topics' column (modified in place)
    param: lan: language code used as the Wikipedia sub-domain (e.g. 'en')
    param: start_time: first day of the period, formatted YYYYMMDD
    param: end_time: last day of the period, formatted YYYYMMDD
    return: the same dataframe, with the 'url' column added
    """
    base = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article'
    prefix = f'{base}/{lan}.wikipedia.org/all-access/all-agents'
    suffix = f'monthly/{start_time}/{end_time}'

    # Build the whole column at once: avoids creating a float column of zeros
    # and then overwriting it with strings one row at a time.
    urls = []
    for topic in df['Topics']:
        encoded_topic = urllib.parse.quote(str(topic), safe='')
        urls.append(f'{prefix}/{encoded_topic}/{suffix}')
    df['url'] = urls

    return df

def create_dataframe_timestamp(starting_date, ending_date):
    """
    Build a dataframe of month-start timestamps.

    param: starting_date: first date of the range
    param: ending_date: last date of the range
    return: dataframe with a single 'Timestamp' column containing every
            month start between the two dates
    """
    month_starts = pd.date_range(start=starting_date, end=ending_date, freq='MS')
    return pd.DataFrame({'Timestamp': month_starts})

def fetch_and_parse_url(url):
    """
    Check whether a URL can be opened and read.

    Any failure (HTTP error such as 404, malformed URL, connection problem,
    ...) is reported as False instead of being raised, so callers can use
    the result to skip pages that are not available.

    param: url: URL to probe
    return: True if the URL was opened and its body read, False otherwise
    """
    try:
        # The context manager closes the connection even when read() fails.
        with request.urlopen(url) as response:
            response.read()
    except Exception:
        # Deliberate best-effort: every kind of failure means "not usable".
        return False
    return True

def get_pageviews_wiki(url):
    """
    Download a pageviews response and return its view counts.

    The response body is decoded directly as JSON. It is not passed through
    an HTML parser, which could alter text containing '&' or '<'.

    param: url: request URL whose JSON response has an 'items' list of
                records with a 'views' field
    return: Series named 'views' with one entry per item, in response order
    raises: whatever urlopen raises when the URL cannot be fetched,
            ValueError on invalid JSON, KeyError when 'items' or 'views'
            is missing
    """
    # Close the connection as soon as the body has been read.
    with request.urlopen(url) as response:
        payload = json.loads(response.read())
    items = pd.DataFrame(payload['items'])
    return items['views']

def _collect_pageviews(df, fill_missing, start_date, end_date):
    """Shared implementation of scrape_pageviews / scrape_pageviews_v2."""
    timestamps = create_dataframe_timestamp(start_date, end_date)[['Timestamp']]

    # Collect every column first and concatenate once at the end, instead of
    # re-building the whole frame for each topic.
    columns = [timestamps]
    for _, row in df.iterrows():
        topic = row['Topics']
        url = row['url']
        # NOTE: a reachable URL is downloaded twice (probe, then parse).
        if fetch_and_parse_url(url):
            columns.append(get_pageviews_wiki(url).to_frame(name=topic))
        elif fill_missing:
            columns.append(pd.DataFrame({topic: np.zeros(len(timestamps))}))

    return pd.concat(columns, axis=1)


def scrape_pageviews(df, start_date='2019-01-01', end_date='2020-07-31'):
    """
    Download the pageviews of every topic whose URL can be fetched.

    Topics whose URL cannot be fetched are left out of the result.
    Views are placed next to the timestamps by position, not by date.

    param: df: dataframe with 'Topics' and 'url' columns
    param: start_date: first date of the 'Timestamp' column
    param: end_date: last date of the 'Timestamp' column
    return: dataframe with a 'Timestamp' column followed by one column of
            views per reachable topic
    """
    return _collect_pageviews(df, False, start_date, end_date)


def scrape_pageviews_v2(df, start_date='2019-01-01', end_date='2020-07-31'):
    """
    Download the pageviews of every topic, keeping unreachable ones as zeros.

    Same as scrape_pageviews, except that a topic whose URL cannot be
    fetched still gets a column, filled with zeros.

    param: df: dataframe with 'Topics' and 'url' columns
    param: start_date: first date of the 'Timestamp' column
    param: end_date: last date of the 'Timestamp' column
    return: dataframe with a 'Timestamp' column followed by one column per
            topic
    """
    return _collect_pageviews(df, True, start_date, end_date)

def create_dataframe_pageviews(name_file, lan, start_time = '20190101', end_time = '20200731'):
    """
    Read the topics of a text file and download their pageviews.

    param: name_file: path of the text file listing the topics
    param: lan: Wikipedia language code used in the request URLs
    param: start_time: beginning of the requested period (YYYYMMDD); only
                       used to build the request URLs
    param: end_time: end of the requested period (YYYYMMDD); only used to
                     build the request URLs
    return: tuple (pageviews dataframe with missing values replaced by 0,
            topics dataframe with the normalised 'Topics' and 'url' columns)
    """
    topics = create_dataframe(name_file)
    topics['Topics'] = remove_space(topics)
    topics = parse_topics_into_df(topics, lan, start_time, end_time)

    pageviews = scrape_pageviews(topics).fillna(0)
    return pageviews, topics

def create_dataframe_pageviews_v2(df_topic_lan, lan, start_time = '20190101', end_time = '20200731'):
    """
    Download the pageviews of the topics stored in one language column.

    param: df_topic_lan: dataframe with a column named after `lan` holding
                         the topics; that column is overwritten in place
                         with the normalised (underscore-joined) topics
    param: lan: Wikipedia language code, also the name of the topic column
    param: start_time: beginning of the requested period (YYYYMMDD); only
                       used to build the request URLs
    param: end_time: end of the requested period (YYYYMMDD); only used to
                     build the request URLs
    return: tuple (pageviews dataframe with missing values replaced by 0,
            the updated df_topic_lan)
    """
    column = f'{lan}'

    topics = pd.DataFrame({'Topics': df_topic_lan[column]})
    topics['Topics'] = remove_space(topics)
    topics = parse_topics_into_df(topics, lan, start_time, end_time)

    pageviews = scrape_pageviews_v2(topics).fillna(0)

    df_topic_lan[column] = topics['Topics']
    return pageviews, df_topic_lan


### Get translated topics found on Wikidata

In [170]:
def _escape_sparql_literal(text):
    """Escape a value so it can sit inside a double-quoted SPARQL string."""
    escaped = str(text).replace('\\', '\\\\').replace('"', '\\"')
    return escaped.replace('\n', '\\n').replace('\r', '\\r')


def _lookup_wikidata_label(label, label_language, display_language, timeout=30):
    """Find the item labelled `label` in `label_language`; return its label in `display_language`."""
    # Endpoint URL for the Wikidata Query Service
    url = "https://query.wikidata.org/sparql"

    # Escaping keeps labels containing quotes or backslashes from breaking
    # (or altering) the query.
    safe_label = _escape_sparql_literal(label)
    query = f'''
    SELECT ?item ?itemLabel WHERE {{
      ?item rdfs:label "{safe_label}"@{label_language}.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],{display_language}". }}
    }}
    LIMIT 1
    '''

    # NOTE(review): the User-Agent still carries placeholder contact details.
    headers = {
        'User-Agent': 'MyBot/0.1 (myemail@example.com)',
        'Accept': 'application/sparql-results+json'
    }

    # A timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(
        url,
        headers=headers,
        params={'query': query, 'format': 'json'},
        timeout=timeout,
    )

    # Error statuses (4xx/5xx) are raised to the caller.
    response.raise_for_status()
    if response.status_code != 200:
        # Successful but unexpected status: report "no label" rather than
        # implicitly returning None.
        return '_'

    results = response.json()['results']['bindings']
    if not results:
        return '_'  # No label found for this language
    return results[0]['itemLabel']['value']


def get_label_in_language(english_label, target_language):
    """
    Translate an English Wikidata label into another language.

    param: english_label: exact English label of the Wikidata item
    param: target_language: language code of the wanted label
    return: label of the first matching item in the target language, or
            '_' when no item matches
    raises: requests.HTTPError on a 4xx/5xx response, and other requests
            exceptions (including a timeout) on network failure
    """
    return _lookup_wikidata_label(english_label, 'en', target_language)


def get_label_in_english(label, source_language):
    """
    Translate a Wikidata label from a source language into English.

    param: label: exact label of the Wikidata item in the source language
    param: source_language: language code of `label`
    return: English label of the first matching item, or '_' when no item
            matches
    raises: requests.HTTPError on a 4xx/5xx response, and other requests
            exceptions (including a timeout) on network failure
    """
    return _lookup_wikidata_label(label, source_language, 'en')
    
# Manual check of get_label_in_english: sends a live request to the Wikidata query service.
print(get_label_in_english('estetica', 'it'))

def change_Q(name):
    """
    Replace a bare Wikidata identifier by the "no label" marker.

    Only strings shaped like an identifier ('Q' followed by ASCII digits
    only, e.g. 'Q42') are replaced. Real labels that merely start with the
    letter Q (e.g. 'Qualità') are kept.

    param: name: label string to inspect
    return: '_' if `name` is a bare identifier, otherwise `name` unchanged
    """
    number = name[1:]
    looks_like_identifier = name.startswith('Q') and number.isascii() and number.isdigit()
    return '_' if looks_like_identifier else name

def capitalize_first_letter(value):
    """
    Upper-case the first character of a value and lower-case the rest.

    param: value: any object; it is converted with str() first
    return: the re-cased string, or '' when the string form is empty
    """
    text = str(value)
    if not text:
        return ''
    head, tail = text[0], text[1:]
    return head.upper() + tail.lower()

def translate_topics(df_topic, lang):
    """
    Translate every topic into `lang` through Wikidata labels.

    Side effects on `df_topic` (kept for existing callers): the 'Topics'
    column is lower-cased in place, and a column named after `lang` is
    added, holding the raw translation or NaN when none was found.

    Underscores in a topic are turned back into spaces for the lookup only,
    because Wikidata labels are written with spaces; the 'Topics' column
    itself keeps its underscores.

    param: df_topic: dataframe with a 'Topics' column
    param: lang: language code of the wanted translation
    return: Series named after `lang` with the translated topics, first
            letter upper-cased and the rest lower-cased; topics without a
            translation are dropped
    """
    column = f'{lang}'
    df_topic['Topics'] = df_topic['Topics'].str.lower()

    labels = df_topic['Topics'].apply(
        lambda topic: get_label_in_language(str(topic).replace('_', ' '), lang)
    )
    # A bare Q-identifier means the item has no label in this language.
    labels = labels.apply(change_Q)
    df_topic[column] = labels.replace('_', np.nan)

    translated = df_topic[column].dropna()
    return translated.apply(capitalize_first_letter)


aesthetics


### Get pageviews from the Wikimedia REST API for a specific country (still not working)

In [None]:
def get_pageviews_for_country(language, project, country, topic, start_date, end_date):
    """
    Request pageviews data and return the response items as a dataframe.

    The request URL is the per-article base URL followed by
    '{language}.{project}.org/all-access/all-agents'; the topic is appended
    only when `country` is truthy. The dates, access, agent, granularity and
    country are sent as query-string parameters.

    NOTE(review): the section header marks this function as not working.
    Presumably the endpoint expects the article, granularity and dates as
    path segments and has no 'geotarget' parameter - confirm against the
    Wikimedia REST API documentation before relying on the result.

    param: language: language code used in the host part of the URL
    param: project: project name used in the host part of the URL
    param: country: value sent as 'geotarget'; also decides whether the
                    topic is part of the URL
    param: topic: article title appended to the URL when `country` is truthy
    param: start_date: value sent as the 'start' parameter
    param: end_date: value sent as the 'end' parameter
    return: dataframe built from the 'items' of the JSON response, or an
            empty dataframe when the response is empty, is not valid JSON,
            or the request fails
    """

    base_url = "https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article"
    
    # If a country is specified, include it in the endpoint
    # NOTE(review): the branch taken when a country is given adds the topic,
    # not the country, to the path; the other branch omits the topic.
    if country:
        endpoint = f"{language}.{project}.org/all-access/all-agents/{topic}/"
    else:
        endpoint = f"{language}.{project}.org/all-access/all-agents"

    # Construct the API URL
    url = f"{base_url}/{endpoint}"

    headers = {
        'User-Agent': 'MyBot/0.1 (myemail@example.com)',
        'Accept': 'application/json'
    }

    # Set parameters (sent as a query string by requests.get below)
    params = {
        "start": start_date,
        "end": end_date,
        "access": "all-access",
        "agent": "all-agents",
        "granularity": "daily",
        "geotarget": country,
    }

    try:
        # Make the API request
        response = requests.get(url, params=params, headers=headers)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)

        # Check if the response is not empty
        if response.text:
            data = response.json()
            df = pd.DataFrame(data["items"])
            return df
        else:
            print("Empty response.")
            return pd.DataFrame()  # Return an empty DataFrame in case of an empty response

    except JSONDecodeError as e:
        print(f"JSON Decode Error: {e}")
        return pd.DataFrame()  # Return an empty DataFrame in case of a JSONDecodeError
    except RequestException as e:
        print(f"Request Exception: {e}")
        print(f"URL: {url}")  # Print the URL for debugging
        return pd.DataFrame()  # Return an empty DataFrame in case of a RequestException


### Using the Google Trends API

In [None]:
def get_Trends(topic, country, timeframe='2019-01-01 2020-07-31'):
    """
    Get the Google Trends interest over time of a topic in one country.

    Every step that talks to Google Trends is best-effort: on any failure
    the error is printed and an empty dataframe is returned.

    param: topic: search term to look up
    param: country: geo code passed to Google Trends
    param: timeframe: period to query, as 'YYYY-MM-DD YYYY-MM-DD'
    return: dataframe returned by interest_over_time(), or an empty
            dataframe when the request fails
    """
    try:
        # Named trends_client so the imported `pytrends` module is not shadowed.
        trends_client = TrendReq(hl='en-US', tz=10)

        # Build payload with country geotarget
        trends_client.build_payload([topic], cat=0, timeframe=timeframe, geo=country, gprop='')

        # Get interest over time
        return trends_client.interest_over_time()
    except Exception as e:
        print(f"An error occurred: {e}")
        return pd.DataFrame()

## Get a .csv file containing all languages for a specific topic

In [151]:
def get_gen_df_topics(name_file, csv_name, languages=('it', 'cs', 'ro', 'sv', 'fi', 'da', 'sq')):
    '''
    Build a table of topics together with their translations.

    Default languages:
    'it': 'Italian','cs': 'Czech','ro': 'Romanian','sv': 'Swedish','fi': 'Finnish','da': 'Danish','sq': 'Albanian'

    Only topics that have a translation in every requested language are
    kept (inner join).

    param: name_file: path of the text file listing the English topics
    param: csv_name: unused, kept so existing calls keep working
    param: languages: language codes to translate the topics into
    return: dataframe with a 'Topics' column (as left by translate_topics)
            and one column per language, re-indexed from 0
    '''
    # Only the topic list is needed here, so it is read and normalised
    # directly. No English pageviews are downloaded, since they were never used.
    df_topic = create_dataframe(name_file)
    df_topic['Topics'] = remove_space(df_topic)

    translations = [translate_topics(df_topic, lang) for lang in languages]

    # Read 'Topics' after translating: translate_topics rewrites that column.
    frames = [df_topic['Topics']] + translations
    df_gen = pd.concat(frames, axis=1, join='inner')

    return df_gen.reset_index(drop=True)

In [192]:
def get_mean_pageviews(df):
    """
    Fill topics that have no pageviews at all with the average of the others.

    For every row, the mean of the non-zero values is computed and rounded
    to an integer. Each column made only of zeros is replaced by these row
    means. A row without any non-zero value gets a mean of 0 instead of
    raising an error.

    param: df: pageviews dataframe whose first column is 'Timestamp' and
               whose other columns hold the views (not modified)
    return: new dataframe with the same columns, all-zero columns replaced
    """
    timestamps = df['Timestamp']
    # Work on a copy so that the caller's frame is never written to.
    views = df.iloc[:, 1:].copy()

    # Columns in which every value equals 0
    is_all_zero = (views == 0).all()

    # Row-wise mean of the non-zero values; NaN (no such value) becomes 0
    row_means = views[views != 0].mean(axis=1).fillna(0).round().astype(int)

    for column in is_all_zero[is_all_zero].index:
        views[column] = row_means

    views.insert(0, 'Timestamp', timestamps)
    return views

In [193]:
def get_gen_df_pageviews(df, name_file, languages=('it', 'cs', 'ro', 'sv', 'fi', 'da', 'sq')):
    """
    Download and save the pageviews of the translated topics, per language.

    For each language, the pageviews of the topics in that column are
    downloaded, topics without any views are filled by get_mean_pageviews,
    and the result is written to
    'Wiki-pageviews/{name_file}/{name_file}_pageviews_{language}.csv'.
    The output folder is created when it does not exist yet.

    param: df: dataframe with one column of topics per language code
    param: name_file: name used for the output folder and file prefix
    param: languages: language codes (column names of `df`) to process
    return: None
    """
    import os

    output_dir = f'Wiki-pageviews/{name_file}'
    os.makedirs(output_dir, exist_ok=True)

    for lang in languages:
        # to_frame() gives the helper its own frame to overwrite.
        lang_topics = df[lang].to_frame()
        df_pageviews, _ = create_dataframe_pageviews_v2(lang_topics, lang, '20190101', '20200731')
        df_pageviews = get_mean_pageviews(df_pageviews)
        df_pageviews.to_csv(f'{output_dir}/{name_file}_pageviews_{lang}.csv', index=False)
        print(f'{lang} done and its pageviews shape is {df_pageviews.shape}')

# NOTE(review): `df_gen` is not assigned at module level in any cell shown here, so
# this only runs if it was defined elsewhere in the kernel. Presumably it is the
# frame returned by get_gen_df_topics(...) for the self-actualization topic list -
# confirm and define it explicitly before this call.
print(df_gen.shape)
get_gen_df_pageviews(df_gen, 'self_actualization')

(54, 8)
it done and its pageviews shape is (19, 55)
cs done and its pageviews shape is (19, 55)
ro done and its pageviews shape is (19, 55)
sv done and its pageviews shape is (19, 55)
fi done and its pageviews shape is (19, 55)
da done and its pageviews shape is (19, 55)
sq done and its pageviews shape is (19, 55)


## Write a .txt file of topics for each country 

In [None]:
def write_topics_to_csv(name_file, csv_name):
    """
    Write the translated topics to one text file per language.

    A file 'Wiki-pageviews/{csv_name}/{csv_name}_{language}.txt' is written
    for every language column returned by get_gen_df_topics. Taking the
    languages from that frame avoids asking for a language that was never
    translated. The output folder is created when it does not exist yet.

    param: name_file: path of the text file listing the English topics
    param: csv_name: name used for the output folder and file prefix
    return: None
    """
    import os

    df_gen = get_gen_df_topics(name_file, csv_name)

    output_dir = f'Wiki-pageviews/{csv_name}'
    os.makedirs(output_dir, exist_ok=True)

    for lang in df_gen.columns:
        if lang == 'Topics':
            continue  # the English topics are not a translation
        df_gen[lang].to_csv(f'{output_dir}/{csv_name}_{lang}.txt', index=False)

# Write the per-language topic files for the self-actualization topic list.
write_topics_to_csv('Wiki-pageviews/self_actualization.txt', 'self_actualization')

### List all valid topics

In [None]:
def list_valid_topics(df_pageviews):
    """
    List the topics of a pageviews dataframe in a readable form.

    Underscores become spaces and apostrophes are removed. The first column
    (the timestamps) is left out of the result.

    param: df_pageviews: dataframe whose columns after the first are topics
    return: list of cleaned topic names
    """
    readable = []
    for column in df_pageviews.columns.tolist():
        without_underscores = column.replace('_', ' ')
        readable.append(without_underscores.replace("'", ""))
    return readable[1:]

# Write the list of topics to a .txt file, one topic per line.
# NOTE(review): `lista` is only a local variable inside list_valid_topics; no cell
# shown here assigns it at module level. Presumably it should first be set with
# `lista = list_valid_topics(<pageviews dataframe>)` - confirm which frame is meant.
with open('Self_actualization/self_actualization1.txt', 'w') as f:
    for item in lista:
        f.write("%s\n" % item)

## Get pageviews for each topic in different languages

In [196]:
def get_pageviews_all_lang(name_file, csv_name):
    """
    Run the whole pipeline for one topic list.

    Translates the topics, saves the pageviews of every language, then
    writes the table of topics (English topics made readable: underscores
    replaced by spaces, apostrophes removed, first letter upper-cased and
    the rest lower-cased) to
    'Wiki-pageviews/{csv_name}/{csv_name}_topics.txt'.

    param: name_file: path of the text file listing the English topics
    param: csv_name: name used for the output folder and file prefix
    return: None
    """
    def _prettify(raw_topic):
        cleaned = raw_topic.replace('_', ' ').replace("'", "")
        if not cleaned:
            return ''
        return cleaned[0].upper() + cleaned[1:].lower()

    topics_by_language = get_gen_df_topics(name_file, csv_name)
    print(topics_by_language.shape)
    get_gen_df_pageviews(topics_by_language, csv_name)

    topics_by_language['Topics'] = topics_by_language['Topics'].apply(_prettify)
    topics_by_language.to_csv(f'Wiki-pageviews/{csv_name}/{csv_name}_topics.txt', index=False)

# Run the full pipeline (translations, per-language pageviews, topics file) for the love/belonging topic list.
get_pageviews_all_lang('Wiki-pageviews/love_belonging.txt', 'love_belonging')