In [None]:
!pip install pandas
import requests
import pandas as pd

def get_most_viewed_articles(month, year):
    """Return the titles of the most viewed pt.wikipedia articles for a month.

    Queries the Wikimedia ``pageviews/top`` REST endpoint for the
    ``pt.wikipedia`` project, ``all-access``, ``all-days``.

    Args:
        month: Month number as an int or numeric string; ``1``, ``'1'`` and
            ``'01'`` are all accepted and normalised to two digits.
        year: Year as an int or string, inserted into the URL as given.

    Returns:
        A list of article titles in the order the API returned them, or
        ``None`` when the request fails or the response does not have the
        expected ``items[0]['articles'][*]['article']`` structure (an error
        line is printed in both cases).

    Raises:
        ValueError: If ``month`` cannot be converted to an integer.
    """
    endpoint_url = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/top/{project}/{access}/{year}/{month}/all-days'
    url = endpoint_url.format(
        project='pt.wikipedia',
        access='all-access',
        year=year,
        # The endpoint is called with a two-digit month; pad here so that
        # callers do not each have to remember to do it.
        month=f'{int(month):02d}',
    )
    headers = {
        'User-Agent': 'Mozilla/5.0'
    }
    try:
        # A finite timeout keeps a stalled connection from blocking the
        # notebook indefinitely (timeout=None waits forever).
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions
        data = response.json()
        return [entry['article'] for entry in data['items'][0]['articles']]
    except requests.exceptions.RequestException as e:
        print('Error fetching data:', e)
        return None
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # The request succeeded but the payload is not shaped as expected.
        print('Error fetching data:', e)
        return None

def get_pageviews_dataframe(start_date, end_date):
    """Build a table of daily pageviews for the top pt.wikipedia articles.

    The article list is the "most viewed" ranking for the month of
    ``start_date`` only (obtained from ``get_most_viewed_articles``). Each of
    those articles is then queried on the Wikimedia ``pageviews/per-article``
    endpoint (``all-access``, ``user`` agent, ``daily`` granularity) for the
    whole ``start_date``..``end_date`` range.

    Args:
        start_date: First day of the range. Must provide ``strftime``,
            ``month`` and ``year`` (e.g. a ``pandas.Timestamp``).
        end_date: Last day of the range, same kind of object.

    Returns:
        A DataFrame indexed by article title with one column per day of
        ``pd.date_range(start_date, end_date)``, holding the ``views`` values
        returned by the API. Days the API did not report stay NaN, and an
        article whose request fails keeps an all-NaN row (an error line is
        printed). Returns ``None`` if the article ranking could not be
        fetched.
    """
    # Imported locally so the block does not depend on a notebook-level import.
    from urllib.parse import quote

    headers = {
        'User-Agent': 'Mozilla/5.0'
    }
    base_url = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/per-article/{project}/{access}/{agent}/{article}/{granularity}/{start}/{end}'
    project = 'pt.wikipedia'
    access = 'all-access'
    agent = 'user'
    granularity = 'daily'
    start = start_date.strftime('%Y%m%d')
    end = end_date.strftime('%Y%m%d')

    # Always hand over a zero-padded, two-digit month string.
    article_names = get_most_viewed_articles(f'{start_date.month:02d}', start_date.year)
    if article_names is None:
        return None

    dataframe = pd.DataFrame(index=article_names, columns=pd.date_range(start=start_date, end=end_date))

    for article in article_names:
        url = base_url.format(
            project=project,
            access=access,
            agent=agent,
            # Titles are a single path segment: percent-encode everything
            # (including "/" and "?") so the title cannot alter the URL shape.
            article=quote(article, safe=''),
            granularity=granularity,
            start=start,
            end=end,
        )
        try:
            # Finite timeout so one stalled request cannot hang the whole loop.
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            data = response.json()
            for item in data.get('items', []):
                # Timestamps arrive as YYYYMMDDHH strings.
                timestamp = pd.to_datetime(item['timestamp'], format='%Y%m%d%H')
                dataframe.loc[article, timestamp] = item['views']
        except requests.exceptions.RequestException as e:
            print(f"Error fetching data for {article}: {e}")
        except (KeyError, TypeError, ValueError, AttributeError) as e:
            # Response arrived but was not shaped as expected; skip this article.
            print(f"Error fetching data for {article}: {e}")

    return dataframe

# Date range to download: 2024-01-01 through 2024-02-29.
start_date = pd.to_datetime('2024-01-01')
end_date = pd.to_datetime('2024-02-29')
# Issues one HTTP request per ranked article, so this cell can take a while.
pageviews_df = get_pageviews_dataframe(start_date, end_date)
# An expression statement nested inside an `if` body is never rendered by
# Jupyter. Keeping the preview as the cell's final top-level expression makes
# it display, while still tolerating a failed download (None).
pageviews_df.head() if pageviews_df is not None else None




In [None]:
# Preview the first rows of the downloaded pageviews table
# (`pageviews_df` is created by the download cell above and must not be None).
pageviews_df.head()

Unnamed: 0,2024-01-01,2024-01-02,2024-01-03,2024-01-04,2024-01-05,2024-01-06,2024-01-07,2024-01-08,2024-01-09,2024-01-10,...,2024-02-20,2024-02-21,2024-02-22,2024-02-23,2024-02-24,2024-02-25,2024-02-26,2024-02-27,2024-02-28,2024-02-29
Wikipédia:Página_principal,69211.0,84016.0,85464,84209,82652,77421,74305,84837,82517,82290,...,82760,82270,81758,78220,71353,76640,83603,81969,81530,84860
Especial:Pesquisar,26545.0,35290.0,37046,36315,34743,31555,33332,37440,40061,39738,...,43174,42943,43026,38479,33231,38233,44879,44058,46430,44403
XXx,2075.0,6507.0,23504,19273,15121,21149,35667,34267,55477,62213,...,249,249,250,295,264,312,339,312,294,249
Fotos_dos_Mamonas_Assassinas_mortos,,,12913,15280,11216,46426,32110,26610,17609,14837,...,4998,4538,4762,5351,7864,5264,3340,2732,2665,3136
Voo_Força_Aérea_Uruguaia_571,195.0,284.0,241,6792,17331,28388,39496,23439,16217,14196,...,1256,1173,1088,1009,1678,2293,1235,849,756,614


In [None]:
import pandas as pd
def most_viewed_ptwiki_jan(pageviews=None):
    """Rank articles by their total pageviews from 2024-01-01 to 2024-01-31.

    Args:
        pageviews: Optional DataFrame with one row per article and one
            datetime column per day. When omitted, the notebook-level
            ``pageviews_df`` is used, which keeps existing no-argument calls
            working.

    Returns:
        A list of the frame's index labels (article titles), ordered from the
        highest January total to the lowest.
    """
    frame = pageviews_df if pageviews is None else pageviews
    # Coerce to numbers first: cells that are missing or non-numeric become
    # NaN and are skipped by the sum, and the totals get a numeric dtype.
    january = frame.loc[:, '2024-01-01':'2024-01-31'].apply(pd.to_numeric, errors='coerce')
    january_views = january.sum(axis=1)
    # Highest total first.
    return january_views.sort_values(ascending=False).index.tolist()

def most_viewed_ptwiki_jan_feb_per_day(pageviews=None):
    """Return the per-day pageviews for 2024-01-01 through 2024-02-29.

    Args:
        pageviews: Optional DataFrame with one row per article and one
            datetime column per day. When omitted, the notebook-level
            ``pageviews_df`` is used, which keeps existing no-argument calls
            working.

    Returns:
        A new DataFrame with the same rows as the input and only the columns
        that fall inside the January-February 2024 window, in chronological
        order.
    """
    frame = pageviews_df if pageviews is None else pageviews
    # Order the columns before slicing: a label slice needs ordered columns,
    # so sorting first also makes the function work on an unsorted frame.
    ordered = frame.sort_index(axis=1)
    return ordered.loc[:, '2024-01-01':'2024-02-29']

# Rank the articles by their January total and print the resulting order
top_viewed_list = most_viewed_ptwiki_jan()
print("Top Viewed Articles in January:", top_viewed_list, sep="\n")

# Build the per-day January/February table; its first rows are shown as the
# cell's output
top_viewed_dataframe = most_viewed_ptwiki_jan_feb_per_day()
print("\nTop Viewed Articles for January and February per Day:")
top_viewed_dataframe.head()

Top Viewed Articles in January:
['Wikipédia:Página_principal', 'Especial:Pesquisar', 'XXx', 'Fotos_dos_Mamonas_Assassinas_mortos', 'Voo_Força_Aérea_Uruguaia_571', 'Facebook', 'Zagallo', 'Porno_Graffitti', 'Renascer', 'ChatGPT', 'Ficheiro:Logotipo_da_GloboNews.png', 'Yasmin_Brunet', 'Cleópatra', 'Griselda_Blanco', 'AMBEV', 'Renascer_(2024)', 'YouTube', 'Copa_São_Paulo_de_Futebol_Júnior', 'Napoleão_Bonaparte', 'Sony_Channel', 'Rodriguinho_(cantor)', 'Brasil', 'Twitter', 'Ano-novo', 'João_Carreiro_&_Capataz', 'TV_Globo', 'Canal_Brasil', 'Jeffrey_Epstein', 'Domingos_Brazão', 'Cristiano_Ronaldo', 'Instagram', 'Mamonas_Assassinas', 'Louis_Joseph_César_Ducornet', 'Big_Brother_Brasil_24', 'Predefinição:Tabela_do_Campeonato_Brasileiro_da_Série_A_-_2024', 'Campeonato_Africano_das_Nações', 'Copa_São_Paulo_de_Futebol_Júnior_de_2024', 'Franz_Beckenbauer', 'Carlos_Alberto_Parreira', 'Dorival_Júnior', 'Thiago_Carpini', 'Marcinho_VP', 'Robert_Oppenheimer', 'Fernando_Parrado', 'Wanessa_Camargo', 'Vanes

Unnamed: 0,2024-01-01,2024-01-02,2024-01-03,2024-01-04,2024-01-05,2024-01-06,2024-01-07,2024-01-08,2024-01-09,2024-01-10,...,2024-02-20,2024-02-21,2024-02-22,2024-02-23,2024-02-24,2024-02-25,2024-02-26,2024-02-27,2024-02-28,2024-02-29
Wikipédia:Página_principal,69211.0,84016.0,85464,84209,82652,77421,74305,84837,82517,82290,...,82760,82270,81758,78220,71353,76640,83603,81969,81530,84860
Especial:Pesquisar,26545.0,35290.0,37046,36315,34743,31555,33332,37440,40061,39738,...,43174,42943,43026,38479,33231,38233,44879,44058,46430,44403
XXx,2075.0,6507.0,23504,19273,15121,21149,35667,34267,55477,62213,...,249,249,250,295,264,312,339,312,294,249
Fotos_dos_Mamonas_Assassinas_mortos,,,12913,15280,11216,46426,32110,26610,17609,14837,...,4998,4538,4762,5351,7864,5264,3340,2732,2665,3136
Voo_Força_Aérea_Uruguaia_571,195.0,284.0,241,6792,17331,28388,39496,23439,16217,14196,...,1256,1173,1088,1009,1678,2293,1235,849,756,614


In [None]:
!pip install bar_chart_race



In [None]:
# Sanity check: prints True when `top_viewed_dataframe` exists in the
# notebook's global namespace (i.e. the cell that builds it has been run).
print("top_viewed_dataframe" in globals())

True


In [None]:
!pip install bar_chart_race
import pandas as pd
import bar_chart_race as bcr

def dataframe_to_race_chart(df, filename):
    """Render ``df`` as a bar chart race video.

    bar_chart_race expects a wide frame with one row per time period and one
    column per bar. A frame that has its dates on the columns and something
    else on the index (such as articles x days) is therefore transposed
    first; a frame that is already indexed by dates is passed through
    unchanged.

    Args:
        df: DataFrame of values to animate. Every column must be convertible
            with ``pd.to_numeric``.
        filename: Output file name; ``".mp4"`` is appended when missing.

    Raises:
        ValueError: Propagated from ``pd.to_numeric`` when a cell cannot be
            parsed as a number.
    """
    # Check if filename has ".mp4" extension, if not, add it
    if not filename.endswith(".mp4"):
        filename += ".mp4"

    # Put time periods on the rows when the dates are on the columns.
    dates_on_columns = isinstance(df.columns, pd.DatetimeIndex) and not isinstance(df.index, pd.DatetimeIndex)
    oriented = df.T if dates_on_columns else df

    # Convert columns to numeric dtype
    df_numeric = oriented.apply(pd.to_numeric)

    # Create the bar chart race
    bcr.bar_chart_race(df_numeric, filename=filename)

# Example usage: the ".mp4" extension is appended to the given name, so the
# output file is "my_race_chart.mp4"
dataframe_to_race_chart(top_viewed_dataframe, "my_race_chart")



  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  ax.set_yticklabels(self.df_values.columns)
  ax.set_xticklabels([max_val] * len(ax.get_xticks()))


In [None]:
import os

# Add ffmpeg directory to PATH
# Resolved to an absolute path so the PATH entry keeps pointing at the same
# directory even if the working directory changes later.
ffmpeg_path = os.path.abspath("./ffmpeg-6.1-amd64-static")  # Path to the ffmpeg directory
# Append only when missing so re-running this cell does not keep growing
# PATH, and tolerate an unset PATH instead of raising KeyError.
current_path = os.environ.get('PATH', '')
if ffmpeg_path not in current_path.split(os.pathsep):
    os.environ['PATH'] = current_path + os.pathsep + ffmpeg_path if current_path else ffmpeg_path

# Now ffmpeg should be accessible from your Python script

In [None]:
# Download a static FFmpeg build and add it to PATH.
# NOTE(review): the work is done by the 'load-ffmpeg.ipynb' notebook executed
# below; its contents are not visible here, so confirm what it changes.
%run 'load-ffmpeg.ipynb'
print('Done!')

/usr/bin/ffmpeg
Done!


In [None]:
import pandas as pd
import bar_chart_race as bcr

def dataframe_to_race_chart(df):
    """Render a pageviews table as the bar chart race video ``infograph.mp4``.

    Cleaning steps, in order:

    1. Every column is converted with ``pd.to_numeric``; cells that cannot be
       parsed become NaN.
    2. Rows containing any NaN are dropped.
    3. bar_chart_race expects one row per time period and one column per
       bar, so a frame with its dates on the columns (and not on the index)
       is transposed. A frame already indexed by dates is left as is.

    Args:
        df: DataFrame of values to animate. It is not modified.

    Returns:
        None. A message is printed and nothing is rendered when ``df`` is
        empty or when no complete rows remain after cleaning.
    """
    if df.empty:
        print("DataFrame is empty. Cannot create race chart.")
        return

    # Convert columns to numeric dtype and keep only the complete rows.
    # (No further filling is needed: after dropna() no NaN is left.)
    df_numeric = df.apply(pd.to_numeric, errors='coerce').dropna()

    if df_numeric.empty:
        print("No complete rows left after cleaning. Cannot create race chart.")
        return

    # Put time periods on the rows when the dates are on the columns.
    if isinstance(df_numeric.columns, pd.DatetimeIndex) and not isinstance(df_numeric.index, pd.DatetimeIndex):
        df_numeric = df_numeric.T

    filename = 'infograph.mp4'

    bcr.bar_chart_race(
        df=df_numeric,
        filename=filename,
        orientation='h',
        sort='desc',
        title='Top Viewed Articles in Portuguese Wikipedia',
        n_bars=10,
        fixed_order=False,
        steps_per_period=10,
        interpolate_period=False,
        period_length=500,
        figsize=(8, 5),
        cmap='dark12',
        title_size=14,
        bar_label_size=10,
        tick_label_size=8,
        shared_fontdict=None,
        scale='linear',
        writer=None,
        fig=None,
        bar_kwargs={'alpha': 0.7},
        filter_column_colors=False
    )

    print(f"Race chart generated: {filename}")

# Generate the race chart video from the January-February per-day table.
dataframe_to_race_chart(top_viewed_dataframe)


  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  fig.canvas.print_figure(io.BytesIO())
  ax.set_yticklabels(self.df_values.columns)
  ax.set_xticklabels([max_val] * len(ax.get_xticks()))


Race chart generated: infograph.mp4
