In [1]:
import html
import re
import urllib.request

import numpy as np
import pandas as pd
import plotly.express as px
import pymupdf
import requests
import tabula
from bs4 import BeautifulSoup



In [2]:
# HTTP headers sent with every scraping request: a desktop Chrome
# User-Agent string is supplied instead of the library default.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

def clean_text(value: str) -> str:
    """Return the visible text of an HTML fragment.

    Tags are removed, HTML entities (``&amp;``, ``&nbsp;``, ...) are decoded
    and surrounding whitespace is stripped.

    Args:
        value: HTML markup as a string, e.g. ``str(td)`` for a table cell.

    Returns:
        The plain text contained in ``value``.
    """
    # `[^>]*` (unlike `.*?`) also matches newlines, so a tag whose attributes
    # are spread over several lines is removed as well.
    value = re.sub(r'<[^>]*>', '', value)
    # Decode entities only after the tags are gone, so an escaped "&lt;" in
    # the cell text is never mistaken for markup.
    value = html.unescape(value)
    return value.strip()

def extract_row(tds, col_names):
    """Map each column name to the cleaned text of the cell at the same position.

    Args:
        tds: Indexable sequence of table cells; each is converted with ``str``
            before being passed to ``clean_text``.
        col_names: Column names, in the same order as the cells.

    Returns:
        A dict with one ``column name -> cleaned cell text`` entry per name.
    """
    row = {}
    for position, column in enumerate(col_names):
        # Indexing (not zip) on purpose: a row with too few cells raises.
        row[column] = clean_text(str(tds[position]))
    return row


def get_table(url, timeout=30):
    """Download ``url`` and return its ``statTableHTML`` table element.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server, forwarded to
            ``requests.get``. Without it a stalled connection would block
            the notebook indefinitely.

    Returns:
        The ``<table id="statTableHTML">`` element found by BeautifulSoup, or
        ``None`` when the request fails or the page contains no such table.
        A message is printed in both failure cases.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error al realizar la solicitud: {e}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    table = soup.find('table', {'id': 'statTableHTML'})
    if table is None:
        # Report a missing table the same way as a failed request (message +
        # None) so callers that check for None can fall back gracefully
        # instead of the cell aborting with an uncaught exception.
        print("No se encontró la tabla en la página.")
        return None
    return table
    
def process_table(url, col_names):
    """Scrape the statistics table at ``url`` into a DataFrame.

    Args:
        url: Page to download; passed to ``get_table``.
        col_names: Names for the table columns, in order. Only rows whose
            number of ``<td>`` cells equals ``len(col_names)`` are kept.

    Returns:
        A DataFrame with one row per matching table row and one column per
        entry of ``col_names``, holding the cleaned cell text. When the table
        cannot be obtained or no row matches, the frame is empty but still
        carries the ``col_names`` columns, so column access downstream does
        not raise ``KeyError``.
    """
    columns = list(col_names)

    table = get_table(url)
    if table is None:
        return pd.DataFrame(columns=columns)

    results = []
    for tr in table.find_all('tr')[1:]:  # Skip the header row
        tds = tr.find_all('td')
        if len(tds) == len(columns):
            results.append(extract_row(tds, columns))

    if not results:
        return pd.DataFrame(columns=columns)
    return pd.DataFrame(results)


In [3]:
# Daily social-media usage: scrape the (year, minutes) table and chart it.
url1 = 'https://www.statista.com/statistics/433871/daily-social-media-usage-worldwide'
df1 = process_table(url1, ['Año', 'Minutos'])

# Plot only when the scrape produced rows.
if not df1.empty:
    # Values that cannot be parsed as numbers become NaN.
    minutes = pd.to_numeric(df1['Minutos'], errors='coerce')
    df1 = df1.assign(Minutos=minutes).sort_values(by="Año")

    # The y axis starts at 50 and ends 10 above the largest value.
    y_axis = {"range": [50, df1["Minutos"].max() + 10], "title": "Minutos"}

    fig1 = px.bar(
        df1,
        x="Año",
        y="Minutos",
        title='Crecimiento anual de minutos empleados en redes sociales',
    )
    fig1.update_layout(yaxis=y_axis, xaxis_title="Año", title={"x": 0.5})
    fig1.show()

In [4]:
# Most-visited websites: scrape the (site, visits) table and draw a pie chart.
url2 = 'https://www.statista.com/statistics/1201880/most-visited-websites-worldwide/'
df2 = process_table(url2, ['Página', 'Visitas'])

# Plot only when the scrape produced rows.
if not df2.empty:
    # Values that cannot be parsed as numbers become NaN.
    visits = pd.to_numeric(df2['Visitas'], errors='coerce')
    df2 = (
        df2.assign(Visitas=visits)
        .sort_values(by='Visitas', ascending=False)
        .reset_index(drop=True)
    )

    fig2 = px.pie(
        df2,
        values='Visitas',
        names='Página',
        title='Visitas (en miles de millones) en 2023',
    )
    fig2.update_layout(title={"x": 0.5})
    fig2.show()

In [5]:
# Year-on-year user growth: scrape the (site, percentage) table and chart it.
url3 = 'https://www.statista.com/statistics/1294062/social-media-year-on-year-growth/'

df3 = process_table(url3, ['Página', 'Porcentaje'])
# Same guard as the previous two cells: when the scrape yields nothing there
# is no 'Porcentaje' column to convert and nothing to plot.
if not df3.empty:
    # Strip the literal percent sign before converting; anything that still
    # is not a number becomes NaN.
    df3['Porcentaje'] = pd.to_numeric(
        df3['Porcentaje'].str.replace('%', '', regex=False), errors='coerce'
    )

    fig = px.bar(df3, x='Página', y='Porcentaje', title='Crecimiento de usuarios en 2024')
    fig.update_layout(title=dict(x=0.5))
    fig.show()

: 

In [None]:
# Extract the table on page 8 of the report and stack its two side-by-side
# halves (columns 0-8 and 9-17) into one long table.
pdf_path = 'SocialReport.pdf'
# NOTE(review): `doc` is not used in this cell — confirm a later cell needs
# it; otherwise remove it so the PDF is not left open.
doc = pymupdf.open(pdf_path)

tables = tabula.read_pdf(pdf_path, pages=8, stream=True, pandas_options={"header": None})
if not tables:
    # Fail with a clear message instead of an IndexError on `[0]`.
    raise ValueError("No se encontró ninguna tabla en la página 8 del PDF.")
df = tables[0]

# Row 0 of the left half holds the column titles; cells without a title
# (NaN) are labelled "País".
header = ["País" if pd.isna(col) else col for col in df.iloc[0, :9]]

# `drop` returns a new frame, so assigning the columns below never writes
# through to a slice of `df` (no SettingWithCopyWarning).
df_1 = df.iloc[:, :9].drop(index=0)
df_1.columns = header

# The right half gets the same header; its row 0 is dropped as well.
df_2 = df.iloc[:, 9:18].drop(index=0)
df_2.columns = header

df_vertical = pd.concat([df_1, df_2], ignore_index=True)
