In [1]:
import numpy as np
import pandas as pd
import requests
import urllib.request
from bs4 import BeautifulSoup
import plotly.express as px
import re
import tabula
import fitz
import dash
from dash import dcc, html
from dash.dependencies import Input, Output



In [2]:
headers = {
	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def clean_text(value):
    value = re.sub('<.*?>', '', value)
    return value.strip()

def extract_row(tds, col_names):
    return {col: clean_text(str(tds[i])) for i, col in enumerate(col_names)}


def get_table(url):
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        table = soup.find('table', {'id': 'statTableHTML'})
        if table is None:
            raise ValueError("No se encontró la tabla en la página.")
        return table
    except requests.exceptions.RequestException as e:
        print(f"Error al realizar la solicitud: {e}")
        return None
    
def process_table(url, col_names):
    table = get_table(url)
    if table is None:
        return pd.DataFrame()

    trs = table.find_all('tr')
    results = []
    for tr in trs[1:]:  # Skip the header row
        tds = tr.find_all('td')
        if len(tds) == len(col_names):
            row_result = extract_row(tds, col_names)
            results.append(row_result)

    return pd.DataFrame(results)


In [3]:
url1 = 'https://www.statista.com/statistics/433871/daily-social-media-usage-worldwide'
df1 = process_table(url1, ['Año', 'Minutos'])
if not df1.empty:
    df1['Minutos'] = pd.to_numeric(df1['Minutos'], errors='coerce')
    df1 = df1.sort_values(by="Año")

    fig1 = px.bar(df1, x="Año", y="Minutos", title='Crecimiento anual de minutos empleados en redes sociales')
    fig1.update_layout(
        yaxis=dict(range=[50, df1["Minutos"].max() + 10], title="Minutos"),
        xaxis_title="Año",
        title=dict(x=0.5)
    )
    fig1.show()

In [4]:
url2 = 'https://www.statista.com/statistics/1201880/most-visited-websites-worldwide/'
df2 = process_table(url2, ['Página', 'Visitas'])
if not df2.empty:
    df2['Visitas'] = pd.to_numeric(df2['Visitas'], errors='coerce')
    df2 = df2.sort_values(by='Visitas', ascending=False).reset_index(drop=True)

    fig2 = px.pie(df2, values='Visitas', names='Página', title='Visitas (en miles de millones) en 2023')
    fig2.update_layout(title=dict(x=0.5))
    fig2.show()

In [5]:
url3 = 'https://www.statista.com/statistics/1294062/social-media-year-on-year-growth/'

df3 = process_table(url3, ['Página', 'Porcentaje'])
df3['Porcentaje'] = pd.to_numeric(df3['Porcentaje'].str.replace('%', ''), errors='coerce')

fig = px.bar(df3, x='Página', y='Porcentaje', title='Crecimiento de usuarios en 2024')
fig.update_layout(title=dict(x=0.5))
fig.show()

In [None]:
pdf_path = 'SocialReport.pdf'
doc = fitz.open(pdf_path)

df = tabula.read_pdf(pdf_path, pages=8, stream=True, pandas_options={"header": None})[0]

df_1 = df.iloc[:, :9]
df_2 = df.iloc[:, 9:18]

df_1.columns = df_1.iloc[0]
df_1.drop(0, inplace=True)
df_1.rename(columns={col: "País" for col in df_1.columns if pd.isna(col)}, inplace=True)

df_2.columns = df_1.columns
df_2.drop(0, inplace=True)

df_vertical = pd.concat([df_1, df_2], ignore_index=True)

df_long = pd.melt(df_vertical, id_vars=["País"], var_name="Año", value_name="Horas")

df_long["Año"] = pd.to_datetime(df_long["Año"], format="%Y").dt.year
df_long["Horas"] = pd.to_numeric(df_long["Horas"].str.replace(":", "."), errors="coerce")

app = dash.Dash(__name__)
country_options = [{'label': country, 'value': country} for country in df_long["País"].dropna().unique()]

app.layout = html.Div([
    html.H1("Reporte de uso de redes sociales", style={'textAlign': 'center', 'color': 'white'}),
    
    dcc.Dropdown(
        id='country-dropdown',
        options=country_options,
        value='USA',
        style={'width': '50%'}
    ),
    
    dcc.Graph(id='line-chart')
])

@app.callback(
    Output('line-chart', 'figure'),
     [Input('country-dropdown', 'value')]
)
def update_line_chart(country_name):
    # Filtrar los datos para el país seleccionado
    country_data = df_long[df_long["País"] == country_name]
    
    fig = px.line(
        country_data,
        x="Año",
        y="Horas",
        color="País",
        title=f"Tiempo promedio en redes sociales por país: {country_name}",
        labels={"Horas": "Horas por día", "Año": "Año"},
        hover_data={"País": True, "Horas": True, "Año": True},
    )
    return fig

if __name__ == '__main__':
    app.run_server(debug=True)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [7]:
# svg_image = doc[28].get_svg_image()
# output_path = "img.svg"

# with open(output_path, "w", encoding="utf-8") as f:
#     f.write(svg_image)
    
# cairosvg.svg2png(url=output_path, write_to="img.png", dpi=300, scale=2)

# image = Image.open("img.png")

# extracted_text = pytesseract.image_to_string(image, config='--psm 3')
# extracted_text = extracted_text.split("\n")

# extracted_text = [line for line in extracted_text if line]
# extracted_text


In [None]:
data = {
    "País": [
        "Argentina", "Australia", "Austria", "Belgium", "Brazil", "Canada", "China", "Colombia", "Denmark",
        "Egypt", "France", "Germany", "Ghana", "Hong Kong", "India", "Indonesia", "Ireland", "Italy",
        "Japan", "Kenya", "Malaysia", "Mexico", "Morocco", "Netherlands", "New Zealand", "Nigeria",
        "Philippines", "Poland", "Portugal", "Romania", "Russia", "Saudi Arabia", "Singapore", "South Africa",
        "South Korea", "Spain", "Sweden", "Switzerland", "Taiwan", "Thailand", "Turkey", "UAE", "UK", "USA", "Vietnam"
    ],
    "Using Social to Stay Up-to-Date": [49, 34, 38, 38, 55, 39, 35, 56, 40, 49, 31, 32, 40, 40, 38, 48, 45, 42,
                                        30, 40, 61, 49, 30, 38, 40, 38, 65, 53, 53, 45, 49, 51, 26, 39, 49, 33,
                                        33, 59, 62, 35, 40, 33, 33, 44],
    "Following Brands They Like": [45, 36, 30, 30, 51, 35, 37, 55, 26, 40, 25, 26, 46, 46, 38, 36, 37, 36,
                                    24, 51, 51, 53, 19, 30, 33, 26, 58, 48, 48, 47, 46, 57, 33, 49, 49, 26,
                                    26, 50, 52, 36, 40, 33, 34, 37],
    "Discovering Brands Through Comments": [38, 22, 18, 17, 36, 22, 24, 38, 19, 32, 13, 14, 22, 23, 23, 35,
                                             24, 18, 13, 22, 40, 36, 17, 16, 26, 26, 39, 29, 29, 28, 21, 23,
                                             18, 24, 26, 16, 19, 36, 28, 23, 25, 20, 19, 32],
    "Researching Products/Brands Using Social Networks": [61, 31, 32, 32, 64, 32, 27, 53, 32, 33, 26, 24, 74,
                                                           40, 45, 39, 30, 29, 30, 76, 64, 61, 25, 35, 47, 74,
                                                           67, 47, 47, 44, 43, 57, 22, 39, 39, 28, 33, 58, 58,
                                                           51, 57, 27, 27, 68],
    "Motivating Purchase Through 'Likes'": [25, 14, 11, 9, 20, 15, 27, 28, 9, 32, 11, 10, 23, 23, 22, 15, 15,
                                            14, 9, 32, 32, 24, 8, 15, 36, 36, 28, 22, 22, 20, 13, 28, 13, 19,
                                            15, 9, 33, 33, 23, 23, 14, 14, 25],
    "Motivating Purchase Through 'Buy' Button": [10, 5, 4, 4, 10, 6, 17, 12, 5, 19, 4, 4, 10, 10, 16, 7, 7,
                                                 5, 4, 0, 12, 11, 3, 3, 7, 15, 15, 8, 16, 13, 6, 15, 8, 6, 6,
                                                 5, 18, 14, 12, 12, 6, 6, 14]
}

df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in data.items()]))

df_long = df.melt(id_vars=["País"], var_name="Metrica", value_name="Porcentaje")

app = dash.Dash(__name__)

app.layout = html.Div([
    html.H1("Redes Sociales en Diferentes Países", style={'textAlign': 'center', 'color': 'white'}),
    
    dcc.Dropdown(
        id='country-dropdown',
        options=[{'label': country, 'value': country} for country in df["País"]],
        value='USA',
        style={'width': '50%'}
    ),
    
    dcc.Graph(id='pie-chart')
])

@app.callback(
    Output('pie-chart', 'figure'),
    Input('country-dropdown', 'value')
)
def update_pie_chart(country_name):
    country_data = df_long[df_long["País"] == country_name]
    fig = px.pie(country_data, names="Metrica", values="Porcentaje", title=f"Social Media Usage in {country_name}")
    return fig

if __name__ == '__main__':
    app.run_server(debug=True)