In [1]:
import numpy as np
import pandas as pd
import requests
import urllib.request
from bs4 import BeautifulSoup
import plotly.express as px
import re
import tabula
import fitz
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import pytesseract
from PIL import Image
import cairosvg



In [2]:
headers = {
	'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

def clean_text(value):
	value = re.sub('<.*?>', '', value)
	return value.strip()

def extract_row(tds, col_names):
	return {col: clean_text(str(tds[i])) for i, col in enumerate(col_names)}


def get_table(url):
	try:
		response = requests.get(url, headers=headers)
		response.raise_for_status()
		soup = BeautifulSoup(response.content, 'html.parser')
		table = soup.find('table', {'id': 'statTableHTML'})
		if table is None:
			raise ValueError("No se encontró la tabla en la página.")
		return table
	except requests.exceptions.RequestException as e:
		print(f"Error al realizar la solicitud: {e}")
		return None
	
def process_table(url, col_names):
	table = get_table(url)
	if table is None:
		return pd.DataFrame()

	trs = table.find_all('tr')
	results = []
	for tr in trs[1:]:  # Skip the header row
		tds = tr.find_all('td')
		if len(tds) == len(col_names):
			row_result = extract_row(tds, col_names)
			results.append(row_result)

	return pd.DataFrame(results)


In [3]:
url1 = 'https://www.statista.com/statistics/433871/daily-social-media-usage-worldwide'
df1 = process_table(url1, ['Año', 'Minutos'])
if not df1.empty:
	df1['Minutos'] = pd.to_numeric(df1['Minutos'], errors='coerce')
	df1 = df1.sort_values(by="Año")

	fig1 = px.bar(df1, x="Año", y="Minutos", title='Crecimiento anual de minutos empleados en redes sociales')
	fig1.update_layout(
		yaxis=dict(range=[50, df1["Minutos"].max() + 10], title="Minutos"),
		xaxis_title="Año",
		title=dict(x=0.5)
	)
	fig1.show()

In [4]:
url2 = 'https://www.statista.com/statistics/1201880/most-visited-websites-worldwide/'
df2 = process_table(url2, ['Página', 'Visitas'])
if not df2.empty:
	df2['Visitas'] = pd.to_numeric(df2['Visitas'], errors='coerce')
	df2 = df2.sort_values(by='Visitas', ascending=False).reset_index(drop=True)

	fig2 = px.pie(df2, values='Visitas', names='Página', title='Visitas (en miles de millones) en 2023')
	fig2.update_layout(title=dict(x=0.5))
	fig2.show()

In [5]:
url3 = 'https://www.statista.com/statistics/1294062/social-media-year-on-year-growth/'

df3 = process_table(url3, ['Página', 'Porcentaje'])
df3['Porcentaje'] = pd.to_numeric(df3['Porcentaje'].str.replace('%', ''), errors='coerce')

fig = px.bar(df3, x='Página', y='Porcentaje', title='Crecimiento de usuarios en 2024')
fig.update_layout(title=dict(x=0.5))
fig.show()

In [6]:
## Tiempo usado en redes sociales en cada país -> Página 8 del informe
pdf_path = 'SocialReport.pdf'
doc = fitz.open(pdf_path)

df_table = tabula.read_pdf(pdf_path, pages=8, stream=True, pandas_options={"header": None})[0]

df_1 = df_table.iloc[:, :9]
df_2 = df_table.iloc[:, 9:18]

df_1.columns = df_1.iloc[0]
df_1.drop(0, inplace=True)
df_1.rename(columns={col: "País" for col in df_1.columns if pd.isna(col)}, inplace=True)

df_2.columns = df_1.columns
df_2.drop(0, inplace=True)

df_vertical = pd.concat([df_1, df_2], ignore_index=True)

df_long_social = pd.melt(df_vertical, id_vars=["País"], var_name="Año", value_name="Horas")

df_long_social["Año"] = pd.to_datetime(df_long_social["Año"], format="%Y").dt.year
df_long_social["Horas"] = pd.to_numeric(df_long_social["Horas"].str.replace(":", "."), errors="coerce")

country_options = [{'label': country, 'value': country} for country in df_long_social["País"].dropna().unique()]

Failed to import jpype dependencies. Fallback to subprocess.
No module named 'jpype'


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [7]:
## Motivos de uso de redes sociales -> Página 30 del informe
data = {
	"País": [
		"Argentina", "Australia", "Austria", "Belgium", "Brazil", "Canada", "China", "Colombia", "Denmark",
		"Egypt", "France", "Germany", "Ghana", "Hong Kong", "India", "Indonesia", "Ireland", "Italy",
		"Japan", "Kenya", "Malaysia", "Mexico", "Morocco", "Netherlands", "New Zealand", "Nigeria",
		"Philippines", "Poland", "Portugal", "Romania", "Russia", "Saudi Arabia", "Singapore", "South Africa",
		"South Korea", "Spain", "Sweden", "Switzerland", "Taiwan", "Thailand", "Turkey", "UAE", "UK", "USA", "Vietnam"
	],
	"Using Social to Stay Up-to-Date": [49, 34, 38, 38, 55, 39, 35, 56, 40, 49, 31, 32, 40, 40, 38, 48, 45, 42,
										30, 40, 61, 49, 30, 38, 40, 38, 65, 53, 53, 45, 49, 51, 26, 39, 49, 33,
										33, 59, 62, 35, 40, 33, 33, 44],
	"Following Brands They Like": [45, 36, 30, 30, 51, 35, 37, 55, 26, 40, 25, 26, 46, 46, 38, 36, 37, 36,
									24, 51, 51, 53, 19, 30, 33, 26, 58, 48, 48, 47, 46, 57, 33, 49, 49, 26,
									26, 50, 52, 36, 40, 33, 34, 37],
	"Discovering Brands Through Comments": [38, 22, 18, 17, 36, 22, 24, 38, 19, 32, 13, 14, 22, 23, 23, 35,
											 24, 18, 13, 22, 40, 36, 17, 16, 26, 26, 39, 29, 29, 28, 21, 23,
											 18, 24, 26, 16, 19, 36, 28, 23, 25, 20, 19, 32],
	"Researching Products/Brands Using Social Networks": [61, 31, 32, 32, 64, 32, 27, 53, 32, 33, 26, 24, 74,
														   40, 45, 39, 30, 29, 30, 76, 64, 61, 25, 35, 47, 74,
														   67, 47, 47, 44, 43, 57, 22, 39, 39, 28, 33, 58, 58,
														   51, 57, 27, 27, 68],
	"Motivating Purchase Through 'Likes'": [25, 14, 11, 9, 20, 15, 27, 28, 9, 32, 11, 10, 23, 23, 22, 15, 15,
											14, 9, 32, 32, 24, 8, 15, 36, 36, 28, 22, 22, 20, 13, 28, 13, 19,
											15, 9, 33, 33, 23, 23, 14, 14, 25],
	"Motivating Purchase Through 'Buy' Button": [10, 5, 4, 4, 10, 6, 17, 12, 5, 19, 4, 4, 10, 10, 16, 7, 7,
												 5, 4, 0, 12, 11, 3, 3, 7, 15, 15, 8, 16, 13, 6, 15, 8, 6, 6,
												 5, 18, 14, 12, 12, 6, 6, 14]
}

df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in data.items()]))

df_long_use = df.melt(id_vars=["País"], var_name="Metrica", value_name="Porcentaje")

In [8]:
## Porcentaje de uso de redes sociales más famosas en cada país -> Página 18 del informe
countries = [
    "Australia", "Hong Kong", "India", "Indonesia", "Japan", "Malaysia", "New Zealand",
    "Philippines", "Singapore", "South Korea", "Taiwan", "Thailand", "Vietnam", "Austria",
    "Belgium", "Denmark", "France", "Germany", "Ireland", "Italy", "Netherlands", "Poland",
    "Portugal", "Romania", "Russia", "Spain", "Sweden", "Switzerland", "Turkey", "UK",
    "Argentina", "Brazil", "Colombia", "Mexico", "Egypt", "Ghana", "Kenya", "Morocco",
    "Nigeria", "Saudi Arabia", "South Africa", "UAE", "Canada", "USA"
]

data = {
    "País": countries,
    "Facebook": [80, 84, 82, 83, 34, 91, 84, 95, 80, 61, 88, 92, 93, 72, 80, 81, 74, 63, 79, 81, 73, 86, 89, 90, 39, 81, 81, 70, 82, 77, 89, 90, 92, 94, 91, 73, 79, 78, 80, 64, 83, 80, 81, 75],
    "FB Messenger": [65, 51, 57, 48, 11, 65, 72, 86, 51, 26, 57, 71, 75, 50, 65, 66, 54, 40, 64, 56, 49, 67, 74, 73, 14, 47, 67, 49, 58, 61, 61, 67, 71, 77, 70, 50, 46, 41, 58, 41, 61, 58, 62, 55],
    "Instagram": [49, 58, 68, 78, 33, 72, 48, 59, 61, 50, 54, 61, 51, 46, 46, 50, 39, 37, 54, 59, 46, 51, 66, 55, 51, 60, 65, 51, 85,49, 71, 75, 73, 66, 66, 48, 55, 44, 61, 62, 58, 61, 49, 50],
    "Snapchat": [28, 17, 32, 24, 2, 22, 32, 26, 20, 7, 8, 18, 17, 19, 26, 38, 32, 16, 34, 14, 23, 26, 21, 24, 7, 17, 37, 24, 32, 27, 20, 24, 28, 30, 27, 28, 17, 25, 24, 47, 25, 32, 27, 30],
    "Twitter": [27, 27, 52, 51, 49, 45, 25, 51, 36, 29, 27, 49, 37, 19, 26, 23, 29, 20, 42, 33, 26, 32, 35, 35, 19, 51, 28, 23, 60, 45, 47, 47, 54, 58, 50, 29, 40, 17, 35, 58, 45, 47, 37, 39],
    "WhatsApp": [29, 81, 75, 86, 2, 92, 29, 28, 85, 7, 19, 25, 23, 81, 59, 19, 35, 78, 68, 83, 85, 40, 67, 71, 67, 86, 29, 82, 85, 60, 90, 88, 90, 89, 78, 83, 87, 79, 86, 73, 91, 76, 27, 28]
}

df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in data.items()]))

df_long_sm = df.melt(id_vars=["País"], var_name="Red", value_name="Porcentaje")

In [9]:
app = dash.Dash(__name__)

app.layout = html.Div([
    html.H1("Análisis de Uso de Redes Sociales", style={'textAlign': 'center', 'color': 'white'}),

    dcc.Tabs([
        dcc.Tab(label='Tiempo usado en redes sociales', children=[
            html.H3("Seleccione un país:", style={'color': 'white'}),
            dcc.Dropdown(
                id='dropdown-line',
                options=country_options,
                value='USA',
                style={'width': '50%'}
            ),
            dcc.Graph(id='line-chart')
        ]),

        dcc.Tab(label='Porcentaje de uso de cada red', children=[
            html.H3("Seleccione un país:", style={'color': 'white'}),
            dcc.Dropdown(
                id='dropdown-pie-sm',
                options=country_options,
                value='USA',
                style={'width': '50%'}
            ),
            dcc.Graph(id='pie-chart-sm')
        ]),

        dcc.Tab(label='Motivo de uso de las redes', children=[
            html.H3("Seleccione un país:", style={'color': 'white'}),
            dcc.Dropdown(
                id='dropdown-pie',
                options=country_options,
                value='USA',
                style={'width': '50%'}
            ),
            dcc.Graph(id='pie-chart')
        ])
    ])
])

@app.callback(
    Output('line-chart', 'figure'),
    Input('dropdown-line', 'value')
)
def update_line_chart(country_name):
    country_data = df_long_social[df_long_social["País"] == country_name]

    fig = px.line(
        country_data,
        x="Año",
        y="Horas",
        color="País",
        title=f"Tiempo promedio en redes sociales por país: {country_name}",
        labels={"Horas": "Horas por día", "Año": "Año"},
    )

    # Mejora del hover
    fig.update_traces(
        hovertemplate="<b>Año:</b> %{x}<br>" +
                      "<b>Horas por día:</b> %{y:.2f}<br>" +
                      "<b>País:</b> %{customdata[0]}",
        customdata=country_data[["País"]]
    )

    return fig

@app.callback(
    Output('pie-chart-sm', 'figure'),
    Input('dropdown-pie-sm', 'value')
)
def update_pie_chart_sm(country_name):
    country_data = df_long_sm[df_long_sm["País"] == country_name]

    fig = px.pie(
        country_data,
        values="Porcentaje",
        names="Red",
        title=f"Distribución de uso de redes sociales en {country_name}",
        labels={"Porcentaje": "Porcentaje", "Red": "Red Social"},
    )

    fig.update_traces(
        hovertemplate="<b>Red Social:</b> %{label}<br>" +
                      "<b>Porcentaje de uso:</b> %{value:.1f}%<br>" +
                      "<b>País:</b> " + country_name
    )

    return fig

@app.callback(
    Output('pie-chart', 'figure'),
    Input('dropdown-pie', 'value')
)
def update_pie_chart(country_name):
    country_data = df_long_use[df_long_use["País"] == country_name]

    fig = px.pie(
        country_data,
        values="Porcentaje",
        names="Metrica",
        title=f"Motivos de uso de redes sociales en {country_name}",
        labels={"Porcentaje": "Porcentaje", "Metrica": "Motivo"},
    )

    fig.update_traces(
        hovertemplate="<b>Motivo:</b> %{label}<br>" +
                      "<b>Porcentaje:</b> %{value:.1f}%<br>" +
                      "<b>País:</b> " + country_name
    )

    return fig

app.run_server(mode='inline', debug=False)

In [10]:
# https://github.com/Kozea/CairoSVG/issues/385
# image_path = "img.png"
# image = Image.open(image_path)

# paises = [
# 	"Argentina", "Australia", "Austria", "Belgium", "Brazil", "Canada", "China", "Colombia", "Denmark",
# 	"Egypt", "France", "Germany", "Ghana", "Hong Kong", "India", "Indonesia", "Ireland", "Italy",
# 	"Japan", "Kenya", "Malaysia", "Mexico", "Morocco", "Netherlands", "New Zealand", "Nigeria",
# 	"Philippines", "Poland", "Portugal", "Romania", "Russia", "Saudi Arabia", "Singapore", "South Africa",
# 	"South Korea", "Spain", "Sweden", "Switzerland", "Taiwan", "Thailand", "Turkey", "UAE", "UK", "USA", "Vietnam"
# ]

# cabeceras = [
#     "Using Social to Stay Up-to-Date",
#     "Following"
# ]

# extracted_text = pytesseract.image_to_string(image, config='--psm 3')
# extracted_text