In [1]:
import os
import re
import socket
from html import unescape

import dash
import fitz
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import requests
import tabula
from bs4 import BeautifulSoup
from dash import dcc, html
from dash.dash_table import DataTable
from dash.dependencies import Input, Output

In [2]:
# Browser-like User-Agent header; get_table sends it with its page requests.
headers = {
	'User-Agent': (
		'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
		'AppleWebKit/537.36 (KHTML, like Gecko) '
		'Chrome/91.0.4472.124 Safari/537.36'
	),
}

def clean_text(value):
	"""Return the plain text of an HTML fragment.

	Tags are removed, HTML entities such as ``&amp;`` are decoded, and
	surrounding whitespace is trimmed.

	Args:
		value: HTML source as a string (``extract_row`` passes ``str(td)``).

	Returns:
		The cleaned text.
	"""
	# DOTALL lets the pattern also remove tags whose attributes wrap onto several lines.
	without_tags = re.sub(r'<.*?>', '', value, flags=re.DOTALL)
	# Decode entities only after the tags are gone, so escaped text such as
	# "&lt;b&gt;" ends up as literal text instead of being stripped as a tag.
	return unescape(without_tags).strip()

def extract_row(tds, col_names):
	"""Build one record mapping each column name to the cleaned text of its cell.

	``col_names[i]`` is paired with ``tds[i]``; every cell is converted with
	``str`` and passed through ``clean_text``.

	Args:
		tds: Indexable sequence of table cells.
		col_names: Column names, in cell order.

	Returns:
		A dict with one entry per name in ``col_names``.
	"""
	row = {}
	for position, column in enumerate(col_names):
		row[column] = clean_text(str(tds[position]))
	return row


def get_table(url, timeout=30):
	"""Download ``url`` and return its ``<table id="statTableHTML">`` element.

	Args:
		url: Address of the page to fetch.
		timeout: Seconds to wait for the server before giving up. Without a
			timeout ``requests`` can block indefinitely on an unresponsive host.

	Returns:
		The matching BeautifulSoup table tag, or ``None`` when the HTTP request
		fails (the error is printed).

	Raises:
		ValueError: If the page loads but contains no table with that id.
	"""
	try:
		response = requests.get(url, headers=headers, timeout=timeout)
		response.raise_for_status()
	except requests.exceptions.RequestException as e:
		# Covers connection errors, timeouts and non-2xx status codes alike.
		print(f"Error al realizar la solicitud: {e}")
		return None

	soup = BeautifulSoup(response.content, 'html.parser')
	table = soup.find('table', {'id': 'statTableHTML'})
	if table is None:
		raise ValueError("No se encontró la tabla en la página.")
	return table
	
def process_table(url = None, table = None, col_names = None):
	"""Turn an HTML table into a DataFrame with the given column names.

	Args:
		url: Page to download the table from; only used when ``table`` is None.
		table: Already-parsed table element. Takes precedence over ``url``.
		col_names: Names for the columns, in cell order. Required.

	Returns:
		A DataFrame with one row per table row whose number of ``<td>`` cells
		equals ``len(col_names)``. The first ``<tr>`` is skipped as the header.
		The frame always carries ``col_names`` as its columns, even when it has
		no rows (download failed or nothing matched), so callers can reference
		the columns without a KeyError.

	Raises:
		TypeError: If ``col_names`` is not provided.
	"""
	if col_names is None:
		raise TypeError("process_table() requires col_names (list of column names)")
	col_names = list(col_names)

	if table is None:
		table = get_table(url)
		if table is None:
			return pd.DataFrame(columns=col_names)

	results = []
	for tr in table.find_all('tr')[1:]:  # Skip the header row
		tds = tr.find_all('td')
		# Rows with a different cell count (separators, footers, ...) are ignored.
		if len(tds) == len(col_names):
			results.append(extract_row(tds, col_names))

	return pd.DataFrame(results, columns=col_names)

# Dataset folders live under ./datasets relative to the current working directory.
dataset_path = os.path.join(os.getcwd(), 'datasets')
input_datasets = os.path.join(dataset_path, 'input')
output_datasets = os.path.join(dataset_path, 'output')

# DataFrame.to_csv does not create missing folders, so make sure the target exists.
os.makedirs(output_datasets, exist_ok=True)

In [3]:
# Yearly minutes spent on social media, scraped from a Statista table.
url1 = 'https://www.statista.com/statistics/433871/daily-social-media-usage-worldwide'
df1 = process_table(url=url1, col_names=['Año', 'Minutos'])

if df1.empty:
	print("No se obtuvieron datos de minutos por año; se omiten el CSV y el gráfico.")
else:
	# Save only when rows were scraped: writing unconditionally would replace a
	# previously saved CSV with an empty file whenever the request fails.
	df1.to_csv(os.path.join(output_datasets, 'sm_mins_growth.csv'), index=False)

	df1['Minutos'] = pd.to_numeric(df1['Minutos'], errors='coerce')
	df1 = df1.sort_values(by="Año")

	fig1 = px.bar(df1, x="Año", y="Minutos", title='Crecimiento anual de minutos empleados en redes sociales')
	fig1.update_layout(
		# The y axis starts at 50 (hard-coded) and leaves 10 units of headroom above the tallest bar.
		yaxis=dict(range=[50, df1["Minutos"].max() + 10], title="Minutos"),
		xaxis_title="Año",
		title=dict(x=0.5)
	)
	fig1.show()

In [4]:
# Most visited websites, scraped from a Statista table.
url2 = 'https://www.statista.com/statistics/1201880/most-visited-websites-worldwide/'
df2 = process_table(url=url2, col_names=['Página', 'Visitas'])

if df2.empty:
	print("No se obtuvieron datos de páginas más visitadas; se omiten el CSV y el gráfico.")
else:
	# Save only when rows were scraped: writing unconditionally would replace a
	# previously saved CSV with an empty file whenever the request fails.
	df2.to_csv(os.path.join(output_datasets, 'most_visited_websites.csv'), index=False)

	df2['Visitas'] = pd.to_numeric(df2['Visitas'], errors='coerce')
	df2 = df2.sort_values(by='Visitas', ascending=False).reset_index(drop=True)

	fig2 = px.pie(df2, values='Visitas', names='Página', title='Visitas (en miles de millones) en 2023')
	fig2.update_layout(title=dict(x=0.5))
	fig2.show()

In [5]:
# Yearly number of social media users (actual and estimated), scraped from prioridata.com.
url3 = 'https://prioridata.com/data/social-media-usage/'
# Same headers as get_table, plus a timeout so an unresponsive host cannot hang the cell.
response = requests.get(url3, headers=headers, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.content, 'html.parser')
tabla_usuarios = soup.find('table')
if tabla_usuarios is None:
	# Passing table=None would make process_table try to download url=None instead.
	raise ValueError("No se encontró la tabla en la página.")

df3 = process_table(col_names=['Año', 'Usuarios'], table=tabla_usuarios)
if df3.empty:
	raise ValueError("La tabla no contiene filas con el formato esperado (Año, Usuarios).")

# regex=False: '(estimated)' must be removed literally. Read as a regular
# expression the parentheses form a group and would be left behind in the text.
df3['Usuarios'] = df3['Usuarios'].str.replace(' billion', '', regex=False).astype(float)
df3["Año"] = df3["Año"].str.replace('(estimated)', '', regex=False).str.strip().astype(int)
df3.to_csv(os.path.join(output_datasets, 'sm_users_growth.csv'), index=False)

fig3 = px.line(df3, x='Usuarios', y='Año', orientation='h', title='Crecimiento anual de usuarios de redes sociales y estimación.')
fig3.update_layout(title=dict(x=0.5))
fig3.show()

In [6]:
## Time spent on social media in each country -> page 8 of the report
pdf = os.path.join(input_datasets, 'SocialReport.pdf')


def _hhmm_to_hours(value):
	"""Convert an ``H:MM`` duration string to decimal hours (NaN when unparseable)."""
	text = str(value).strip()
	hours, colon, minutes = text.partition(":")
	try:
		if colon:
			return int(hours) + int(minutes) / 60
		# No colon: accept a plain number as already being in hours.
		return float(text)
	except ValueError:
		return float("nan")


df_table = tabula.read_pdf(pdf, pages=8, stream=True, pandas_options={"header": None})[0]

# Columns 0-8 and 9-17 are handled as two halves of the same table. .copy()
# gives each half its own data, so the edits below do not act on a slice of
# df_table (the cause of pandas' SettingWithCopyWarning).
df_1 = df_table.iloc[:, :9].copy()
df_2 = df_table.iloc[:, 9:18].copy()

# Promote the first row to column labels; the column whose label is NaN becomes "País".
df_1.columns = df_1.iloc[0]
df_1 = df_1.drop(0)
df_1 = df_1.rename(columns={col: "País" for col in df_1.columns if pd.isna(col)})

# The second half gets the same labels so both halves can be stacked.
df_2.columns = df_1.columns
df_2 = df_2.drop(0)

df_vertical = pd.concat([df_1, df_2], ignore_index=True)

# Long format: one row per (country, year) pair.
df_long_social = pd.melt(df_vertical, id_vars=["País"], var_name="Año", value_name="Horas")

df_long_social["Año"] = pd.to_datetime(df_long_social["Año"], format="%Y").dt.year
# NOTE(review): values are assumed to be H:MM strings (e.g. "1:30"); confirm against
# the PDF. Swapping ":" for "." would read 1:30 as 1.30 instead of 1.5 hours.
df_long_social["Horas"] = df_long_social["Horas"].map(_hhmm_to_hours).astype(float)

country_options = [{'label': country, 'value': country} for country in df_long_social["País"].dropna().unique()]

Failed to import jpype dependencies. Fallback to subprocess.
No module named 'jpype'


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [7]:
# Hard-coded table: "País" lists the countries and every other key holds one list
# of values that is matched to the countries by position.
# NOTE(review): the lists are not all the same length. By count, "País" has 45
# entries, the first four metric lists have 44 and the two "Motivar compras" lists
# have 43, so values may be shifted relative to their country. Verify against the source.
data = {
	"País": [
		"Argentina", "Australia", "Austria", "Bélgica", "Brasil", "Canadá", "China", "Colombia", "Dinamarca",
		"Egipto", "Francia", "Alemania", "Ghana", "Hong Kong", "India", "Indonesia", "Irlanda", "Italia",
		"Japón", "Kenia", "Malasia", "México", "Marruecos", "Países Bajos", "Nueva Zelanda", "Nigeria",
		"Filipinas", "Polonia", "Portugal", "Rumanía", "Rusia", "Arabia Saudita", "Singapur", "Sudáfrica",
		"Corea del Sur", "España", "Suecia", "Suiza", "Taiwán", "Tailandia", "Turquía", "EAU", "Reino Unido", "EE.UU.", "Vietnam"
	],
	"Uso de redes sociales para mantenerse informado": [49, 34, 38, 38, 55, 39, 35, 56, 40, 49, 31, 32, 40, 40, 38, 48, 45, 42,
														30, 40, 61, 49, 30, 38, 40, 38, 65, 53, 53, 45, 49, 51, 26, 39, 49, 33,
														33, 59, 62, 35, 40, 33, 33, 44],
	"Seguir marcas que les gustan": [45, 36, 30, 30, 51, 35, 37, 55, 26, 40, 25, 26, 46, 46, 38, 36, 37, 36,
									  24, 51, 51, 53, 19, 30, 33, 26, 58, 48, 48, 47, 46, 57, 33, 49, 49, 26,
									  26, 50, 52, 36, 40, 33, 34, 37],
	"Descubrir marcas a través de comentarios": [38, 22, 18, 17, 36, 22, 24, 38, 19, 32, 13, 14, 22, 23, 23, 35,
												 24, 18, 13, 22, 40, 36, 17, 16, 26, 26, 39, 29, 29, 28, 21, 23,
												 18, 24, 26, 16, 19, 36, 28, 23, 25, 20, 19, 32],
	"Investigar productos/marcas usando redes sociales": [61, 31, 32, 32, 64, 32, 27, 53, 32, 33, 26, 24, 74,
														   40, 45, 39, 30, 29, 30, 76, 64, 61, 25, 35, 47, 74,
														   67, 47, 47, 44, 43, 57, 22, 39, 39, 28, 33, 58, 58,
														   51, 57, 27, 27, 68],
	"Motivar compras a través de 'Me gusta'": [25, 14, 11, 9, 20, 15, 27, 28, 9, 32, 11, 10, 23, 23, 22, 15, 15,
												14, 9, 32, 32, 24, 8, 15, 36, 36, 28, 22, 22, 20, 13, 28, 13, 19,
												15, 9, 33, 33, 23, 23, 14, 14, 25],
	"Motivar compras a través del botón 'Comprar'": [10, 5, 4, 4, 10, 6, 17, 12, 5, 19, 4, 4, 10, 10, 16, 7, 7,
													   5, 4, 0, 12, 11, 3, 3, 7, 15, 15, 8, 16, 13, 6, 15, 8, 6, 6,
													   5, 18, 14, 12, 12, 6, 6, 14]
}

# Wrapping each list in pd.Series lets the DataFrame accept lists of different
# lengths: shorter columns are padded with NaN at the end instead of raising.
df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in data.items()]))

# Long format: one row per (country, metric) pair.
df_long_use = df.melt(id_vars=["País"], var_name="Metrica", value_name="Porcentaje")


In [8]:
## Usage percentage of the most popular social networks in each country -> page 18 of the report
# Countries in the order the per-network lists below are matched to, by position.
countries = [
	"Australia", "Hong Kong", "India", "Indonesia", "Japón", "Malasia", "Nueva Zelanda",
	"Filipinas", "Singapur", "Corea del Sur", "Taiwán", "Tailandia", "Vietnam", "Austria",
	"Bélgica", "Dinamarca", "Francia", "Alemania", "Irlanda", "Italia", "Países Bajos", "Polonia",
	"Portugal", "Rumanía", "Rusia", "España", "Suecia", "Suiza", "Turquía", "Reino Unido",
	"Argentina", "Brasil", "Colombia", "México", "Egipto", "Ghana", "Kenia", "Marruecos",
	"Nigeria", "Arabia Saudita", "Sudáfrica", "EAU", "Canadá", "EE.UU."
]

# One list of values per network. NOTE(review): `data` (and `df` below) reuse the
# names assigned by the previous cell, so the earlier dict and frame are replaced.
data = {
	"País": countries,
	"Facebook": [80, 84, 82, 83, 34, 91, 84, 95, 80, 61, 88, 92, 93, 72, 80, 81, 74, 63, 79, 81, 73, 86, 89, 90, 39, 81, 81, 70, 82, 77, 89, 90, 92, 94, 91, 73, 79, 78, 80, 64, 83, 80, 81, 75],
	"FB Messenger": [65, 51, 57, 48, 11, 65, 72, 86, 51, 26, 57, 71, 75, 50, 65, 66, 54, 40, 64, 56, 49, 67, 74, 73, 14, 47, 67, 49, 58, 61, 61, 67, 71, 77, 70, 50, 46, 41, 58, 41, 61, 58, 62, 55],
	"Instagram": [49, 58, 68, 78, 33, 72, 48, 59, 61, 50, 54, 61, 51, 46, 46, 50, 39, 37, 54, 59, 46, 51, 66, 55, 51, 60, 65, 51, 85,49, 71, 75, 73, 66, 66, 48, 55, 44, 61, 62, 58, 61, 49, 50],
	"Snapchat": [28, 17, 32, 24, 2, 22, 32, 26, 20, 7, 8, 18, 17, 19, 26, 38, 32, 16, 34, 14, 23, 26, 21, 24, 7, 17, 37, 24, 32, 27, 20, 24, 28, 30, 27, 28, 17, 25, 24, 47, 25, 32, 27, 30],
	"Twitter": [27, 27, 52, 51, 49, 45, 25, 51, 36, 29, 27, 49, 37, 19, 26, 23, 29, 20, 42, 33, 26, 32, 35, 35, 19, 51, 28, 23, 60, 45, 47, 47, 54, 58, 50, 29, 40, 17, 35, 58, 45, 47, 37, 39],
	"WhatsApp": [29, 81, 75, 86, 2, 92, 29, 28, 85, 7, 19, 25, 23, 81, 59, 19, 35, 78, 68, 83, 85, 40, 67, 71, 67, 86, 29, 82, 85, 60, 90, 88, 90, 89, 78, 83, 87, 79, 86, 73, 91, 76, 27, 28]
}

# pd.Series wrapping tolerates lists of unequal length (shorter ones are NaN-padded).
df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in data.items()]))

# Long format: one row per (country, network) pair.
df_long_sm = df.melt(id_vars=["País"], var_name="Red", value_name="Porcentaje")

In [9]:
# Outer-join the three long tables on country and export the combined report.
# df_long_use and df_long_sm both carry a "Porcentaje" column, so pandas tells
# them apart with its default "_x" / "_y" suffixes in the result.
merged_1 = df_long_social.merge(df_long_use, how="outer", on="País")
df_combined = merged_1.merge(df_long_sm, how="outer", on="País")

report_path = os.path.join(output_datasets, 'social_media_usage_report.csv')
df_combined.to_csv(report_path, index=False)

In [10]:
app = dash.Dash(__name__)


def _country_options(frame):
	"""Return Dropdown options for the distinct non-null "País" values of ``frame``."""
	return [{'label': country, 'value': country} for country in frame["País"].dropna().unique()]


def _country_tab(label, dropdown_id, graph_id, options):
	"""Build one tab holding a country dropdown and the graph it drives."""
	return dcc.Tab(label=label, children=[
		html.H3("Seleccione un país:", style={'color': 'white'}),
		dcc.Dropdown(
			id=dropdown_id,
			options=options,
			# Start on the first country; stay unselected when there are none.
			value=options[0]['value'] if options else None,
			style={'width': '50%'}
		),
		dcc.Graph(id=graph_id)
	])


# Each dropdown lists the countries of the table its chart filters. The three
# tables do not cover the same countries, so one shared list would offer
# entries that produce an empty chart.
app.layout = html.Div([
	html.H1("Análisis de Uso de Redes Sociales", style={'textAlign': 'center', 'color': 'white'}),

	dcc.Tabs([
		_country_tab('Tiempo usado en redes sociales', 'dropdown-line', 'line-chart', country_options),
		_country_tab('Porcentaje de uso de cada red', 'dropdown-pie-sm', 'pie-chart-sm', _country_options(df_long_sm)),
		_country_tab('Motivo de uso de las redes', 'dropdown-pie', 'pie-chart', _country_options(df_long_use))
	])
])

@app.callback(
	Output('line-chart', 'figure'),
	Input('dropdown-line', 'value')
)
def update_line_chart(country_name):
	"""Redraw the hours-per-year line chart for the selected country.

	Args:
		country_name: Value of the 'dropdown-line' dropdown; ``None`` when the
			selection has been cleared.

	Returns:
		A line figure built from the ``df_long_social`` rows of that country.

	Raises:
		dash.exceptions.PreventUpdate: When no country is selected, so the
			current figure is kept instead of drawing an empty chart titled "None".
	"""
	if not country_name:
		raise dash.exceptions.PreventUpdate

	country_data = df_long_social[df_long_social["País"] == country_name]

	fig = px.line(
		country_data,
		x="Año",
		y="Horas",
		color="País",
		title=f"Tiempo promedio en redes sociales por país: {country_name}",
		labels={"Horas": "Horas por día", "Año": "Año"},
	)

	# Richer hover text; the country name is supplied to the template through customdata.
	fig.update_traces(
		hovertemplate="<b>Año:</b> %{x}<br>" +
					  "<b>Horas por día:</b> %{y:.2f}<br>" +
					  "<b>País:</b> %{customdata[0]}",
		customdata=country_data[["País"]]
	)

	return fig

@app.callback(
	Output('pie-chart-sm', 'figure'),
	Input('dropdown-pie-sm', 'value')
)
def update_pie_chart_sm(country_name):
	"""Redraw the per-network usage pie for the selected country.

	Args:
		country_name: Value of the 'dropdown-pie-sm' dropdown; ``None`` when
			the selection has been cleared.

	Returns:
		A pie figure built from the ``df_long_sm`` rows of that country.

	Raises:
		dash.exceptions.PreventUpdate: When no country is selected. Without
			this guard the hover template below would fail concatenating
			``None`` to a string.
	"""
	if not country_name:
		raise dash.exceptions.PreventUpdate

	country_data = df_long_sm[df_long_sm["País"] == country_name]

	fig = px.pie(
		country_data,
		values="Porcentaje",
		names="Red",
		title=f"Distribución de uso de redes sociales en {country_name}",
		labels={"Porcentaje": "Porcentaje", "Red": "Red Social"},
	)

	fig.update_traces(
		hovertemplate="<b>Red Social:</b> %{label}<br>" +
					  "<b>Porcentaje de uso:</b> %{value:.1f}%<br>" +
					  "<b>País:</b> " + country_name
	)

	return fig

@app.callback(
	Output('pie-chart', 'figure'),
	Input('dropdown-pie', 'value')
)
def update_pie_chart(country_name):
	"""Redraw the reasons-for-use pie for the selected country.

	Args:
		country_name: Value of the 'dropdown-pie' dropdown; ``None`` when the
			selection has been cleared.

	Returns:
		A pie figure built from the ``df_long_use`` rows of that country.

	Raises:
		dash.exceptions.PreventUpdate: When no country is selected. Without
			this guard the hover template below would fail concatenating
			``None`` to a string.
	"""
	if not country_name:
		raise dash.exceptions.PreventUpdate

	country_data = df_long_use[df_long_use["País"] == country_name]

	fig = px.pie(
		country_data,
		values="Porcentaje",
		names="Metrica",
		title=f"Motivos de uso de redes sociales en {country_name}",
		labels={"Porcentaje": "Porcentaje", "Metrica": "Motivo"},
	)

	fig.update_traces(
		hovertemplate="<b>Motivo:</b> %{label}<br>" +
					  "<b>Porcentaje:</b> %{value:.1f}%<br>" +
					  "<b>País:</b> " + country_name
	)

	return fig

# Serve the app on the first free port in the candidate range.
port_range = range(8040, 8060)
# `run_server` is the legacy name of the entry point; use `run` when this Dash version has it.
run_app = getattr(app, "run", None) or app.run_server
for port in port_range:
	# Probe before starting: depending on the installed server stack, a busy port
	# may end the process instead of raising OSError, which would defeat the
	# retry below. A successful connection means something already listens there.
	with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
		port_busy = probe.connect_ex(("127.0.0.1", port)) == 0
	if port_busy:
		continue
	try:
		run_app(port=port, debug=False)
		break
	except OSError:
		continue

In [11]:
# url = 'https://es.statista.com/estadisticas/1260093/redes-sociales-porcentaje-de-usuarios-por-edad-en-espana/'
# response = requests.get(url, headers=headers)
# response.raise_for_status()
# soup = BeautifulSoup(response.content, 'html.parser')

# data_chart = soup.find('div', {'id': 'highcharts-29cxzig-0'})

In [12]:
svg = """<svg version="1.1" class="highcharts-root" style="font-family:&quot;Lucida Grande&quot;, &quot;Lucida Sans Unicode&quot;, Arial, Helvetica, sans-serif;font-size:12px;" xmlns="http://www.w3.org/2000/svg" width="705" height="450" viewBox="0 0 705 450"><desc>Created with Highcharts 7.2.2</desc><defs><clipPath id="highcharts-29cxzig-1-"><rect x="0" y="0" width="388" height="606" fill="none"></rect></clipPath><clipPath id="highcharts-29cxzig-8-"><rect x="79" y="10" width="606" height="388" fill="none"></rect></clipPath></defs><rect fill="#ffffff" class="highcharts-background" x="0" y="0" width="705" height="450" rx="0" ry="0"></rect><rect fill="none" class="highcharts-plot-background" x="79" y="10" width="606" height="388"></rect><g class="highcharts-plot-bands-0" data-z-index="0"><path fill="#fafafa" class="highcharts-plot-band " d="M 79 75.5 L 685 75.5 685 139.5 79 139.5 z"></path><path fill="#fafafa" class="highcharts-plot-band " d="M 79 204.5 L 685 204.5 685 269.5 79 269.5 z"></path><path fill="#fafafa" class="highcharts-plot-band " d="M 79 333.5 L 685 333.5 685 398.5 79 398.5 z"></path></g><g class="highcharts-grid highcharts-xaxis-grid" data-z-index="1"><path fill="none" stroke-dasharray="none,none" data-z-index="1" class="highcharts-grid-line" d="M 79 75.5 L 685 75.5" opacity="1"></path><path fill="none" stroke-dasharray="none,none" data-z-index="1" class="highcharts-grid-line" d="M 79 139.5 L 685 139.5" opacity="1"></path><path fill="none" stroke-dasharray="none,none" data-z-index="1" class="highcharts-grid-line" d="M 79 204.5 L 685 204.5" opacity="1"></path><path fill="none" stroke-dasharray="none,none" data-z-index="1" class="highcharts-grid-line" d="M 79 269.5 L 685 269.5" opacity="1"></path><path fill="none" stroke-dasharray="none,none" data-z-index="1" class="highcharts-grid-line" d="M 79 333.5 L 685 333.5" opacity="1"></path><path fill="none" stroke-dasharray="none,none" data-z-index="1" class="highcharts-grid-line" d="M 79 398.5 L 685 398.5" 
opacity="1"></path><path fill="none" stroke-dasharray="none,none" data-z-index="1" class="highcharts-grid-line" d="M 79 10.5 L 685 10.5" opacity="1"></path></g><g class="highcharts-grid highcharts-yaxis-grid" data-z-index="1"><path fill="none" stroke="#cdcdcd" stroke-width="1" stroke-dasharray="1,1" data-z-index="1" class="highcharts-grid-line" d="M 78.5 10 L 78.5 398" opacity="1"></path><path fill="none" stroke="#cdcdcd" stroke-width="1" stroke-dasharray="1,1" data-z-index="1" class="highcharts-grid-line" d="M 139.5 10 L 139.5 398" opacity="1"></path><path fill="none" stroke="#cdcdcd" stroke-width="1" stroke-dasharray="1,1" data-z-index="1" class="highcharts-grid-line" d="M 199.5 10 L 199.5 398" opacity="1"></path><path fill="none" stroke="#cdcdcd" stroke-width="1" stroke-dasharray="1,1" data-z-index="1" class="highcharts-grid-line" d="M 260.5 10 L 260.5 398" opacity="1"></path><path fill="none" stroke="#cdcdcd" stroke-width="1" stroke-dasharray="1,1" data-z-index="1" class="highcharts-grid-line" d="M 320.5 10 L 320.5 398" opacity="1"></path><path fill="none" stroke="#cdcdcd" stroke-width="1" stroke-dasharray="1,1" data-z-index="1" class="highcharts-grid-line" d="M 381.5 10 L 381.5 398" opacity="1"></path><path fill="none" stroke="#cdcdcd" stroke-width="1" stroke-dasharray="1,1" data-z-index="1" class="highcharts-grid-line" d="M 442.5 10 L 442.5 398" opacity="1"></path><path fill="none" stroke="#cdcdcd" stroke-width="1" stroke-dasharray="1,1" data-z-index="1" class="highcharts-grid-line" d="M 502.5 10 L 502.5 398" opacity="1"></path><path fill="none" stroke="#cdcdcd" stroke-width="1" stroke-dasharray="1,1" data-z-index="1" class="highcharts-grid-line" d="M 563.5 10 L 563.5 398" opacity="1"></path><path fill="none" stroke="#cdcdcd" stroke-width="1" stroke-dasharray="1,1" data-z-index="1" class="highcharts-grid-line" d="M 623.5 10 L 623.5 398" opacity="1"></path><path fill="none" stroke="#cdcdcd" stroke-width="1" stroke-dasharray="1,1" data-z-index="1" 
class="highcharts-grid-line" d="M 685.5 10 L 685.5 398" opacity="1"></path></g><rect fill="none" class="highcharts-plot-border" data-z-index="1" x="79" y="10" width="606" height="388"></rect><g class="highcharts-axis highcharts-xaxis" data-z-index="2"><text x="21.8125" data-z-index="7" text-anchor="middle" transform="translate(0,0) rotate(270 21.8125 204)" class="highcharts-axis-title" style="color:#4f4f4f;font-size:10px;font-weight:normal;fill:#4f4f4f;" y="204"><tspan>Años</tspan></text><path fill="none" class="highcharts-axis-line" data-z-index="7" d="M 79 10 L 79 398"></path></g><g class="highcharts-axis highcharts-yaxis" data-z-index="2"><text x="382" data-z-index="7" text-anchor="middle" transform="translate(0,0)" class="highcharts-axis-title" style="color:#808080;font-size:10px;font-weight:normal;fill:#808080;" y="433"><tspan>Porcentaje de población</tspan></text><path fill="none" class="highcharts-axis-line" data-z-index="7" d="M 79 398 L 685 398"></path></g><g class="highcharts-series-group" data-z-index="3"><g data-z-index="0.1" class="highcharts-series highcharts-series-0 highcharts-bar-series highcharts-color-0 highcharts-tracker" transform="translate(685,398) rotate(90) scale(-1,1) scale(1 1)" style="cursor:pointer;" clip-path="url(#highcharts-29cxzig-1-)" width="606" height="388"><rect x="338" y="62" width="37" height="545" fill="rgb(40,118,221)" opacity="1" class="highcharts-point highcharts-color-0"></rect><rect x="273" y="89" width="37" height="518" fill="rgb(40,118,221)" opacity="1" class="highcharts-point highcharts-color-0"></rect><rect x="208" y="151" width="37" height="456" fill="rgb(40,118,221)" opacity="1" class="highcharts-point highcharts-color-0"></rect><rect x="144" y="223" width="37" height="384" fill="rgb(40,118,221)" opacity="1" class="highcharts-point highcharts-color-0"></rect><rect x="79" y="322" width="37" height="285" fill="rgb(40,118,221)" opacity="1" class="highcharts-point highcharts-color-0"></rect><rect x="14" y="416" 
width="37" height="191" fill="rgb(40,118,221)" opacity="1" class="highcharts-point highcharts-color-0"></rect></g><g data-z-index="0.1" class="highcharts-markers highcharts-series-0 highcharts-bar-series highcharts-color-0" transform="translate(685,398) rotate(90) scale(-1,1) scale(1 1)" clip-path="none"></g></g><g data-z-index="3" class="highcharts-data-labels highcharts-series-0 highcharts-bar-series highcharts-color-0 highcharts-tracker" transform="translate(79,10) scale(1 1)" opacity="1" style="cursor:pointer;"><g class="highcharts-label highcharts-data-label highcharts-data-label-color-0" data-z-index="1" transform="translate(544,20)" style="text-shadow:null;"><text x="5" data-z-index="1" style="font-size:11px;font-weight:bold;color:#4f4f4f;cursor:pointer;fill:#4f4f4f;" y="16"><tspan x="5" y="16" class="highcharts-text-outline" fill="#FFFFFF" stroke="#FFFFFF" stroke-width="2px" stroke-linejoin="round" style="">90%</tspan><tspan x="5" y="16">90%</tspan></text></g><g class="highcharts-label highcharts-data-label highcharts-data-label-color-0" data-z-index="1" transform="translate(517,85)" style="text-shadow:null;"><text x="5" data-z-index="1" style="font-size:11px;font-weight:bold;color:#4f4f4f;cursor:pointer;fill:#4f4f4f;" y="16"><tspan x="5" y="16" class="highcharts-text-outline" fill="#FFFFFF" stroke="#FFFFFF" stroke-width="2px" stroke-linejoin="round" style="">85,4%</tspan><tspan x="5" y="16">85,4%</tspan></text></g><g class="highcharts-label highcharts-data-label highcharts-data-label-color-0" data-z-index="1" transform="translate(455,150)" style="text-shadow:null;"><text x="5" data-z-index="1" style="font-size:11px;font-weight:bold;color:#4f4f4f;cursor:pointer;fill:#4f4f4f;" y="16"><tspan x="5" y="16" class="highcharts-text-outline" fill="#FFFFFF" stroke="#FFFFFF" stroke-width="2px" stroke-linejoin="round">75,3%</tspan><tspan x="5" y="16">75,3%</tspan></text></g><g class="highcharts-label highcharts-data-label highcharts-data-label-color-0" 
data-z-index="1" transform="translate(383,214)" style="text-shadow:null;"><text x="5" data-z-index="1" style="font-size:11px;font-weight:bold;color:#4f4f4f;cursor:pointer;fill:#4f4f4f;" y="16"><tspan x="5" y="16" class="highcharts-text-outline" fill="#FFFFFF" stroke="#FFFFFF" stroke-width="2px" stroke-linejoin="round">63,4%</tspan><tspan x="5" y="16">63,4%</tspan></text></g><g class="highcharts-label highcharts-data-label highcharts-data-label-color-0" data-z-index="1" transform="translate(284,279)" style="text-shadow:null;"><text x="5" data-z-index="1" style="font-size:11px;font-weight:bold;color:#4f4f4f;cursor:pointer;fill:#4f4f4f;" y="16"><tspan x="5" y="16" class="highcharts-text-outline" fill="#FFFFFF" stroke="#FFFFFF" stroke-width="2px" stroke-linejoin="round">47,1%</tspan><tspan x="5" y="16">47,1%</tspan></text></g><g class="highcharts-label highcharts-data-label highcharts-data-label-color-0" data-z-index="1" transform="translate(190,344)" style="text-shadow:null;"><text x="5" data-z-index="1" style="font-size:11px;font-weight:bold;color:#4f4f4f;cursor:pointer;fill:#4f4f4f;" y="16"><tspan x="5" y="16" class="highcharts-text-outline" fill="#FFFFFF" stroke="#FFFFFF" stroke-width="2px" stroke-linejoin="round">31,6%</tspan><tspan x="5" y="16">31,6%</tspan></text></g></g><g class="highcharts-exporting-group" data-z-index="3"></g><text x="348" text-anchor="middle" class="highcharts-title" data-z-index="4" style="color:#333333;font-size:18px;fill:#333333;" y="24"></text><text x="348" text-anchor="middle" class="highcharts-subtitle" data-z-index="4" style="color:#666666;fill:#666666;" y="24"></text><text x="10" text-anchor="start" class="highcharts-caption" data-z-index="4" style="color:#666666;fill:#666666;" y="447"></text><g class="highcharts-plot-lines-6" data-z-index="6"><path fill="none" class="highcharts-plot-line " stroke="#121212" stroke-width="1" d="M 78.5 10 L 78.5 398"></path></g><g class="highcharts-axis-labels highcharts-xaxis-labels" 
data-z-index="7"><text x="66" style="color:#666666;cursor:default;font-size:11px;padding-bottom:0px;line-height:;text-align:right;text-overflow:none;fill:#666666;" text-anchor="end" transform="translate(0,0)" y="47" opacity="1">16-24</text><text x="66" style="color:#666666;cursor:default;font-size:11px;padding-bottom:0px;line-height:;text-align:right;text-overflow:none;fill:#666666;" text-anchor="end" transform="translate(0,0)" y="112" opacity="1">25-34</text><text x="66" style="color:#666666;cursor:default;font-size:11px;padding-bottom:0px;line-height:;text-align:right;text-overflow:none;fill:#666666;" text-anchor="end" transform="translate(0,0)" y="176" opacity="1">35-44</text><text x="66" style="color:#666666;cursor:default;font-size:11px;padding-bottom:0px;line-height:;text-align:right;text-overflow:none;fill:#666666;" text-anchor="end" transform="translate(0,0)" y="241" opacity="1">45-54</text><text x="66" style="color:#666666;cursor:default;font-size:11px;padding-bottom:0px;line-height:;text-align:right;text-overflow:none;fill:#666666;" text-anchor="end" transform="translate(0,0)" y="306" opacity="1">55-64</text><text x="66" style="color:#666666;cursor:default;font-size:11px;padding-bottom:0px;line-height:;text-align:right;text-overflow:none;fill:#666666;" text-anchor="end" transform="translate(0,0)" y="370" opacity="1">65-74</text></g><g class="highcharts-axis-labels highcharts-yaxis-labels" data-z-index="7"></g><g class="highcharts-label highcharts-tooltip                                   highcharts-color-0" style="pointer-events:none;white-space:nowrap;z-index:9999;" data-z-index="8" transform="translate(202,-9999)" opacity="0" visibility="visible"><path fill="rgba(255,255,255,1)" class="highcharts-label-box highcharts-tooltip-box" d="M 2.5 0.5 L 160.5 0.5 C 162.5 0.5 162.5 0.5 162.5 2.5 L 162.5 75.5 C 162.5 77.5 162.5 77.5 160.5 77.5 L 2.5 77.5 C 0.5 77.5 0.5 77.5 0.5 75.5 L 0.5 2.5 C 0.5 0.5 0.5 0.5 2.5 0.5" stroke="#aaaaaa" 
stroke-width="1"></path></g><g class="highcharts-control-points" data-z-index="99" clip-path="url(#highcharts-29cxzig-8-)"></g></svg>"""

In [13]:
# Read the chart values straight from the SVG structure. An explicit parser
# keeps the result independent of which optional parsers are installed.
svg_soup = BeautifulSoup(svg, 'html.parser')

# Age brackets are the tick labels of the x axis, in document order.
eje_x = svg_soup.find('g', class_='highcharts-xaxis-labels')
ages = [etiqueta.get_text(strip=True) for etiqueta in eje_x.find_all('text')] if eje_x is not None else []

# Every data label is drawn twice (a white outline copy plus the visible text).
# Taking only the outline copies yields each value exactly once and in document
# order, so equal or non-monotonic percentages still line up with their age bracket.
porcentajes = [
	float(etiqueta.get_text(strip=True).replace('%', '').replace(',', '.'))
	for etiqueta in svg_soup.find_all('tspan', class_='highcharts-text-outline')
]

if not ages or len(ages) != len(porcentajes):
	raise ValueError(
		f"El SVG no tiene el formato esperado: {len(ages)} rangos de edad y {len(porcentajes)} porcentajes."
	)

df = pd.DataFrame({'Edades': ages, 'Porcentaje': porcentajes})
df.to_csv(os.path.join(output_datasets, 'social_media_usage_by_age.csv'), index=False)

fig = px.bar(df, x='Edades', y='Porcentaje', title='Porcentaje de usuarios de redes sociales por edades en España')

fig.update_traces(
	hovertemplate="<b>Edad:</b> %{x}<br>" +
				  "<b>Porcentaje de usuarios:</b> %{y:.2f}%"
)

fig.show()

In [14]:
# Load the table of studies and show its column names.
# NOTE(review): the file is read from the output folder, but no cell shown here writes it.
ruta_estudios = os.path.join(output_datasets, 'bmj_tables.csv')
estudios = pd.read_csv(ruta_estudios)
estudios.columns

Index(['Author and Year', 'Study Design', 'Study Period', 'Country', 'Equity',
       'Participants', 'Mean Age [Range]', 'Risk of Bias', 'Number of dp',
       'MA?', 'Exposure', 'Exposure Measure', 'Outcome Measure', 'N'],
      dtype='object')

In [15]:
# Rows whose exposure measure is "Seen favourite food advertised on SM".
estudios.loc[estudios['Exposure Measure'].eq('Seen favourite food advertised on SM')]

Unnamed: 0,Author and Year,Study Design,Study Period,Country,Equity,Participants,Mean Age [Range],Risk of Bias,Number of dp,MA?,Exposure,Exposure Measure,Outcome Measure,N
5,Baldwin 2018,Cross-sectional,2014,Australia,High income country with mixed SEP,Adolescents residing in New South Wales,NR [10-16],Low,7,Yes,Exposure to health-risk behaviour content,Seen favourite food advertised on SM,Freq. of unhealthy food consumption,417
7,Baldwin 2018,Cross-sectional,2014,Australia,High income country with mixed SEP,Adolescents residing in New South Wales,NR [10-16],Low,7,Yes,Exposure to health-risk behaviour content,Seen favourite food advertised on SM,Freq. of unhealthy drink consumption,204


In [31]:
# Keep only studies with a numeric participant count, stored as integers.
# Built as one chain that returns a new frame instead of assigning into a
# filtered result.
estudios = (
    estudios
    .assign(N=pd.to_numeric(estudios['N'], errors='coerce'))
    .dropna(subset=['N'])
    .astype({'N': int})
)
# dropna() keeps NaN out of the dropdown options.
exposiciones = estudios['Exposure Measure'].dropna().unique()
autores = estudios['Author and Year'].dropna().unique()

app = dash.Dash(__name__)


def _dropdown_tab(label, prompt, dropdown_id, graph_id, values):
    """Build one tab with a prompt, a dropdown over ``values`` and the graph it drives."""
    return dcc.Tab(label=label, children=[
        html.H3(prompt, style={'color': 'white'}),
        dcc.Dropdown(
            id=dropdown_id,
            options=[{'label': value, 'value': value} for value in values],
            # Start on the first entry; stay unselected when there are none.
            value=values[0] if len(values) else None,
            style={'width': '50%'}
        ),
        dcc.Graph(id=graph_id)
    ])


app.layout = html.Div([
    html.H1("Efectos de las redes sociales", style={'textAlign': 'center', 'color': 'white'}),

    dcc.Tabs([
        _dropdown_tab('Resultados por exposición', "Seleccione una exposición:",
                      'dropdown-exposure', 'pie-chart-exposures', exposiciones),

        _dropdown_tab('Distribución de participantes por estudio', "Seleccione un autor:",
                      'dropdown-author', 'bar-chart-participants', autores),

        _dropdown_tab('Relación entre edad media y participantes', "Seleccione una exposición:",
                      'scatter-exposure', 'scatter-plot-age-n', exposiciones),

        dcc.Tab(label='Tabla interactiva de estudios', children=[
            html.H3("Datos completos de los estudios:", style={'color': 'white'}),
            # DataTable comes from dash.dash_table (imported in the first cell).
            DataTable(
                id='table',
                columns=[{'name': col, 'id': col} for col in estudios.columns],
                data=estudios.to_dict('records'),
                filter_action='native',
                sort_action='native',
                style_table={'overflowX': 'auto'},
                style_cell={'textAlign': 'left'},
            )
        ]),

        _dropdown_tab('Mapa geográfico', "Seleccione una exposición:",
                      'geo-exposure', 'geo-map', exposiciones),
    ])
])

# Callback for the pie chart
@app.callback(
    Output('pie-chart-exposures', 'figure'),
    Input('dropdown-exposure', 'value')
)
def update_pie_chart(exposure):
    """Return a pie of participants (N) per study for the selected exposure measure.

    Note: an earlier cell defines a function with this same name for a
    different app; each one is registered with its own app when decorated.
    """
    matches_exposure = estudios['Exposure Measure'] == exposure
    pie = px.pie(
        estudios[matches_exposure],
        values='N',
        names='Author and Year',
        labels={'N': 'Participantes', 'Author and Year': 'Estudio'},
        title=f"Distribución de estudios para la exposición: {exposure}",
    )
    hover_text = "<b>Estudio:</b> %{label}<br><b>Participantes:</b> %{value}<br><b>Porcentaje:</b> %{percent}"
    pie.update_traces(hovertemplate=hover_text, textinfo='percent+label')
    return pie

# Callback for the bar chart
@app.callback(
    Output('bar-chart-participants', 'figure'),
    Input('dropdown-author', 'value')
)
def update_bar_chart(author):
    """Return a bar chart of participants (N) per exposure measure for the selected study."""
    by_author = estudios['Author and Year'] == author
    bars = px.bar(
        estudios[by_author],
        y='N',
        x='Exposure Measure',
        color='Exposure Measure',
        labels={'N': 'Participantes', 'Exposure Measure': 'Medida de Exposición'},
        title=f"Número de participantes por exposición para {author}",
    )
    axis_titles = {
        'xaxis_title': "Medida de Exposición",
        'yaxis_title': "Número de Participantes",
    }
    bars.update_layout(**axis_titles)
    return bars

# Callback for the scatter plot
@app.callback(
    Output('scatter-plot-age-n', 'figure'),
    Input('scatter-exposure', 'value')
)
def update_scatter_plot(exposure):
    """Return a scatter of 'Mean Age [Range]' against participants (N) for the selected exposure.

    Points are coloured by study and sized by N.
    """
    matches_exposure = estudios['Exposure Measure'] == exposure
    scatter = px.scatter(
        estudios[matches_exposure],
        y='N',
        x='Mean Age [Range]',
        size='N',
        color='Author and Year',
        labels={'Mean Age [Range]': 'Edad Media [Rango]', 'N': 'Número de Participantes'},
        title=f"Relación entre edad media y participantes para {exposure}",
    )
    axis_titles = {
        'xaxis_title': "Edad Media [Rango]",
        'yaxis_title': "Número de Participantes",
    }
    scatter.update_layout(**axis_titles)
    return scatter

# Callback for the geographic map
@app.callback(
    Output('geo-map', 'figure'),
    Input('geo-exposure', 'value')
)
def update_geo_map(exposure):
    """Return a choropleth of participants (N) by country for the selected exposure measure.

    Countries are matched by name through the 'Country' column.
    """
    matches_exposure = estudios['Exposure Measure'] == exposure
    return px.choropleth(
        estudios[matches_exposure],
        color='N',
        locations='Country',
        locationmode='country names',
        labels={'N': 'Número de Participantes', 'Country': 'País'},
        title=f"Distribución geográfica de participantes para {exposure}",
    )


# Serve the app on the first free port in the candidate range.
port_range = range(8040, 8060)
# `run_server` is the legacy name of the entry point; use `run` when this Dash version has it.
run_app = getattr(app, "run", None) or app.run_server
for port in port_range:
	# Probe before starting: depending on the installed server stack, a busy port
	# may end the process instead of raising OSError, which would defeat the
	# retry below. A successful connection means something already listens there.
	with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
		port_busy = probe.connect_ex(("127.0.0.1", port)) == 0
	if port_busy:
		continue
	try:
		run_app(port=port, debug=False)
		break
	except OSError:
		continue