tEST

# Importing Libraries and Dataframes

In [4]:
from dataextraction import *

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

There are many dataframes to check; this loop helps keep track of the imported variables.

In [3]:
# Gather every name visible in the current namespace whose identifier ends with "df".
imported_variables = list(filter(lambda name: name.endswith('df'), dir()))

# Show the collected names, one per line.
for variable in imported_variables:
    print(variable)


acceso_internet_fijo_por_rangos_velocidad_bajada_provincia_df
acceso_internet_fijo_por_tecnologia_provincia_df
accesos_banda_ancha_banda_angosta_por_provincia_df
accesos_internet_fijo_por_tecnología_localidad_df
accesos_internet_fijo_por_velocidad_bajada_provincia_df
accesos_internet_fijo_por_velocidad_localidad_df
conectividad_al_servicio_de_internet_df
indicadores_macroeconomicos_df
ingresos_por_operación_servicio_internet_fijo_df
listado_localidades_conectividad_internet_df
penetracion_internet_fijo_por_provincia_cada_100_habitantes_df
penetracion_internet_fijo_por_provincia_cada_100_hogares_df
penetracion_nacional_internet_fijo_df
total_nacional_accesos_internet_fijo_por_banda_ancha_banda_angosta_df
total_nacional_accesos_internet_fijo_por_tecnologia_df
total_nacional_accesos_internet_fijo_por_velocidad_bajada_df
velocidad_media_bajada_internet_nacional_df
velocidad_media_bajada_internet_por_provincia_df


# Setting up the data set

## Creating a general table for regional and national data

The dataframes `penetracion_internet_fijo_por_provincia_cada_100_habitantes_df` and `penetracion_internet_fijo_por_provincia_cada_100_hogares_df` are exactly the same except for one column, so they can be merged into one table.

In [29]:
# Check whether the two penetration tables hold the same data once the one
# column that differs between them is removed from each.
(
    penetracion_internet_fijo_por_provincia_cada_100_habitantes_df
    .drop(columns="Accesos por cada 100 hab")
    .equals(
        penetracion_internet_fijo_por_provincia_cada_100_hogares_df
        .drop(columns="Accesos por cada 100 hogares")
    )
)

True

In [33]:
# Combine both regional penetration tables into one, keeping only the rows whose
# "Año", "Trimestre" and "Provincia" values appear in both frames.
penetracion_regional_internet_fijo_df = (
    penetracion_internet_fijo_por_provincia_cada_100_hogares_df
    .merge(
        penetracion_internet_fijo_por_provincia_cada_100_habitantes_df,
        how="inner",
        on=["Año", "Trimestre", "Provincia"],
    )
)

penetracion_regional_internet_fijo_df.head()

Unnamed: 0,Año,Trimestre,Provincia,Accesos por cada 100 hogares,Accesos por cada 100 hab
0,2022,3,Buenos Aires,78.11,26
1,2022,3,Capital Federal,122.28,50
2,2022,3,Catamarca,65.33,17
3,2022,3,Chaco,43.86,12
4,2022,3,Chubut,84.38,26


In [34]:
# Summarize the merged regional table: column names, non-null counts and dtypes.
penetracion_regional_internet_fijo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 5 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Año                           840 non-null    object
 1   Trimestre                     840 non-null    object
 2   Provincia                     840 non-null    object
 3   Accesos por cada 100 hogares  840 non-null    object
 4   Accesos por cada 100 hab      840 non-null    object
dtypes: object(5)
memory usage: 32.9+ KB


In [5]:
# Preview the national penetration table. Its last column ("Periodo") is redundant
# because the quarter is already given by the "Trimestre" column.
penetracion_nacional_internet_fijo_df.head()

Unnamed: 0,Año,Trimestre,Accesos por cada 100 hogares,Accesos por cada 100 hab,Periodo
0,2022,3,76.64,23.95,Jul-Sept 2022
1,2022,2,75.97,23.72,Abr-Jun 2022
2,2022,1,73.88,23.05,Ene-Mar 2022
3,2021,4,73.18,22.81,Oct-Dic 2021
4,2021,3,70.58,21.98,Jul-Sept 2021


In [6]:
# Remove the redundant "Periodo" label column. errors="ignore" keeps this cell
# re-runnable: the frame is reassigned to its own name, so a second execution
# would otherwise raise KeyError because "Periodo" is already gone.
penetracion_nacional_internet_fijo_df = penetracion_nacional_internet_fijo_df.drop(
    columns="Periodo", errors="ignore"
)

We can go even further: to have more consolidated data, it's possible to add more attributes to these tables. Here we are going to do it for the national data.

In [50]:
# Build the consolidated national table by left-joining each national source on
# "Año" and "Trimestre".
# The mean-download-speed frame is sliced by position to its first three columns
# because its last column is duplicated and must not be carried into the result.
general_nacional = (
    penetracion_nacional_internet_fijo_df
    .merge(
        velocidad_media_bajada_internet_nacional_df.iloc[:, :3],
        how="left",
        on=["Año", "Trimestre"],
    )
    .merge(
        ingresos_por_operación_servicio_internet_fijo_df.drop(columns="Periodo"),
        how="left",
        on=["Año", "Trimestre"],
    )
    .merge(
        indicadores_macroeconomicos_df,
        how="left",
        on=["Año", "Trimestre"],
    )
    .merge(
        total_nacional_accesos_internet_fijo_por_banda_ancha_banda_angosta_df.drop(columns="Periodo"),
        how="left",
        on=["Año", "Trimestre"],
    )
)

general_nacional.head()

Unnamed: 0,Año,Trimestre,Accesos por cada 100 hogares,Accesos por cada 100 hab,Mbps (Media de bajada),Ingresos (miles de pesos),IPC US,IPC AR,USDARS oficial,USDARS blue,PBI millones,Banda ancha fija,Dial up,Total
0,2022,3,76.64,23.95,62.46,67055930,,,,,,11078691,12437,11091128
1,2022,2,75.97,23.72,58.44,60335724,,,,,,10946248,12436,10958684
2,2022,1,73.88,23.05,55.11,55589997,,,,,,10611390,12619,10624009
3,2021,4,73.18,22.81,52.34,45467887,,,,,,10476933,12861,10489794
4,2021,3,70.58,21.98,48.46,42999944,,,,,,10075184,10357,10085541


There are a lot of null values, because there aren't macroeconomic indicators after 2018.

In [51]:
# Strip commas from the string cells; regex=True makes "," match inside a value
# (e.g. within a number) instead of only cells that are exactly ",".
general_nacional = general_nacional.replace(to_replace=",", value="", regex=True)

In [52]:
# Save the consolidated national table as CSV, leaving out the index column.
general_nacional.to_csv(path_or_buf='../processed_data/general_nacional.csv', index=False)

Now we do the same with regional data. In this case, there is no economic data at regional level.

In [55]:
# Build the consolidated regional table by left-joining the per-province sources
# on "Año", "Trimestre" and "Provincia".
general_provincia = (
    penetracion_regional_internet_fijo_df
    .merge(
        velocidad_media_bajada_internet_por_provincia_df,
        how="left",
        on=["Año", "Trimestre", "Provincia"],
    )
    .merge(
        accesos_banda_ancha_banda_angosta_por_provincia_df,
        how="left",
        on=["Año", "Trimestre", "Provincia"],
    )
)

general_provincia.head()

Unnamed: 0,Año,Trimestre,Provincia,Accesos por cada 100 hogares,Accesos por cada 100 hab,Mbps (Media de bajada),Banda ancha fija,Dial up,Total
0,2022,3,Buenos Aires,78.11,26,70.19,4715469,6199,4721668
1,2022,3,Capital Federal,122.28,50,101.05,1545534,2145,1547679
2,2022,3,Catamarca,65.33,17,60.99,70292,1,70293
3,2022,3,Chaco,43.86,12,53.21,144141,5,144146
4,2022,3,Chubut,84.38,26,15.57,164874,904,165778


In [56]:
# Remove commas from the string cells; regex=True is required so that commas
# embedded inside a value are matched, not just cells equal to ",".
general_provincia = general_provincia.replace(to_replace=",", value="", regex=True)

In [57]:
# Save the consolidated regional table as CSV, leaving out the index column.
general_provincia.to_csv(path_or_buf='../processed_data/general_provincia.csv', index=False)

## Download Speed by Range

In [None]:
# Preview the national accesses broken down by download-speed range.
total_nacional_accesos_internet_fijo_por_velocidad_bajada_df.head()

Unnamed: 0,Año,Trimestre,Hasta 512 kbps,Entre 512 Kbps y 1 Mbps,Entre 1 Mbps y 6 Mbps,Entre 6 Mbps y 10 Mbps,Entre 10 Mbps y 20 Mbps,Entre 20 Mbps y 30 Mbps,Más de 30 Mbps,OTROS,Total,Unnamed: 12
0,2022,3,33013,96727,1161370,1151906,823505,479822,6995750,349035,11091128,
1,2022,2,33667,99498,1193090,1197030,856562,485321,6741922,351594,10958684,
2,2022,1,34890,104840,1263273,1209148,967508,509830,6336187,198333,10624009,
3,2021,4,41262,28521,1413208,1245333,976539,558358,6032322,194251,10489794,
4,2021,3,40174,41437,2550229,1095772,710122,536364,4948174,163269,10085541,


In [45]:
# List column names, non-null counts and dtypes of the national speed-range table.
total_nacional_accesos_internet_fijo_por_velocidad_bajada_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Año                      35 non-null     object
 1   Trimestre                35 non-null     object
 2   Hasta 512 kbps           35 non-null     object
 3   Entre 512 Kbps y 1 Mbps  35 non-null     object
 4   Entre 1 Mbps y 6 Mbps    35 non-null     object
 5   Entre 6 Mbps y 10 Mbps   35 non-null     object
 6   Entre 10 Mbps y 20 Mbps  35 non-null     object
 7   Entre 20 Mbps y 30 Mbps  35 non-null     object
 8   Más de 30 Mbps           35 non-null     object
 9   OTROS                    35 non-null     object
 10  Total                    35 non-null     object
 11                           35 non-null     object
dtypes: object(12)
memory usage: 3.4+ KB


In [None]:
# Preview the per-province accesses broken down by download-speed range.
acceso_internet_fijo_por_rangos_velocidad_bajada_provincia_df.head()

Unnamed: 0,Año,Trimestre,Provincia,HASTA 512 kbps,+ 512 Kbps - 1 Mbps,+ 1 Mbps - 6 Mbps,+ 6 Mbps - 10 Mbps,+ 10 Mbps - 20 Mbps,+ 20 Mbps - 30 Mbps,+ 30 Mbps,OTROS,Total
0,2022,3,Buenos Aires,29985,27709,290315,297915,267044,124190,3618689,65821,4721668
1,2022,3,Capital Federal,517,5742,34371,67829,51946,28692,1253105,105477,1547679
2,2022,3,Catamarca,71,384,3107,5389,5099,3737,50298,2208,70293
3,2022,3,Chaco,461,987,16782,18938,8049,15828,79390,3711,144146
4,2022,3,Chubut,109,1444,45707,30940,34682,15309,17563,20024,165778


In [46]:
# List column names, non-null counts and dtypes of the per-province speed-range table.
acceso_internet_fijo_por_rangos_velocidad_bajada_provincia_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840 entries, 0 to 839
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Año                  840 non-null    object
 1   Trimestre            840 non-null    object
 2   Provincia            840 non-null    object
 3   HASTA 512 kbps       840 non-null    object
 4   + 512 Kbps - 1 Mbps  840 non-null    object
 5   + 1 Mbps - 6 Mbps    840 non-null    object
 6   + 6 Mbps - 10 Mbps   840 non-null    object
 7   + 10 Mbps - 20 Mbps  840 non-null    object
 8   + 20 Mbps - 30 Mbps  840 non-null    object
 9   + 30 Mbps            840 non-null    object
 10  OTROS                840 non-null    object
 11  Total                840 non-null    object
dtypes: object(12)
memory usage: 78.9+ KB


### Merging

In [73]:
# Attach the national mean download speed to the national speed-range table,
# matching rows on "Año" and "Trimestre".
# NOTE(review): the left frame has a blank-named column that is carried into the
# result (and later into the saved CSV) — confirm whether it should be dropped.
velocidad_nacional_internet_fijo = (
    total_nacional_accesos_internet_fijo_por_velocidad_bajada_df
    .merge(
        velocidad_media_bajada_internet_nacional_df,
        how="left",
        on=["Año", "Trimestre"],
    )
)

velocidad_nacional_internet_fijo.head()

Unnamed: 0,Año,Trimestre,Hasta 512 kbps,Entre 512 Kbps y 1 Mbps,Entre 1 Mbps y 6 Mbps,Entre 6 Mbps y 10 Mbps,Entre 10 Mbps y 20 Mbps,Entre 20 Mbps y 30 Mbps,Más de 30 Mbps,OTROS,Total,Unnamed: 12,Mbps (Media de bajada)
0,2022,3,33013,96727,1161370,1151906,823505,479822,6995750,349035,11091128,,62.46
1,2022,2,33667,99498,1193090,1197030,856562,485321,6741922,351594,10958684,,58.44
2,2022,1,34890,104840,1263273,1209148,967508,509830,6336187,198333,10624009,,55.11
3,2021,4,41262,28521,1413208,1245333,976539,558358,6032322,194251,10489794,,52.34
4,2021,3,40174,41437,2550229,1095772,710122,536364,4948174,163269,10085541,,48.46


In [75]:
# Attach the per-province mean download speed to the per-province speed-range
# table, matching rows on "Año", "Trimestre" and "Provincia".
# NOTE(review): the displayed result ends with two blank-named columns — confirm
# whether they should be dropped before the table is saved.
velocidad_regional_internet_fijo = (
    acceso_internet_fijo_por_rangos_velocidad_bajada_provincia_df
    .merge(
        velocidad_media_bajada_internet_por_provincia_df,
        how="left",
        on=["Año", "Trimestre", "Provincia"],
    )
)

velocidad_regional_internet_fijo.head()

Unnamed: 0,Año,Trimestre,Provincia,HASTA 512 kbps,+ 512 Kbps - 1 Mbps,+ 1 Mbps - 6 Mbps,+ 6 Mbps - 10 Mbps,+ 10 Mbps - 20 Mbps,+ 20 Mbps - 30 Mbps,+ 30 Mbps,OTROS,Total,Mbps (Media de bajada),Unnamed: 14,Unnamed: 15
0,2022,3,Buenos Aires,29985,27709,290315,297915,267044,124190,3618689,65821,4721668,70.19,,
1,2022,3,Capital Federal,517,5742,34371,67829,51946,28692,1253105,105477,1547679,101.05,,
2,2022,3,Catamarca,71,384,3107,5389,5099,3737,50298,2208,70293,60.99,,
3,2022,3,Chaco,461,987,16782,18938,8049,15828,79390,3711,144146,53.21,,
4,2022,3,Chubut,109,1444,45707,30940,34682,15309,17563,20024,165778,15.57,,


In [76]:
# Strip commas from the string cells; regex=True lets "," match inside a value
# rather than only cells that are exactly ",".
velocidad_nacional_internet_fijo = velocidad_nacional_internet_fijo.replace(
    to_replace=",", value="", regex=True
)

In [78]:
# Strip commas from the string cells; regex=True lets "," match inside a value
# rather than only cells that are exactly ",".
velocidad_regional_internet_fijo = velocidad_regional_internet_fijo.replace(
    to_replace=",", value="", regex=True
)

In [79]:
# Save the national speed table as CSV, leaving out the index column.
velocidad_nacional_internet_fijo.to_csv(
    path_or_buf='../processed_data/velocidad_nacional_internet_fijo.csv',
    index=False,
)

In [80]:
# Save the regional speed table as CSV, leaving out the index column.
velocidad_regional_internet_fijo.to_csv(
    path_or_buf='../processed_data/velocidad_regional_internet_fijo.csv',
    index=False,
)

Now that the speed range is being used, the following dataframe is not needed anymore. Also, in this case, it's more useful to analyze ranges of speeds than specific figures. For example, it's not very helpful to visualize the distinction between 0.5 and 0.512 Mbps.

In [22]:
# Preview the per-province table that has one column per specific download speed.
accesos_internet_fijo_por_velocidad_bajada_provincia_df.head()

Unnamed: 0,Año,Trimestre,Provincia,Otros,"0,256 Mbps","0,375 Mbps","0,625 Mbps","0,5 Mbps","0,512 Mbps","0,75 Mbps",...,70 Mbps,71 Mbps,75 Mbps,77 Mbps,78 Mbps,80 Mbps,81 Mbps,83 Mbps,82 Mbps,86 Mbps
0,2022,3,Buenos Aires,65821,19,- 0,- 0,29774,192,4611,...,3163,- 0,98387,- 0,2,223,- 0,1,- 0,- 0
1,2022,3,Capital Federal,105477,31,- 0,- 0,451,35,1307,...,3,- 0,30482,- 0,- 0,8,- 0,- 0,- 0,- 0
2,2022,3,Catamarca,2208,- 0,- 0,- 0,34,37,- 0,...,- 0,- 0,- 0,- 0,- 0,- 0,- 0,- 0,- 0,- 0
3,2022,3,Chaco,3711,- 0,225,111,234,2,- 0,...,- 0,- 0,- 0,- 0,- 0,- 0,- 0,- 0,- 0,- 0
4,2022,3,Chubut,20024,8,- 0,- 0,69,32,836,...,- 0,- 0,- 0,- 0,- 0,- 0,- 0,- 0,- 0,- 0


## Type of connection

In [59]:
# Preview the national accesses broken down by connection technology.
total_nacional_accesos_internet_fijo_por_tecnologia_df.head()

Unnamed: 0,Año,Trimestre,ADSL,Cablemodem,Fibra óptica,Wireless,Otros,Total,Periodo
0,2022,3,1395277,6031970,2871541,557110,235230,11091128,Jul-Sept 2022
1,2022,2,1468333,5979214,2723285,556243,231609,10958684,Abr-Jun 2022
2,2022,1,1533240,6073426,2219533,545814,251996,10624009,Ene-Mar 2022
3,2021,4,1657615,5984240,2072236,523107,252596,10489794,Oct-Dic 2021
4,2021,3,1950631,5826257,1566048,492415,250190,10085541,Jul-Sept 2021


In [67]:
# Join the national per-technology access counts with the operator revenue and
# the macroeconomic indicators, matching rows on "Año" and "Trimestre".
# The chain is parenthesized rather than joined with backslashes, so no trailing
# "\" can accidentally splice the next statement onto the assignment.
nacional_tipo_conexion_economia = (
    total_nacional_accesos_internet_fijo_por_tecnologia_df.drop(columns="Periodo")
    .merge(
        ingresos_por_operación_servicio_internet_fijo_df.drop(columns="Periodo"),
        how="left",
        on=["Año", "Trimestre"],
    )
    .merge(indicadores_macroeconomicos_df, how="left", on=["Año", "Trimestre"])
)

nacional_tipo_conexion_economia.head()

Unnamed: 0,Año,Trimestre,ADSL,Cablemodem,Fibra óptica,Wireless,Otros,Total,Ingresos (miles de pesos),IPC US,IPC AR,USDARS oficial,USDARS blue,PBI millones
0,2022,3,1395277,6031970,2871541,557110,235230,11091128,67055930,,,,,
1,2022,2,1468333,5979214,2723285,556243,231609,10958684,60335724,,,,,
2,2022,1,1533240,6073426,2219533,545814,251996,10624009,55589997,,,,,
3,2021,4,1657615,5984240,2072236,523107,252596,10489794,45467887,,,,,
4,2021,3,1950631,5826257,1566048,492415,250190,10085541,42999944,,,,,


In [68]:
# Strip commas from the string cells; regex=True lets "," match inside a value
# rather than only cells that are exactly ",".
nacional_tipo_conexion_economia = nacional_tipo_conexion_economia.replace(
    to_replace=",", value="", regex=True
)

In [69]:
# Save the national connection-type/economy table as CSV, leaving out the index column.
nacional_tipo_conexion_economia.to_csv(
    path_or_buf='../processed_data/nacional_tipo_conexion_economia.csv',
    index=False,
)

There is no economic information at the regional level, so the dataframe will be saved as it is.

In [76]:
# Preview the per-province accesses broken down by connection technology.
acceso_internet_fijo_por_tecnologia_provincia_df.head()

Unnamed: 0,Año,Trimestre,Provincia,ADSL,Cablemodem,Fibra óptica,Wireless,Otros,Total
0,2022,3,Buenos Aires,339648,2748325,1436433,126846,70416,4721668
1,2022,3,Capital Federal,140791,1240125,129218,5758,31787,1547679
2,2022,3,Catamarca,10010,10495,46224,1329,2235,70293
3,2022,3,Chaco,27164,61800,44645,8178,2359,144146
4,2022,3,Chubut,45377,72212,9574,29784,8831,165778


In [77]:
# Save the per-province technology table unchanged as CSV, leaving out the index column.
acceso_internet_fijo_por_tecnologia_provincia_df.to_csv(
    path_or_buf='../processed_data/regional_tipo_conexion.csv',
    index=False,
)

## Data by location

This data set contains information about the whole nation, locations, communities and states. The scope of this project includes only national and state data, which means minor territorial divisions will be ignored; they won't be useful in the general analysis of the telecommunication status of the country. Maybe, in the future, a more focused study could use these administrative divisions to get more local insights.

In [20]:
# Preview the per-locality accesses broken down by connection technology.
accesos_internet_fijo_por_tecnología_localidad_df.head()

Unnamed: 0,Provincia,Partido,Localidad,Link Indec,ADSL,CABLEMODEM,DIAL UP,FIBRA OPTICA,OTROS,SATELITAL,WIMAX,WIRELESS,Total general,Unnamed: 14
0,BUENOS AIRES,25 de Mayo,25 de Mayo,6854100,873,4704,- 0,2,- 0,975,- 0,664,7218,
1,BUENOS AIRES,25 de Mayo,Del Valle,6854020,181,- 0,- 0,- 0,10,1,- 0,- 0,192,
2,BUENOS AIRES,25 de Mayo,Gobernador Ugarte,6854040,- 0,- 0,- 0,- 0,- 0,- 0,- 0,181,181,
3,BUENOS AIRES,25 de Mayo,Norberto de la Riestra,6854060,- 0,782,- 0,6,167,- 0,- 0,327,1282,
4,BUENOS AIRES,25 de Mayo,Lucas Monteverde,6854050,- 0,- 0,- 0,- 0,- 0,- 0,- 0,6,6,


In [21]:
# Preview the per-locality accesses broken down by specific download speed.
accesos_internet_fijo_por_velocidad_localidad_df.head()

Unnamed: 0,Provincia,Partido,Localidad,Link Indec,Otros,"0,256 mbps","0,375 mbps","0,5 mbps","0,512 mbps","0,625 mbps",...,75 mbps,78 mbps,80 mbps,82 mbps,83 mbps,85 mbps,90 mbps,92 mbps,95 mbps,100 mbps
0,BUENOS AIRES,25 de Mayo,25 de Mayo,6854100,,,,2.0,,,...,,,9.0,,,,,,,
1,BUENOS AIRES,25 de Mayo,Agustín Mosconi,6854010,,,,,,,...,,,,,,,,,,
2,BUENOS AIRES,25 de Mayo,Del Valle,6854020,,,,1.0,,,...,,,,,,,,,,
3,BUENOS AIRES,25 de Mayo,Ernestina,6854030,,,,,,,...,,,,,,,,,,
4,BUENOS AIRES,25 de Mayo,Gobernador Ugarte,6854040,,,,,,,...,,,,,,,,,,


## About connectivity

The dataframes that involve connectivity by location have some issues, such as unnamed columns and a lot of blank spaces and null values, so they aren't going to be used because they will not add value to the analysis.

In [10]:
# Preview the locality connectivity listing.
listado_localidades_conectividad_internet_df.head()

Unnamed: 0,Unnamed: 1,Partido,Localidad,ADSL,Unnamed: 5,DIALUP,FIBRAOPTICA,4G,Unnamed: 9,TELEFONIAFIJA,Unnamed: 11,SATELITAL
0,,Bahía Blanca,Villa Bordeau,,,--,--,,,--,,--
1,BUENOS AIRES,,Villa Espora,--,--,,--,--,--,,--,
2,BUENOS AIRES,,,SI,SI,,,SI,SI,,SI,
3,BUENOS AIRES,Balcarce,,--,--,--,,--,--,SI,SI,--
4,BUENOS AIRES,,Napaleofú,--,--,,SI,SI,SI,,,--


In [12]:
# List column names, non-null counts and dtypes of the locality connectivity listing.
listado_localidades_conectividad_internet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4312 entries, 0 to 4311
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0                  4312 non-null   object
 1   Partido        4312 non-null   object
 2   Localidad      4312 non-null   object
 3   ADSL           4312 non-null   object
 4                  4312 non-null   object
 5   DIALUP         4312 non-null   object
 6   FIBRAOPTICA    4312 non-null   object
 7   4G             4312 non-null   object
 8                  4312 non-null   object
 9   TELEFONIAFIJA  4312 non-null   object
 10                 4312 non-null   object
 11  SATELITAL      4312 non-null   object
dtypes: object(12)
memory usage: 404.4+ KB


In [13]:
# Preview the internet-service connectivity table.
conectividad_al_servicio_de_internet_df.head()

Unnamed: 0,Unnamed: 1,Partido,Localidad,Unnamed: 4,Unnamed: 5,CABLEMODEM,DIALUP,Unnamed: 8,Unnamed: 9,WIRELESS,TELEFONIAFIJA,Unnamed: 12,Unnamed: 13,link,Latitud,Unnamed: 16
0,,Bahía Blanca,Villa Bordeau,0,,--,--,--,,--,--,,SI,6056020.0,-386472605094596.0,
1,BUENOS AIRES,,,0,--,,,--,--,,,--,--,,-387765069529222.0,-621851833537179.0
2,BUENOS AIRES,,Balcarce,38376,SI,,SI,SI,--,,,SI,SI,6063010.0,,-582551665841248.0
3,BUENOS AIRES,,,337,--,,,--,--,,SI,,--,6063020.0,-379412057603.0,
4,BUENOS AIRES,Balcarce,,374,--,--,,SI,--,SI,SI,SI,,6063030.0,-376254980210026.0,-587461862359423.0


In [14]:
# List column names, non-null counts and dtypes of the internet-service connectivity table.
conectividad_al_servicio_de_internet_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4312 entries, 0 to 4311
Data columns (total 16 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0                  4312 non-null   object
 1   Partido        4312 non-null   object
 2   Localidad      4312 non-null   object
 3                  4312 non-null   object
 4                  4312 non-null   object
 5   CABLEMODEM     4312 non-null   object
 6   DIALUP         4312 non-null   object
 7                  4312 non-null   object
 8                  4312 non-null   object
 9   WIRELESS       4312 non-null   object
 10  TELEFONIAFIJA  4312 non-null   object
 11                 4312 non-null   object
 12                 4312 non-null   object
 13  link           4312 non-null   object
 14  Latitud        4312 non-null   object
 15                 4312 non-null   object
dtypes: object(16)
memory usage: 539.1+ KB
