## 1. Import libraries

In [1]:
import requests
from selenium import webdriver
from time import sleep
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd
import sys
sys.path.append('../')
import numpy as np

## 2. Import Data

In [2]:
df = pd.read_csv("../Data/ultimo_web_scrapping.csv")

## 3. We explore the dataset

In [52]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,Airport_hub,Intercity_train_connectivity,Gun_related_deaths,Guns_per_residents,Pisa_ranking,Best_university,Monthly_Fitness_Club_Membership,Beer,Monthly_Public_Transport,...,Concert_venues,Museums,Sport_venues,Currency_for_urban_area,GDP_per_capita_uno,Growth,GDP_per_capita_otro,Unemployment,url,Average_Annual_percent_sunshine
0,0,8,0.59,1,12,13,Aarhus University,$37.00,$1.70,$62.00,...,95,18,21,DKK,$44342.66,5%,"$44,343",4%,https://teleport.org/cities/aarhus/,No hay dato
1,1,26,0.17,1,24,16,University of Adelaide,$50.00,$4.30,$80.00,...,89,37,91,AUD,$46433.30,7%,"$46,433",6%,https://teleport.org/cities/adelaide/,12.6
2,2,23,0.11,16,328,24,University of New Mexico,$52.00,$2.90,$37.00,...,99,35,10,USD,$54596.65,7%,"$54,597",5%,https://teleport.org/cities/albuquerque/,168


In [4]:
df.shape

(262, 33)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 33 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Unnamed: 0                       262 non-null    int64 
 1   Airport_hub                      262 non-null    object
 2   Intercity_train_connectivity     262 non-null    object
 3   Gun_related_deaths               262 non-null    object
 4   Guns_per_residents               262 non-null    object
 5   Pisa_ranking                     262 non-null    object
 6   Best_university                  262 non-null    object
 7   Monthly_Fitness_Club_Membership  261 non-null    object
 8   Beer                             261 non-null    object
 9   Monthly_Public_Transport         261 non-null    object
 10  Lunch                            261 non-null    object
 11  Large_apartment                  262 non-null    object
 12  Medium_apartment                 262

#### Podemos observar que la mayoria de datos son tipo object, por lo que tendremos que convertir las columnas a sus respectivos dtypes.

In [6]:
pd.isna(df).sum()


Unnamed: 0                         0
Airport_hub                        0
Intercity_train_connectivity       0
Gun_related_deaths                 0
Guns_per_residents                 0
Pisa_ranking                       0
Best_university                    0
Monthly_Fitness_Club_Membership    1
Beer                               1
Monthly_Public_Transport           1
Lunch                              1
Large_apartment                    0
Medium_apartment                   0
Small_apartment                    0
cities                             0
Description                        0
Poblacion                          0
Homosexuality_acceptance           0
LGBT_adoption_rights               8
LGBT_homosexuality_rights          0
LGBT_marriage_rights               0
VAT_Sales_Tax                      0
Art_galleries                      0
Concert_venues                     0
Museums                            0
Sport_venues                       0
Currency_for_urban_area            0
G

#### Algunos de los valores presentes son nan, por lo que deberemos reemplazar la información en esos lugares.

In [43]:
#df.LGBT_adoption_rights.unique()

In [39]:
#df.Lunch.unique()

In [40]:
#df.Monthly_Fitness_Club_Membership.unique()

In [41]:
#df.Monthly_Public_Transport.unique()

In [42]:
#df.Beer.unique()

In [12]:
df['Beer'] = df['Beer'].replace(np.nan, 0)

In [13]:
df['Monthly_Fitness_Club_Membership'] = df['Monthly_Fitness_Club_Membership'].replace(np.nan, 0)

In [14]:
df["Lunch"] = df["Lunch"].replace(np.nan, 0)

In [15]:
df["LGBT_adoption_rights"] = df["LGBT_adoption_rights"].replace(np.nan, "No hay dato")

In [16]:
df["Monthly_Public_Transport"] = df["Monthly_Public_Transport"].replace(np.nan, 0)

In [17]:
df["Monthly_Public_Transport"] = df["Monthly_Public_Transport"].replace('No hay dato', 0)

In [18]:
pd.isna(df).sum()


Unnamed: 0                         0
Airport_hub                        0
Intercity_train_connectivity       0
Gun_related_deaths                 0
Guns_per_residents                 0
Pisa_ranking                       0
Best_university                    0
Monthly_Fitness_Club_Membership    0
Beer                               0
Monthly_Public_Transport           0
Lunch                              0
Large_apartment                    0
Medium_apartment                   0
Small_apartment                    0
cities                             0
Description                        0
Poblacion                          0
Homosexuality_acceptance           0
LGBT_adoption_rights               0
LGBT_homosexuality_rights          0
LGBT_marriage_rights               0
VAT_Sales_Tax                      0
Art_galleries                      0
Concert_venues                     0
Museums                            0
Sport_venues                       0
Currency_for_urban_area            0
G

In [19]:
copia = df.copy()

### Para poder trabajar con los datos debemos quitar símbolos y hacer una limpieza de las columnas

In [20]:
def delete_price(i):
    i = str(i).replace('$', "")
    return i

In [21]:
copia['GDP_per_capita_otro'] = copia['GDP_per_capita_otro'].apply(delete_price)

In [22]:
copia['Monthly_Fitness_Club_Membership'] = copia['Monthly_Fitness_Club_Membership'].apply(delete_price)

In [23]:
copia['Beer'] = copia['Beer'].apply(delete_price)

In [24]:
copia['Monthly_Public_Transport'] = copia['Monthly_Public_Transport'].apply(delete_price)

In [25]:
copia['GDP_per_capita_uno'] = copia['GDP_per_capita_uno'].apply(delete_price)

In [26]:
copia['Lunch'] = copia['Lunch'].apply(delete_price)

In [27]:
copia['Large_apartment'] = copia['Large_apartment'].apply(delete_price)

In [28]:
copia['Medium_apartment'] = copia['Medium_apartment'].apply(delete_price)

In [29]:
copia['Small_apartment'] = copia['Small_apartment'].apply(delete_price)

In [30]:
copia.columns

Index(['Unnamed: 0', 'Airport_hub', 'Intercity_train_connectivity',
       'Gun_related_deaths', 'Guns_per_residents', 'Pisa_ranking',
       'Best_university', 'Monthly_Fitness_Club_Membership', 'Beer',
       'Monthly_Public_Transport', 'Lunch', 'Large_apartment',
       'Medium_apartment', 'Small_apartment', 'cities', 'Description',
       'Poblacion', 'Homosexuality_acceptance', 'LGBT_adoption_rights',
       'LGBT_homosexuality_rights', 'LGBT_marriage_rights', 'VAT_Sales_Tax',
       'Art_galleries', 'Concert_venues', 'Museums', 'Sport_venues',
       'Currency_for_urban_area', 'GDP_per_capita_uno', 'Growth',
       'GDP_per_capita_otro', 'Unemployment', 'url',
       'Average_Annual_percent_sunshine'],
      dtype='object')

In [44]:
#copia["Poblacion"].unique()

In [45]:
#copia.head()

In [33]:
def convertir(i):
    i = i.replace(",", ".")
    return i

In [34]:
copia["GDP_per_capita_otro"] = copia["GDP_per_capita_otro"].apply(convertir)

In [35]:
def quitar(i):
    i = i.replace(",", "")
    return i

In [36]:
copia["Poblacion"] = copia["Poblacion"].apply(quitar)

In [46]:
#copia.Poblacion.unique()

In [None]:
## No puedo sacar el simbolo porque es lo que me indica los valores positivos o negativos

In [None]:
def clean_symbol(i):
    i = i.replace("✖", "")
    return i

In [None]:
def clean_symbol2(i):
    i = i.replace("✔", "")
    return i

In [38]:
copia.LGBT_adoption_rights.unique()

array(['✔ Equal', 'No hay dato', 'Ambiguous', '✖ Unequal',
       '✖ Single only', 'Married couples only', '✔ Legal', '✖ Illegal',
       '✖ Step-child adoption only'], dtype=object)

In [47]:
copia.head()

Unnamed: 0.1,Unnamed: 0,Airport_hub,Intercity_train_connectivity,Gun_related_deaths,Guns_per_residents,Pisa_ranking,Best_university,Monthly_Fitness_Club_Membership,Beer,Monthly_Public_Transport,...,Concert_venues,Museums,Sport_venues,Currency_for_urban_area,GDP_per_capita_uno,Growth,GDP_per_capita_otro,Unemployment,url,Average_Annual_percent_sunshine
0,0,8,0.59,1,12,13,Aarhus University,37.0,1.7,62.0,...,95,18,21,DKK,44342.66,5%,44.343,4%,https://teleport.org/cities/aarhus/,No hay dato
1,1,26,0.17,1,24,16,University of Adelaide,50.0,4.3,80.0,...,89,37,91,AUD,46433.3,7%,46.433,6%,https://teleport.org/cities/adelaide/,12.6
2,2,23,0.11,16,328,24,University of New Mexico,52.0,2.9,37.0,...,99,35,10,USD,54596.65,7%,54.597,5%,https://teleport.org/cities/albuquerque/,168
3,3,45,0.66,1,1,39,427.97,83.0,0.68,19.0,...,68,24,53,KZT,24019.95,9%,24.02,6%,https://teleport.org/cities/almaty/,64
4,4,237,0.68,1,4,10,University of Amsterdam,49.0,2.1,100.0,...,339,187,154,EUR,47354.53,3%,47.355,6%,https://teleport.org/cities/amsterdam/,12.9


In [53]:
def clean_no_info(i):
    i = i.replace("No hay dato", "0")
    return i

In [54]:
copia["Airport_hub"] = copia["Airport_hub"].apply(clean_no_info)

In [57]:
copia["Intercity_train_connectivity"] = copia["Intercity_train_connectivity"].apply(clean_no_info)

In [59]:
copia["Gun_related_deaths"] = copia["Gun_related_deaths"].apply(clean_no_info)

In [61]:
copia["Guns_per_residents"] = copia["Guns_per_residents"].apply(clean_no_info)

In [64]:
copia["Pisa_ranking"] = copia["Pisa_ranking"].apply(clean_no_info)

In [73]:
copia["Large_apartment"] = copia["Large_apartment"].apply(clean_no_info)

In [74]:
copia["Small_apartment"] = copia["Small_apartment"].apply(clean_no_info)

In [75]:
copia["Medium_apartment"] = copia["Medium_apartment"].apply(clean_no_info)

In [80]:
copia["VAT_Sales_Tax"] = copia["VAT_Sales_Tax"].apply(clean_no_info)

In [83]:
copia["Art_galleries"] = copia["Art_galleries"].apply(clean_no_info)

In [87]:
copia["Concert_venues"] = copia["Concert_venues"].apply(clean_no_info)

In [86]:
copia["Museums"] = copia["Museums"].apply(clean_no_info)

In [89]:
copia["Sport_venues"] = copia["Sport_venues"].apply(clean_no_info)

In [92]:
copia["GDP_per_capita_uno"] = copia["GDP_per_capita_uno"].apply(clean_no_info)

In [95]:
copia["GDP_per_capita_otro"] = copia["GDP_per_capita_otro"].apply(clean_no_info)

In [97]:
copia["Unemployment"] = copia["Unemployment"].apply(clean_no_info)

In [99]:
copia["Average_Annual_percent_sunshine"] = copia["Average_Annual_percent_sunshine"].apply(clean_no_info)

#### Algunas de las columnas tienen datos que no se corresponden con la medición de esa columna, por lo que reemplzaremos esos números por 0, dandole la asignación de que no tenemos datos sobre ese lugar.

In [108]:
def convert(i):
    if i != :
        i = i.replace(i,0)   
    return i

In [109]:
copia.Airport_hub.unique()

array(['8', '26', '23', '45', '237', '38', '0.10', '42', '10', '104',
       '217', '44', '39', '0.48', '139', '163', '215', '51', '40', '49',
       '142', '152', '102', '76', '16', '64', '103', '169', '7', '149',
       '176', '59', '80', '1', '189', '66', '77', '67', '24', '81', '135',
       '22', '46', '87', '20', '141', '25', '48', '56', '18', '11', '34',
       '126', '30', '190', '97', '17', '137', '116', '144', '117', '79',
       '9', '78', '12', '2', '239', '47', '101', '41', '19', '28', '90',
       '50', '31', '88', '54', '171', '32', '35', '6', '242', '27', '43',
       '52', '115', '71', '133', '159', '155', '160', '357', '62', '158',
       '85', '94', '167', '164', '130', '93', '257', '83', '191', '72',
       '55', '220', '91', '61', '146', '123', '3', '294', '0', '82',
       '317', '15', '121', '57', '36', '68', '109', '111', '4', '29',
       '96', '13', '95', '180', '134', '33', '125', '98', '113', '143',
       '148', '60', '86', '21'], dtype=object)

In [110]:
copia["Airport_hub"] = copia["Airport_hub"].apply(convert_int) 

TypeError: replace() argument 2 must be str, not int

### A continuación debemos convertir el dtype de las columnas

In [49]:
copia.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262 entries, 0 to 261
Data columns (total 33 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Unnamed: 0                       262 non-null    int64 
 1   Airport_hub                      262 non-null    object
 2   Intercity_train_connectivity     262 non-null    object
 3   Gun_related_deaths               262 non-null    object
 4   Guns_per_residents               262 non-null    object
 5   Pisa_ranking                     262 non-null    object
 6   Best_university                  262 non-null    object
 7   Monthly_Fitness_Club_Membership  262 non-null    object
 8   Beer                             262 non-null    object
 9   Monthly_Public_Transport         262 non-null    object
 10  Lunch                            262 non-null    object
 11  Large_apartment                  262 non-null    object
 12  Medium_apartment                 262

In [None]:
## Con estas columnas hay diversos problemas a solucionar

In [94]:
copia.GDP_per_capita_otro.unique()

array(['44.343', '46.433', '54.597', '24.020', '47.355', '4%', '19.610',
       '8.449', '25.859', '35.152', '17.618', '10.641', 'No hay dato',
       '14.354', '33.711', '12.880', '17.986', '39.511', '13.329',
       '5.855', '66.937', '45.888', '58.087', '13.430', '35.486',
       '40.375', '28.175', '29.925', '42.973', '19.712', '24.942',
       '22.582', '10.877', '44.843', '13.046', '17.695', '7.606', '4.979',
       '49.195', '16.096', '2.667', '143.427', '64.479', '37.390',
       '25.105', '3%', '45.986', '17.881', '7.503', '11.950', '40.347',
       '5.635', '54.722', '46.420', '2.376', '8.668', '8.609', '24.654',
       '6.031', '6.221', '11.817', '26.975', '29.658', '92.049', '4.736',
       '6.962', '18.161', '20.556', '24.805', '3.084', '30.769', '19.455',
       '3.263', '11.244', '43.637', '23.707', '52.183', '14.864', '14%',
       '8.021', '22.971', '13.012', '9.833', '35.277', '82.762', '13.349',
       '17.860', '45.854', '26.999', '5.609', '7.653', '17.114', '32.691

In [93]:
copia.Growth.unique()

array(['5%', '7%', '9%', '3%', '4%', '-1%', '13%', '6%', '12%', '-2%',
       '10%', '16%', '8%', '80', '0%', '18%', '1%', '82', '2%', '43%',
       '-4%', '11%', '127%', '14%', '-41%', '34%', '-3%', '-39%', '62%',
       '17%', '-11%', '36%', '-5%', '-6%'], dtype=object)

In [90]:
copia.Currency_for_urban_area.unique()

array(['DKK', 'AUD', 'USD', 'KZT', 'EUR', 'TRY', 'PYG', 'NZD', 'AZN',
       'IDR', '0.43', 'THB', 'CNY', 'LBP', 'GBP', 'RSD', 'INR', 'NOK',
       'CHF', 'COP', 'CZK', 'RON', 'HUF', 'ARS', 'EGP', 'CAD', 'ZAR',
       'VEF', 'MAD', 'MDL', 'BRL', 'TZS', 'QAR', 'AED', 'JPY', 'PLN',
       'GIP', 'SEK', 'MXN', 'GTQ', 'CUC', 'VND', 'HKD', 'NPR', 'UAH',
       'JMD', 'MYR', 'NGN', 'BOB', 'PEN', 'NIO', 'PHP', 'BYN', 'UYU',
       'RUB', 'KES', 'No hay dato', 'PAB', 'KHR', 'ISK', 'SAR', 'CRC',
       'CLP', 'DOP', 'BAM', 'KRW', 'SGD', 'MKD', 'BGN', 'TWD', 'UZS',
       'GEL', 'IRR', 'ILS', 'TND', 'AMD', 'HRK'], dtype=object)

#### Por ahora, nos centremos en el resto de columnas que ya podemos convertir

In [105]:
def convert_int(i):
    if i != int(i):
        i = i.replace(i,0)   
    return i

In [101]:
copia.Airport_hub.unique()

array(['8', '26', '23', '45', '237', '38', '0.10', '42', '10', '104',
       '217', '44', '39', '0.48', '139', '163', '215', '51', '40', '49',
       '142', '152', '102', '76', '16', '64', '103', '169', '7', '149',
       '176', '59', '80', '1', '189', '66', '77', '67', '24', '81', '135',
       '22', '46', '87', '20', '141', '25', '48', '56', '18', '11', '34',
       '126', '30', '190', '97', '17', '137', '116', '144', '117', '79',
       '9', '78', '12', '2', '239', '47', '101', '41', '19', '28', '90',
       '50', '31', '88', '54', '171', '32', '35', '6', '242', '27', '43',
       '52', '115', '71', '133', '159', '155', '160', '357', '62', '158',
       '85', '94', '167', '164', '130', '93', '257', '83', '191', '72',
       '55', '220', '91', '61', '146', '123', '3', '294', '0', '82',
       '317', '15', '121', '57', '36', '68', '109', '111', '4', '29',
       '96', '13', '95', '180', '134', '33', '125', '98', '113', '143',
       '148', '60', '86', '21'], dtype=object)

In [104]:
copia.Airport_hub

0        8
1       26
2       23
3       45
4      237
      ... 
257     23
258     31
259     35
260     27
261    137
Name: Airport_hub, Length: 262, dtype: object

In [102]:
copia["Airport_hub"] = copia["Airport_hub"].apply(convert_int)

ValueError: invalid literal for int() with base 10: '0.10'