![Images](Images/ITAcademy.png)
# CIÈNCIA DE DADES: M10.T01
#### Created by: Dani Planas Montserrat


In [1]:

# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
import statistics
#Import library MySql
import mysql.connector

#Import Library Preprocessing and modeling
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm
import statsmodels.formula.api as smf


# Import Library
from ip2geotools.databases.noncommercial import DbIpCity


warnings.filterwarnings('ignore')


In [2]:
#************************************************************************************
#***************************     READ Files Functions      **************************
#************************************************************************************
#Function for Read *.csv file (Import Data)
def openFileTXT(path_name, movie_column_l=None):
    """Read a comma-separated *.csv file (ISO-8859-1 encoded) into a DataFrame.

    Parameters
    ----------
    path_name : str
        Path of the file to read.
    movie_column_l : list of str, optional
        Column names to assign to the file's columns. When omitted, a
        module-level ``movie_column_l`` is used if one exists; otherwise
        pandas takes the names from the first line of the file.

    Returns
    -------
    pandas.DataFrame
        The file contents, with the first column used as the index.
    """
    # The original body read `movie_column_l` as a free name that is not a
    # parameter, so the call failed with NameError unless a global of that
    # name happened to exist. Keep honouring such a global for old callers.
    if movie_column_l is None:
        movie_column_l = globals().get('movie_column_l')

    df = pd.read_csv(path_name, sep=',', names=movie_column_l,
                     encoding='ISO-8859-1', index_col=0, engine='python')

    return df

#Function for Read *.txt file (Import Data)
def openFileTable(path_name, file_name, separator, movie_column_l):
    """Load a delimited text file without a header row into a DataFrame.

    Parameters
    ----------
    path_name : str
        Directory prefix; it is concatenated with ``file_name`` as-is, so it
        must already end with a path separator.
    file_name : str
        Name of the file to read.
    separator : str
        Field delimiter passed to pandas.
    movie_column_l : list of str
        Column names to assign.

    Returns
    -------
    pandas.DataFrame
    """
    full_path = path_name + file_name
    read_options = dict(sep=separator, names=movie_column_l, header=None, engine='python')
    return pd.read_table(full_path, **read_options)

#Function for Read *.parquet file (Import Data)
def openFileParquet(path_name, file_name):
    """Load a *.parquet file into a DataFrame using the pyarrow engine.

    Parameters
    ----------
    path_name : str
        Directory prefix; concatenated with ``file_name`` as-is.
    file_name : str
        Name of the parquet file.

    Returns
    -------
    pandas.DataFrame
    """
    # Importing pyarrow here raises ImportError right away if the engine
    # requested below is not installed.
    import pyarrow

    parquet_path = path_name + file_name
    return pd.read_parquet(parquet_path, engine='pyarrow')




In [3]:
#************************************************************************************
#******************************      PLOT Functions     *****************************
#************************************************************************************
#Function plot, type Histogram (one categoric value)
def plotHistogram(df):
    """Draw a green bar chart with one bar per entry of ``df``.

    ``df`` supplies the bar heights directly and its index supplies the
    x-axis tick labels. The figure is shown and then returned.
    """
    tick_labels = df.index
    positions = np.arange(len(tick_labels))

    # One bar per index entry, labelled with the index value
    plt.bar(positions, df, color=['green'])
    plt.xticks(positions, tick_labels)

    figure = plt.gcf()
    plt.show()
    return figure

#Function plot, type Histogram (one Numeric value)
def plotHistogram2(df, paramNum):
    """Draw a yellow bar chart of column ``paramNum`` of ``df``.

    Bar heights come from ``df[paramNum]`` and the x-axis tick labels from
    ``df.index``. The figure is shown and then returned.
    """
    tick_labels = df.index
    positions = np.arange(len(tick_labels))

    # One bar per row, labelled with the row's index value
    plt.bar(positions, df[paramNum], color=['yellow'])
    plt.xticks(positions, tick_labels)

    figure = plt.gcf()
    plt.show()
    return figure

def plotDensityPlot(df):
    """Draw a filled red kernel-density estimate of ``df``.

    Parameters
    ----------
    df : data accepted by ``seaborn.kdeplot``
        Values whose distribution is plotted.

    Returns
    -------
    matplotlib.figure.Figure
        The current figure, captured before ``plt.show()`` is called.
    """
    # Grey grid background
    sns.set(style="darkgrid")

    # `fill=True` replaces the deprecated `shade=True` keyword
    sns.kdeplot(df, fill=True, color="r")

    # Show graphic
    fig = plt.gcf()
    plt.show()

    return fig

#Function plot, with Seaborn Library and type ScatterPlot
def plotSnsScatterplot(df, paramNum1, paramNum2, title, ylabel, xlabel):
    """Scatter ``paramNum2`` against ``paramNum1`` on a seaborn FacetGrid.

    The plot gets the given title and axis labels, is shown, and the
    current figure is returned.
    """
    grid = sns.FacetGrid(df)
    grid.map(sns.scatterplot, paramNum1, paramNum2, alpha=.7)
    grid.add_legend()

    # Annotate the current axes
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    figure = plt.gcf()
    plt.show()
    return figure

#Function plot, with Seaborn Library and type LinePlot
def plotSnsLineplot(df, paramNum1, paramNum2, title, ylabel, xlabel):
    """Plot ``paramNum2`` against ``paramNum1`` as a line on a seaborn FacetGrid.

    The plot gets the given title and axis labels, is shown, and the
    current figure is returned.
    """
    grid = sns.FacetGrid(df)
    grid.map(sns.lineplot, paramNum1, paramNum2, alpha=.7)
    grid.add_legend()

    # Annotate the current axes
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    figure = plt.gcf()
    plt.show()
    return figure

#Function plot, with Seaborn Library and type HistPlot
def plotSnsHistplot(df, paramNum1, title, ylabel, xlabel, binsNum):
    """Draw a seaborn histogram of column ``paramNum1`` with a KDE overlay.

    ``binsNum`` is passed as the ``bins`` argument. The plot gets the given
    title and axis labels, is shown, and the current figure is returned.
    """
    histogram_options = dict(data=df, x=paramNum1, bins=binsNum, kde=True)
    sns.histplot(**histogram_options)

    # Annotate the current axes
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)

    figure = plt.gcf()
    plt.show()
    return figure

#Function plot, type grouped Bar Chart with error bars (up to 3 numeric values and 1 categoric value)
def plotBarplotMultiple(df, paramNum1, paramNum2, paramNum3, Sel3):
    """Draw a bar chart of ``df`` with an optional second bar series.

    The first series uses ``df`` itself as bar heights and is labelled
    ``paramNum1``. When ``Sel3 == 2`` the column ``df[paramNum2]`` is added
    one bar-width to the right; when ``Sel3 == 3`` the column
    ``df[paramNum3]`` is added two bar-widths to the right. Error bars are
    simply 0..n-1 for every series. Tick labels come from ``df.index``.
    The figure is shown and then returned.
    """
    # libraries
    import numpy as np
    import matplotlib.pyplot as plt

    barWidth = 0.3
    n_bars = len(df)

    # Optional extra series are looked up before anything is drawn
    second_series = df[paramNum2] if Sel3 == 2 else None
    third_series = df[paramNum3] if Sel3 == 3 else None

    # Error-bar heights shared by all series
    error_heights = np.arange(n_bars)

    # x positions: each series is shifted one bar-width from the previous one
    first_pos = np.arange(n_bars)
    second_pos = first_pos + barWidth
    third_pos = second_pos + barWidth

    plt.bar(first_pos, df, width=barWidth, color='yellow', edgecolor='black',
            yerr=error_heights, capsize=5, label=paramNum1)

    if second_series is not None:
        plt.bar(second_pos, second_series, width=barWidth, color='yellow', edgecolor='black',
                yerr=error_heights, capsize=7, label=paramNum2)

    if third_series is not None:
        plt.bar(third_pos, third_series, width=barWidth, color='green', edgecolor='black',
                yerr=error_heights, capsize=7, label=paramNum3)

    # general layout: ticks sit one bar-width right of the first series
    plt.xticks([pos + barWidth for pos in range(n_bars)], df.index)
    plt.ylabel('height')

    figure = plt.gcf()
    plt.show()
    return figure

#Function plot, show a World Map with Geolocation points from a dataset
def plotWorldMapGeolocPoints(df, pLatitud, pLongitud):
    """Draw a world map (Robinson projection) with one marker per row of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the coordinates.
    pLatitud : str
        Name of the column with latitudes.
    pLongitud : str
        Name of the column with longitudes.

    Returns
    -------
    Basemap
        The map object the points were plotted on.

    Raises
    ------
    KeyError
        If either column is missing from ``df`` (raised before any drawing).
    """
    # Import from the package itself; the original imported Basemap through
    # the package's `test` module.
    from mpl_toolkits.basemap import Basemap
    import matplotlib.pyplot as plt

    # Look the columns up first so a bad column name fails before a figure is created
    latitudes = df[pLatitud]
    longitudes = df[pLongitud]

    # Figure size
    plt.figure(figsize=(16, 12))

    # Projection type: 'robin', 'ortho', ...
    my_map = Basemap(projection='robin', lon_0=0, lat_0=0)

    # Coast lines, country borders and fill colours
    my_map.drawcountries(color='#303338')
    my_map.drawmapboundary(fill_color='#c0eaff')
    my_map.drawcoastlines(color='#324c87')
    my_map.fillcontinents(color='#ebe7d5', lake_color='#c0eaff')

    # Place every location on the map. Iterating the values (instead of
    # indexing with 0..n-1 labels) also works when the DataFrame index is
    # not a default RangeIndex, e.g. after sampling or filtering.
    for lon, lat in zip(longitudes, latitudes):
        x, y = my_map(lon, lat)
        my_map.plot(x, y, color='g', marker='o', markersize=10, alpha=0.9)

    return my_map
    

In [4]:
#************************************************************************************
#***************************     GEOLOCATION Functions     **************************
#************************************************************************************

#Function that gives us information on the location of an IP address (country, latitude, longitude, ...)
def getInfoGeolocation(x, num):
    """Return one geolocation attribute of an IP address, looked up with DbIpCity.

    Parameters
    ----------
    x : str
        IP address to look up.
    num : int
        Which attribute to return: 1 = country, 2 = latitude, 3 = longitude.

    Returns
    -------
    The selected attribute of the DbIpCity response.

    Raises
    ------
    ValueError
        If ``num`` is not 1, 2 or 3. The original fell through to
        ``return exit`` with the variable unassigned.
    """
    attribute_by_selector = {1: 'country', 2: 'latitude', 3: 'longitude'}

    # Validate before doing the (network) lookup so a bad selector is cheap to detect
    if num not in attribute_by_selector:
        raise ValueError("num must be 1 (country), 2 (latitude) or 3 (longitude), got %r" % (num,))

    response = DbIpCity.get(x, api_key='free')

    return getattr(response, attribute_by_selector[num])



In [5]:
#************************************************************************************
#***************************        SAMPLE Functions       **************************
#************************************************************************************
#Function for doing a Simple Random Sampling from a dataset
def simpleSampling(df, num_sample):
    """Return a simple random sample of ``num_sample`` rows drawn from ``df``."""
    return df.sample(n=num_sample)

# Function for doing a Systematic Sampling from dataset
def systematicSampling(df, start, step):
    """Return every ``step``-th row of ``df``, beginning at position ``start``.

    Rows are selected by position (``iloc``), not by index label.
    """
    row_positions = np.arange(start, len(df), step=step)
    return df.iloc[row_positions]

# Function for doing a Stratified Sampling from dataset (able for 3 subgroups)
def stratifiedSampling3Group(df, group, subgName1, subgName2, subgName3, samplePercentage):
    """Stratified sampling over three named subgroups of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to sample from.
    group : str
        Column that defines the strata.
    subgName1, subgName2, subgName3 :
        The three values of ``group`` to sample from.
    samplePercentage : float
        Fraction of each stratum to draw (passed as ``frac`` to ``sample``).

    Returns
    -------
    pandas.DataFrame
        The sampled rows of the three strata, concatenated in the order
        subgName1, subgName2, subgName3, keeping their original index labels.

    Raises
    ------
    KeyError
        If one of the subgroup names does not occur in ``df[group]``.
    """
    # STEP 1: split into strata once (the original repeated the groupby per subgroup)
    strata = df.groupby(group)
    selected = [strata.get_group(subgroup)
                for subgroup in (subgName1, subgName2, subgName3)]

    # STEP 2: draw the same fraction from every stratum. Sampling the
    # stratum directly gives the same rows as a second groupby/apply over a
    # single-group frame, without depending on how `groupby.apply` treats
    # the grouping column.
    samples = [stratum.sample(frac=samplePercentage) for stratum in selected]

    # STEP 3: combine the per-stratum samples
    return pd.concat(samples)

# Function for doing a Reservoir Sampling from dataset
def reservoirSampling(df, k):
    """Draw ``k`` rows from ``df`` with reservoir sampling over its index.

    The index is consumed as a stream: the first ``k`` entries fill the
    reservoir; entry number ``i`` (1-based) after that replaces a random
    reservoir slot with probability ``k / i``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to sample from.
    k : int
        Reservoir size. If ``k`` exceeds ``len(df)`` all rows are returned.

    Returns
    -------
    pandas.DataFrame
        The selected rows in reservoir order, with a fresh 0..n-1 index.

    Side effects
    ------------
    Prints the selected index labels and how many were selected.
    """
    #Import library
    import random

    # Each reservoir slot stores (row position, index label). Positions are
    # used for the final selection so duplicate index labels cannot pull in
    # extra rows; labels are kept only for the printed report.
    reservoir = []
    for position, label in enumerate(df.index):
        if position < k:
            reservoir.append((position, label))
        elif random.random() < k / (position + 1):
            # Replace one of the k items already selected
            reservoir[random.randrange(k)] = (position, label)

    positions = [position for position, _ in reservoir]
    labels = [label for _, label in reservoir]
    print(labels)
    print(len(labels))

    # Select all rows at once: DataFrame.append was removed in pandas 2.0
    # and growing a frame row by row is quadratic.
    df_Sample = df.iloc[positions].reset_index(drop=True)

    return df_Sample

In [6]:
#************************************************************************************
#********************       LINEAR REGRESSION Functions          ********************
#************************************************************************************
# Function that Split data in train and test:
def split_TrainTest(df, Xparam, Yparam):
    """Split ``df`` 70/30 into train/test and fit an OLS model on the train part.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    Xparam : str or list of str
        Column(s) used as explanatory variables.
    Yparam : str
        Column used as the response.

    Returns
    -------
    tuple
        ``(model, X_train, X_test, y_train, y_test)`` where ``model`` is the
        fitted statsmodels OLS result.

    Side effects
    ------------
    Prints the model summary.
    """
    # Only the two names actually used are imported; the original also
    # imported pearsonr, LinearRegression, r2_score, mean_squared_error and
    # the statsmodels formula API without using them.
    from sklearn.model_selection import train_test_split
    import statsmodels.api as sm

    X = df[Xparam]
    y = df[Yparam]

    # 70% train / 30% test; fixed random_state keeps the split reproducible
    X_train, X_test, y_train, y_test = train_test_split(
                                            X,
                                            y,
                                            test_size=0.3,
                                            random_state=1)

    # Fit on the training part only.
    # NOTE(review): X is passed to OLS as-is, so no constant column is added
    # here — confirm whether a model without intercept is intended.
    model = sm.OLS(endog=y_train, exog=X_train).fit()
    print(model.summary())
    return model, X_train, X_test, y_train, y_test


# Function that Split data in train and test:
def split_TrainTest_KNN(df, Xparam, Yparam):
    """Split ``df`` 70/30, standardise the features and fit a KNN classifier.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    Xparam : list of str
        Feature columns.
    Yparam : str
        Target column.

    Returns
    -------
    tuple
        ``(classifier, X_train, X_test, y_train, y_test)``. ``X_train`` and
        ``X_test`` are the *scaled* feature arrays; the scaler itself is not
        returned.
    """
    # Only the names actually used are imported; the original also imported
    # pearsonr, LinearRegression, r2_score, mean_squared_error and both
    # statsmodels APIs without using them.
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.preprocessing import StandardScaler

    X = df[Xparam]
    y = df[Yparam]

    # 70% train / 30% test; fixed random_state keeps the split reproducible
    X_train, X_test, y_train, y_test = train_test_split(
                                            X,
                                            y,
                                            test_size=0.3,
                                            random_state=1)

    # Fit the scaler on the training data only, then apply it to both parts
    # so no information from the test set leaks into the scaling.
    scaler = StandardScaler()
    scaler.fit(X_train)

    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Classifier with scikit-learn's default settings
    classifier = KNeighborsClassifier()
    classifier.fit(X_train, y_train)

    return classifier, X_train, X_test, y_train, y_test

In [7]:
#************************************************************************************
#***************************     Correlation Functions     **************************
#************************************************************************************
#Function for convert a correlation matrix in "tidy" format
def tidy_corr_matrix(corr_mat):
    """Convert a square correlation matrix to "tidy" (long) format.

    Returns one row per ordered pair of distinct variables with the columns
    ``var_1``, ``var_2``, ``r`` and ``abs_r``, sorted by ``abs_r`` in
    descending order. Pairs of a variable with itself are dropped.
    """
    pairs = corr_mat.stack().reset_index()
    pairs.columns = ['var_1', 'var_2', 'r']

    # Keep only off-diagonal entries
    is_off_diagonal = pairs['var_1'] != pairs['var_2']
    off_diagonal = pairs[is_off_diagonal]

    ranked = off_diagonal.assign(abs_r=off_diagonal['r'].abs())
    return ranked.sort_values('abs_r', ascending=False)

In [8]:
##########################################################################
##                 DATA SCIENCE: M10.T01: Exercici 1                    ##
##########################################################################
# Exercici 1: Realitza web scraping de dues de les tres pàgines web proposades utilitzant BeautifulSoup primer i Selenium després. 

# - http://quotes.toscrape.com
# - https://www.bolsamadrid.es
# - www.wikipedia.es (fes alguna cerca primer i escrapeja algun contingut)


#Import Libraries
import requests

URL = "http://quotes.toscrape.com"
# Without a timeout requests can wait indefinitely on an unresponsive
# server; raise_for_status() turns an HTTP error page into an exception
# instead of silently printing it.
page = requests.get(URL, timeout=10)
page.raise_for_status()

print(page.text)
   

<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="UTF-8">
	<title>Quotes to Scrape</title>
    <link rel="stylesheet" href="/static/bootstrap.min.css">
    <link rel="stylesheet" href="/static/main.css">
</head>
<body>
    <div class="container">
        <div class="row header-box">
            <div class="col-md-8">
                <h1>
                    <a href="/" style="text-decoration: none">Quotes to Scrape</a>
                </h1>
            </div>
            <div class="col-md-4">
                <p>
                
                    <a href="/login">Login</a>
                
                </p>
            </div>
        </div>
    

<div class="row">
    <div class="col-md-8">

    <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
        <span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
        <span>by <small class="author" itempr

In [9]:
##########################################################################
##    SCRAPING WITH BEAUTIFULSOUP in  "http://quotes.toscrape.com"      ##
##########################################################################
#Import Libraries
import requests
from bs4 import BeautifulSoup

URL = "http://quotes.toscrape.com"
# Download the page once (the original requested the same URL twice).
# The timeout avoids waiting indefinitely; raise_for_status() surfaces HTTP errors.
page = requests.get(URL, timeout=10)
page.raise_for_status()

print(page.text)

# Parse the downloaded HTML with Python's built-in parser
soup = BeautifulSoup(page.content, "html.parser")

<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="UTF-8">
	<title>Quotes to Scrape</title>
    <link rel="stylesheet" href="/static/bootstrap.min.css">
    <link rel="stylesheet" href="/static/main.css">
</head>
<body>
    <div class="container">
        <div class="row header-box">
            <div class="col-md-8">
                <h1>
                    <a href="/" style="text-decoration: none">Quotes to Scrape</a>
                </h1>
            </div>
            <div class="col-md-4">
                <p>
                
                    <a href="/login">Login</a>
                
                </p>
            </div>
        </div>
    

<div class="row">
    <div class="col-md-8">

    <div class="quote" itemscope itemtype="http://schema.org/CreativeWork">
        <span class="text" itemprop="text">“The world as we have created it is a process of our thinking. It cannot be changed without changing our thinking.”</span>
        <span>by <small class="author" itempr

In [10]:
# Show the page's <title> element (displayed as the cell output)
soup.title

<title>Quotes to Scrape</title>

In [12]:
# Access the 'charset' attribute of the first <meta> tag in the document
soup.meta['charset']

'UTF-8'

In [13]:
# Navigate by tag name: first <div> in the document, then the first <div> inside it
soup.div.div

<div class="row header-box">
<div class="col-md-8">
<h1>
<a href="/" style="text-decoration: none">Quotes to Scrape</a>
</h1>
</div>
<div class="col-md-4">
<p>
<a href="/login">Login</a>
</p>
</div>
</div>

In [14]:
# SEARCH by tag name: collect every <a> element and print each one
labels = soup.find_all('a')
for label in labels:
         print(label)

<a href="/" style="text-decoration: none">Quotes to Scrape</a>
<a href="/login">Login</a>
<a href="/author/Albert-Einstein">(about)</a>
<a class="tag" href="/tag/change/page/1/">change</a>
<a class="tag" href="/tag/deep-thoughts/page/1/">deep-thoughts</a>
<a class="tag" href="/tag/thinking/page/1/">thinking</a>
<a class="tag" href="/tag/world/page/1/">world</a>
<a href="/author/J-K-Rowling">(about)</a>
<a class="tag" href="/tag/abilities/page/1/">abilities</a>
<a class="tag" href="/tag/choices/page/1/">choices</a>
<a href="/author/Albert-Einstein">(about)</a>
<a class="tag" href="/tag/inspirational/page/1/">inspirational</a>
<a class="tag" href="/tag/life/page/1/">life</a>
<a class="tag" href="/tag/live/page/1/">live</a>
<a class="tag" href="/tag/miracle/page/1/">miracle</a>
<a class="tag" href="/tag/miracles/page/1/">miracles</a>
<a href="/author/Jane-Austen">(about)</a>
<a class="tag" href="/tag/aliteracy/page/1/">aliteracy</a>
<a class="tag" href="/tag/books/page/1/">books</a>
<a cl

In [15]:
# SEARCH by attribute: every element whose id is 'footer'
# (this page has none, so an empty list is printed)
footer = soup.find_all(id='footer')
print(footer)

[]


In [16]:
# SEARCH by CSS class: every element with class "next" (the pagination link)
footer_links = soup.find_all(class_="next")
print(footer_links)

[<li class="next">
<a href="/page/2/">Next <span aria-hidden="true">→</span></a>
</li>]


In [17]:
# SEARCH for a specific element, with and without a result limit
print(soup.find_all('title'))                 # All <title> elements
print(soup.find_all('title', limit=1))        # Only the first <title> element

[<title>Quotes to Scrape</title>]
[<title>Quotes to Scrape</title>]


<span style="color:green"><b>WEB SCRAPING: BEAUTIFUL SOUP</b></span>

Hem realitzat WEB SCRAPING amb BEAUTIFUL SOUP en la pàgina web "http://quotes.toscrape.com". Hem realitzat cerques per etiquetes, per id, per nom de classe, per nom de títol, etc.


In [18]:
##########################################################################
##     SCRAPING WITH BEAUTIFULSOUP in  "https://www.bolsamadrid.es"     ##
##########################################################################
#Import Libraries
import requests
from bs4 import BeautifulSoup

URL = "https://www.bolsamadrid.es"
# The timeout avoids waiting indefinitely on an unresponsive server;
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
page = requests.get(URL, timeout=10)
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")

In [19]:
# Show the page's <title> element (displayed as the cell output)
soup.title

<title>
	Bolsa de Madrid
</title>

In [20]:
# Navigate by tag name: first <div> in the document, then the first <div> inside it
soup.div.div

<div id="Idiomas"><ul><li class="mclick"><a href="/?id=ing" target="_self"> English </a></li></ul></div>

In [21]:
# SEARCH by tag name: collect every <a> element and print each one
labels = soup.find_all('a')
for label in labels:
         print(label)

<a href="/?id=ing" target="_self"> English </a>
<a href="/esp/BMadrid/Contacto.aspx" target="_self"> Contacto </a>
<a href="/esp/Inversores/Agenda/HorarioMercado.aspx" target="_self"> Horario Mercado </a>
<a href="/esp/aspx/Inversores/Agenda/Calendario.aspx" target="_self"> Calendario bursátil </a>
<a href="/esp/RSS.aspx" target="_self"> RSS   <img align="absmiddle" alt="RSS" border="0" src="/images/IconoRSS.png"/> </a>
<a href="/?id=esp"><img alt="Bolsa de Madrid" border="0" src="/images/Base/LogoBMadrid.gif"/></a>
<a href="https://www.bolsasymercados.es/" target="_blank"><img alt="Bolsas y Mercados Españoles" border="0" height="45" src="/images/Base/LogoBMEBlanco.png?v=Six" width="118"/></a>
<a></a>
<a href="javascript:document.forms.formBusq.submitbusq();"><span class="BtnBuscarDcha" title="Buscar"></span></a>
<a href="/?id=esp" target="_self">Inicio</a>
<a href="#" target="_self">SOBRE NOSOTROS</a>
<a href="/esp/BMadrid/BMadrid.aspx" target="_self">Bolsa de Madrid</a>
<a href="/doc

In [22]:
# SEARCH by attribute: every element whose id is 'BuscadorCab'
# (the company search box in the page header, per the output below)
footer = soup.find_all(id='BuscadorCab')
print(footer)

[<div id="BuscadorCab">
<div>Buscador de<br/>empresas</div>
<form action="/esp/aspx/Empresas/BusqEmpresas.aspx" id="formBusq" method="get">
<input autocomplete="off" name="busq" type="text" value="nombre / ISIN / ticker"/>
<div id="divBusq" style="display: none;"></div>
<span class="BtnGris mclick"><a href="javascript:document.forms.formBusq.submitbusq();"><span class="BtnBuscarDcha" title="Buscar"></span></a></span>
</form>
</div>]


In [23]:
# SEARCH by CSS class: every element with class "seg"
footer_links = soup.find_all(class_="seg")
print(footer_links)

[<div class="seg"><a href="/?id=esp" target="_self">Inicio</a> / </div>]


In [24]:
# SEARCH for a specific element, with and without a result limit
print(soup.find_all('title'))                 # All <title> elements
print(soup.find_all('title', limit=1))        # Only the first <title> element

[<title>
	Bolsa de Madrid
</title>]
[<title>
	Bolsa de Madrid
</title>]


<span style="color:green"><b>WEB SCRAPING: BEAUTIFUL SOUP</b></span>

Hem realitzat WEB SCRAPING amb BEAUTIFUL SOUP en la pàgina web "https://www.bolsamadrid.es". Hem realitzat cerques per etiquetes, per id, per nom de classe, per nom de títol, etc.


In [25]:
##########################################################################
##      SCARPING WITH SELENIUM in  "https://www.bolsamadrid.es"         ##
##########################################################################

#Import Libraries
from selenium import webdriver

# Start a Chrome session with the chromedriver binary at a hard-coded local Windows path.
# NOTE(review): `executable_path` is the Selenium 3 way of passing the driver;
# newer Selenium releases expect a Service object instead — confirm the installed version.
driver = webdriver.Chrome(executable_path=r"C:\dchrome\chromedriver.exe")
driver.get("https://www.bolsamadrid.es")

# Title of the page that was just loaded
title = driver.title


# Implicit wait (0.5) applied to later element look-ups on this driver
driver.implicitly_wait(0.5)

#search_box = driver.fromind_element

# The browser is intentionally left open: the next cell keeps using `driver`
#driver.close()

In [26]:
#SEARCH: in web page by NAME
from selenium.webdriver.common.by import By

# find_element(By.NAME, ...) works on both Selenium 3 and 4, whereas
# find_element_by_name was removed in Selenium 4.3.
search_form = driver.find_element(By.NAME, 'busq')
# Type the query into the company search box and submit its form
search_form.send_keys('dax')
search_form.submit()

In [27]:
##########################################################################
##       SCARPING WITH SELENIUM in  "https://en.wikipedia.org/"         ##
##########################################################################

In [28]:
#Import Libraries
from selenium import webdriver

# Start a Chrome session with the chromedriver binary at a hard-coded local Windows path.
# NOTE(review): `executable_path` is the Selenium 3 way of passing the driver;
# newer Selenium releases expect a Service object instead — confirm the installed version.
driver = webdriver.Chrome(executable_path=r"C:\dchrome\chromedriver.exe")
driver.get("https://en.wikipedia.org/")

# Title of the page that was just loaded
title = driver.title


# Implicit wait (0.5) applied to later element look-ups on this driver
driver.implicitly_wait(0.5)

#search_box = driver.fromind_element

# The browser is intentionally left open: the next cell keeps using `driver`
#driver.close()

In [29]:
#SEARCH: in web page by ID
from selenium.webdriver.common.by import By

# find_element(By.ID, ...) works on both Selenium 3 and 4, whereas
# find_element_by_id was removed in Selenium 4.3.
search_form = driver.find_element(By.ID, 'searchInput')
# Type the query into the Wikipedia search box and submit its form
search_form.send_keys('covid')
search_form.submit()

In [30]:
##########################################################################
##           SCRAPING WITH BEAUTIFULSOUP: Create New dataset            ##
##########################################################################

In [31]:

#Import Libraries
import requests
from bs4 import BeautifulSoup
import csv
from datetime import datetime

URL = "https://www.bolsamadrid.es/esp/aspx/Indices/Resumen.aspx"
# The timeout avoids waiting indefinitely on an unresponsive server;
# raise_for_status() surfaces HTTP errors instead of parsing an error page.
response = requests.get(URL, timeout=10)
response.raise_for_status()
page = response.text

# Parse the index summary page with the lxml parser
soup = BeautifulSoup(page, "lxml")


2

In [32]:
# Get the indices table by its specific id attribute and display it.
# NOTE(review): soup.find returns None when nothing matches — the next cell assumes the table exists.
tabla = soup.find('table', attrs={'id': 'ctl00_Contenido_tblÍndices'})
tabla

<table align="Center" cellpadding="3" cellspacing="0" class="TblPort" id="ctl00_Contenido_tblÍndices" style="margin-bottom: 20px;" width="85%">
<tr align="center">
<th scope="col">Nombre</th><th scope="col">Anterior</th><th scope="col">Último</th><th scope="col">% Dif.</th><th scope="col">Máximo</th><th scope="col">Mínimo</th><th scope="col">Fecha</th><th scope="col">Hora</th><th class="Ult" scope="col">% Dif.<br/>Año 2022</th>
</tr><tr align="right">
<td align="left" class="DifFlSb">IBEX 35®</td><td>7.680,50</td><td>7.794,90</td><td class="DifClSb">1,49</td><td>7.803,60</td><td>7.675,20</td><td align="center">25/10/2022</td><td align="center">17:38:00</td><td class="DifClBj Ult">-10,55</td>
</tr><tr align="right">
<td align="left" class="DifFlSb">IBEX 35® con Dividendos</td><td>24.393,70</td><td>24.756,90</td><td class="DifClSb">1,49</td><td>24.784,50</td><td>24.376,70</td><td align="center">25/10/2022</td><td align="center">17:38:00</td><td class="DifClBj Ult">-7,89</td>
</tr><tr ali

In [33]:
#Function for Write in a file *.csv (Export Data)
def writeFileCSV(file_name, data1, data2, data3, data4, data5, data6, data7):
    """Append one row of seven values to a *.csv file (Export Data).

    Parameters
    ----------
    file_name : str
        Path of the CSV file; it is created if it does not exist.
    data1 ... data7 :
        Values written, in this order, as a single row.

    Returns
    -------
    None
    """
    #Import libraries
    import csv

    # newline='' lets the csv module control line endings itself; without
    # it every row is followed by an extra blank line on Windows.
    with open(file_name, 'a', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow([data1, data2, data3, data4, data5, data6, data7])
    return


# Init values
name=""
price=""
dif=""
vmax=""
vmin=""
data=""
dif_year=""
nroFila=0

# Position of each value inside a table row (<td> index):
# 0 = index name, 2 = last value, 3 = % difference, 4 = maximum,
# 5 = minimum, 6 = date, 8 = % difference over the year
NUM_CELLS = 9

# SCRAPING: Search on table every value of INDEX and write in a *.csv file
for fila in tabla.find_all("tr"):
    nroFila=nroFila+1

    celdas = fila.find_all('td')
    if not celdas:
        # The header row only contains <th> cells; the original still wrote
        # an all-empty line to the CSV for it.
        continue

    # Pad short rows so a missing cell becomes "" instead of silently
    # reusing the value scraped from the previous row.
    textos = [celda.text for celda in celdas]
    textos += [""] * (NUM_CELLS - len(textos))

    name, price, dif, vmax, vmin, data, dif_year = (
        textos[pos] for pos in (0, 2, 3, 4, 5, 6, 8))

    print("Indice:", name)
    print("Valor:", price)
    print("% Dif:", dif)
    print("Máxim:", vmax)
    print("Mínim:", vmin)
    print("Data:", data)
    print("% Dif. Any 2022:", dif_year)

    # Open/Create a *.csv file and add the new data
    writeFileCSV('indexES_2022.csv', name, price, dif, vmax, vmin, data, dif_year)

Indice: IBEX 35®
Valor: 7.794,90
% Dif: 1,49
Máxim: 7.803,60
Mínim: 7.675,20
Data: 25/10/2022
% Dif. Any 2022: -10,55
Indice: IBEX 35® con Dividendos
Valor: 24.756,90
% Dif: 1,49
Máxim: 24.784,50
Mínim: 24.376,70
Data: 25/10/2022
% Dif. Any 2022: -7,89
Indice: IBEX MEDIUM CAP®
Valor: 12.523,40
% Dif: 0,63
Máxim: 12.540,20
Mínim: 12.383,10
Data: 25/10/2022
% Dif. Any 2022: -9,35
Indice: IBEX SMALL CAP®
Valor: 6.653,70
% Dif: 1,06
Máxim: 6.658,90
Mínim: 6.578,30
Data: 25/10/2022
% Dif. Any 2022: -19,27
Indice: IBEX 35® Bancos
Valor: 500,50
% Dif: 0,50
Máxim: 506,00
Mínim: 492,60
Data: 25/10/2022
% Dif. Any 2022: 8,10
Indice: IBEX 35® Energía
Valor: 1.301,80
% Dif: 1,06
Máxim: 1.306,20
Mínim: 1.284,90
Data: 25/10/2022
% Dif. Any 2022: -3,54
Indice: IBEX 35® Construcción
Valor: 1.548,90
% Dif: 2,21
Máxim: 1.548,90
Mínim: 1.516,80
Data: 25/10/2022
% Dif. Any 2022: -2,96
Indice: IBEX Gender Equality
Valor: 7.729,80
% Dif: 0,64
Máxim: 7.736,40
Mínim: 7.658,80
Data: 25/10/2022
% Dif. Any 2022:

Indice: Índice TEF Apalancado X3
Valor: 143,00
% Dif: -2,39
Máxim: 149,70
Mínim: 141,70
Data: 25/10/2022
% Dif. Any 2022: -43,07
Indice: Índice SAN Apalancado X3
Valor: 177,90
% Dif: 1,60
Máxim: 185,30
Mínim: 168,20
Data: 25/10/2022
% Dif. Any 2022: -40,48
Indice: Índice BBVA Apalancado X3
Valor: 391,80
% Dif: 2,38
Máxim: 399,50
Mínim: 370,50
Data: 25/10/2022
% Dif. Any 2022: -16,94
Indice: Índice ITX Apalancado X3
Valor: 1.504,80
% Dif: 9,30
Máxim: 1.513,80
Mínim: 1.391,20
Data: 25/10/2022
% Dif. Any 2022: -53,50
Indice: Índice TEF Apalancado X5
Valor: 75,00
% Dif: -3,97
Máxim: 81,00
Mínim: 73,80
Data: 25/10/2022
% Dif. Any 2022: -68,55
Indice: Índice SAN Apalancado X5
Valor: 240,20
% Dif: 2,65
Máxim: 256,70
Mínim: 218,50
Data: 25/10/2022
% Dif. Any 2022: -77,24
Indice: Índice BBVA Apalancado X5
Valor: 128,30
% Dif: 3,97
Máxim: 132,40
Mínim: 116,80
Data: 25/10/2022
% Dif. Any 2022: -60,08
Indice: Índice ITX Apalancado X5
Valor: 27,60
% Dif: 15,48
Máxim: 27,90
Mínim: 24,30
Data: 25/10/

<span style="color:green"><b>WEB SCRAPING AMB BEAUTIFUL SOUP: CREEM NOU DATASET</b></span>

En les últimes línies hem creat un codi per accedir a la web de la BORSA de MADRID ("https://www.bolsamadrid.es") i extreure els valors dels principals ÍNDEXS espanyols al finalitzar el dia.

La idea és que, executant aquest codi diàriament a les 18:30 de la tarda, quan ja ha tancat el mercat, podem agafar els valors diaris dels ÍNDEXS i, passat un any, podríem obtenir tots els valors ANUALS.


In [34]:
##########################################################################
##                 DATA SCIENCE: M10.T01: Exercici 2                    ##
##########################################################################
# Exercici 2: Documenta en un Word el teu conjunt de dades generat amb la 
# informació que tenen els diferents arxius de Kaggle.

# Import libraries
import os
# Ask the operating system shell to run "dataset_info.docx"; `r` holds the value returned by os.system.
# NOTE(review): presumably relies on Windows file associations to open the
# document in Word — confirm behaviour on other platforms.
r = os.system("dataset_info.docx")

In [35]:
##########################################################################
##                 DATA SCIENCE: M10.T01: Exercici 3                    ##
##########################################################################
# Exercici 3: Tria una pàgina web que tu vulguis i realitza web scraping 
# mitjançant la llibreria Selenium primer i Scrapy després. 



In [36]:
##########################################################################
##      SCARPING WITH SELENIUM in  "https://www.fotocasa.es/es/"        ##
##########################################################################

#Import Libraries
import random
from time import sleep
import time
from selenium.webdriver.common.keys import Keys
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.common.action_chains import ActionChains

# Start a Chrome session with the chromedriver binary at a hard-coded local Windows path.
# NOTE(review): `executable_path` is the Selenium 3 way of passing the driver;
# newer Selenium releases expect a Service object instead — confirm the installed version.
driver = webdriver.Chrome(executable_path=r"C:\dchrome\chromedriver.exe")
driver.get("https://www.fotocasa.es/es/")
# Maximise the browser window before interacting with the page
driver.maximize_window()
#title = driver.title


#driver.implicitly_wait(0.5)

#search_box = driver.fromind_element

# The browser is intentionally left open: the following cells keep using `driver`
#driver.close()

In [37]:
# Select the button in SEARCHER for "NEW HOMES"
from selenium.webdriver.common.by import By

# Give the page a moment to render before looking for the button
time.sleep(2)
# find_element(By.XPATH, ...) works on both Selenium 3 and 4, whereas
# find_element_by_xpath was removed in Selenium 4.3.
buy = driver.find_element(By.XPATH, './/div[@class="re-HomeSearchSelector-item re-HomeSearchSelector-item--FILTER_CONSERVATION_STATUS_NEW_HOME"]')
buy.click()

In [38]:
# Now we write the city we want to search for in the keyword box
from selenium.webdriver.common.by import By

# Give the page a moment to react to the previous click
time.sleep(2)
# find_element(By.XPATH, ...) works on both Selenium 3 and 4, whereas
# find_element_by_xpath was removed in Selenium 4.3.
search = driver.find_element(By.XPATH, './/div[@class="sui-AtomInput--withIcon sui-AtomInput--withIcon--right"]/input')
search.click()
search.send_keys('Sant Cugat del Vallès')
# Short pause after typing before confirming with ENTER
time.sleep(1)
search.send_keys(Keys.ENTER)



In [39]:
html_txt = driver.page_source   # HTML code of the page currently loaded in the browser
# Name the parser explicitly: without it BeautifulSoup picks whichever
# parser is installed (and warns), so results can differ between machines.
# "lxml" is the parser already used earlier in this notebook.
soup = BeautifulSoup(html_txt, "lxml")

# Init list
list_Homes = []

# Every listing card keeps its description in the first <span> of this block
homes = soup.find_all('div',class_="re-CardPackAdvance-info")
for home in homes:
    span = home.find('span')
    if span is None:
        # Card without a <span>: nothing to read, skip it instead of failing
        continue
    info = span.getText()  #Search info
    list_Homes.append(info)
    print(info)
print(list_Homes)

Obra Nueva  en Calle Benet Cortada, 33, Volpelleres 
Obra Nueva  en Avenida de la Via Augusta, 144, Volpelleres 
Ver contenido
['Obra Nueva  en Calle Benet Cortada, 33, Volpelleres ', 'Obra Nueva  en Avenida de la Via Augusta, 144, Volpelleres ', 'Ver contenido']


In [None]:
##########################################################################
##        SCRAPING WITH SCRAPY in  "https://quotes.toscrape.com"        ##
##########################################################################


# Import Libraries
import scrapy


class QuotessSpider(scrapy.Spider):
    """Spider that walks quotes.toscrape.com and yields one item per quote.

    Each item is a dict with the keys ``text``, ``author`` and ``tags``.
    After the quotes of a page have been yielded, the "next" pagination
    link is followed until there is none left.
    """
    name = "quotesSp"                       # Name used to launch this crawler
    allowed_domains = ['quotes.toscrape.com']
    # Host fixed: the original pointed at the misspelled "quotes.toscrapte.com"
    start_urls = ["https://quotes.toscrape.com"]

    def parse(self, response):
        """Yield the quotes found in ``response`` and a request for the next page."""
        for quote in response.xpath('//div[@class="quote"]'):
            yield {
                # extract_first() is a single method; the original wrote
                # `.extract.first()`, which raises AttributeError.
                'text': quote.xpath('./span[@class="text"]/text()').extract_first(),
                'author': quote.xpath('.//small[@class="author"]/text()').extract_first(),
                'tags': quote.xpath('./div[@class="tags"]/a[@class="tag"]/text()').extract(),
            }

        # extract_first() returns None on the last page, which ends the crawl
        next_page_url = response.xpath('//li[@class="next"]/a/@href').extract_first()
        if next_page_url is not None:
            yield scrapy.Request(response.urljoin(next_page_url))