In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import io
import os
import sqlite3

In [2]:
def extract_features(bs_content, contextRef, fund_id):
    
    #General Description
    try:
        general = bs_content.find_all('CategoriaFIM')[0].text.split('\n')
        fund_type = general[1]
        fund_focus = general[2]
        fund_risk = general[3]    
    except:
        fund_type = np.nan
        fund_focus = np.nan
        fund_risk = np.nan        
    
    #AUM
    try:
        all_aum = bs_content.find_all('Patrimonio', {'contextRef': contextRef + '_ia'})
        aum = sum([float(i.text) for i in all_aum])
    except:
        aum = np.nan
    
    #Shareholders
    try:
        all_sh = bs_content.find_all('NumeroParticipes', {'contextRef': contextRef + '_ia'})
        sh = sum([float(i.text) for i in all_sh])    
    except:
        sh = np.nan  
    
    #Fees
    try:
        all_fees = bs_content.find_all('ComisionGestionCobrada', {'contextRef': contextRef + '_daa'})
        fees = np.mean([float(i.text) for i in all_fees])
    except:
        fees = np.nan    
    
    #Portfolio Rotation
    try:
        all_port_rot = bs_content.find_all('IndiceRotacionCartera', {'contextRef': contextRef + '_da'})
        port_rot = np.mean([float(i.text) for i in all_port_rot])
    except:
        port_rot = np.nan
    
    #Investment Policy
    try:
        inv_pol = bs_content.find_all('PoliticaInversion')[0].text.replace('\n', '').encode('latin-1').decode('utf-8')
    except:
        inv_pol = bs_content.find_all('PoliticaInversion')[0].text.replace('\n', '')
    
    df_features = pd.DataFrame(data={'Fund ID': fund_id, 'Fund Type': fund_type, 'Fund Focus': fund_focus, 'Fund Risk': fund_risk,
                                     'AUM': aum, 'Fees': fees, 'Rotation': port_rot, 'Inv Policy': inv_pol}, index=[0])
    
    return df_features

In [3]:
Global_df = pd.read_excel('CNMV_Global_Scrape.xlsx')
Global_df.head(10)

Unnamed: 0,Type,AM Name,AM Code,Global Data URL,Fund Name,Fund Code,Fund Global URL,Fund ID
0,IIC,"360 CORA SGIIC, S.A.",276,https://www.cnmv.es/Portal/Consultas/IIC/SGIIC...,"CODEX GLOBAL FUND, FI",5461.0,https://www.cnmv.es/Portal/Consultas/IIC/Fondo...,276-5461
1,IIC,"A&G FONDOS, SGIIC, SA",195,https://www.cnmv.es/Portal/Consultas/IIC/SGIIC...,"A&G RENTA FIJA CORTO PLAZO, FI",3989.0,https://www.cnmv.es/Portal/Consultas/IIC/Fondo...,195-3989
2,IIC,"A&G FONDOS, SGIIC, SA",195,https://www.cnmv.es/Portal/Consultas/IIC/SGIIC...,"GLOBAL MANAGERS FUNDS, FI",3072.0,https://www.cnmv.es/Portal/Consultas/IIC/Fondo...,195-3072
3,IIC,"A&G FONDOS, SGIIC, SA",195,https://www.cnmv.es/Portal/Consultas/IIC/SGIIC...,"GREDOS BOLSA EURO, FI",4881.0,https://www.cnmv.es/Portal/Consultas/IIC/Fondo...,195-4881
4,IIC,"A&G FONDOS, SGIIC, SA",195,https://www.cnmv.es/Portal/Consultas/IIC/SGIIC...,"GREDOS BOLSA INTERNACIONAL, FI",4883.0,https://www.cnmv.es/Portal/Consultas/IIC/Fondo...,195-4883
5,IIC,"A&G FONDOS, SGIIC, SA",195,https://www.cnmv.es/Portal/Consultas/IIC/SGIIC...,"GREDOS MODERADO, FI",4882.0,https://www.cnmv.es/Portal/Consultas/IIC/Fondo...,195-4882
6,IIC,"A&G FONDOS, SGIIC, SA",195,https://www.cnmv.es/Portal/Consultas/IIC/SGIIC...,"GREDOS RENTA FIJA, FI",5389.0,https://www.cnmv.es/Portal/Consultas/IIC/Fondo...,195-5389
7,IIC,"ABACO CAPITAL, SGIIC, S.A.",238,https://www.cnmv.es/Portal/Consultas/IIC/SGIIC...,ABACO GLOBAL VALUE OPPORTUNITIES FI,4827.0,https://www.cnmv.es/Portal/Consultas/IIC/Fondo...,238-4827
8,IIC,"ABACO CAPITAL, SGIIC, S.A.",238,https://www.cnmv.es/Portal/Consultas/IIC/SGIIC...,"ABACO RENTA FIJA MIXTA GLOBAL, FI",4474.0,https://www.cnmv.es/Portal/Consultas/IIC/Fondo...,238-4474
9,IIC,"ABANTE ASESORES GESTION, SGIIC, S.A.",194,https://www.cnmv.es/Portal/Consultas/IIC/SGIIC...,"ABANTE ASESORES GLOBAL, FI",2562.0,https://www.cnmv.es/Portal/Consultas/IIC/Fondo...,194-2562


In [None]:
list_files = os.listdir('Info Pública')
list_files.remove('Funds DB')

all_prts = pd.DataFrame(columns=['ISIN', 'Activo', 'Descripción', 'Peso Actual', 'Peso Anterior', 'Fund ID'])

for i_file in list_files:
    
    try:
        #Convert xml format to Beautifulsoup
        content = []
        # Read the XML file        
        with open('Info Pública/' + i_file, 'rb') as file:
            i_bs_content = BeautifulSoup(file.read(), 'xml')        

        #contextRef is an attribute of each file
        #adding '_ia' makes reference to current value and '_ipp' makes reference to past value   
        i_contextRef = i_bs_content.find('context')['id'].replace('_da','')    

        i_port = extract_portfolio(i_bs_content, i_contextRef, i_file.replace('.xml',''))

        all_prts = pd.concat([all_prts, i_port], ignore_index=True)
    
    except:
        continue
    
all_prts.to_pickle('portfolios.pickle')    
all_prts.head(10)

In [None]:
list_classes

In [4]:
list_files = os.listdir('Info Pública')
list_files.remove('Funds DB')
list_files

['103-1081.xml',
 '103-1230.xml',
 '103-15.xml',
 '103-2269.xml',
 '103-377.xml',
 '103-4290.xml',
 '103-4422.xml',
 '103-502.xml',
 '103-5172.xml',
 '103-5279.xml',
 '103-5330.xml',
 '103-5417.xml',
 '103-5467.xml',
 '103-594.xml',
 '103-6.xml',
 '103-75.xml',
 '105-4869.xml',
 '105-4991.xml',
 '105-5040.xml',
 '105-5049.xml',
 '105-5087.xml',
 '105-5105.xml',
 '105-5278.xml',
 '105-5309.xml',
 '105-5338.xml',
 '105-5392.xml',
 '105-5401.xml',
 '105-5426.xml',
 '105-5437.xml',
 '105-5438.xml',
 '105-5439.xml',
 '105-5440.xml',
 '105-5441.xml',
 '105-5447.xml',
 '113-1067.xml',
 '113-1083.xml',
 '113-1110.xml',
 '113-1115.xml',
 '113-1377.xml',
 '113-1536.xml',
 '113-1777.xml',
 '113-1786.xml',
 '113-2222.xml',
 '113-295.xml',
 '113-3653.xml',
 '113-3753.xml',
 '113-3825.xml',
 '113-399.xml',
 '113-4160.xml',
 '113-4229.xml',
 '113-4383.xml',
 '113-4470.xml',
 '113-470.xml',
 '113-4718.xml',
 '113-4789.xml',
 '113-504.xml',
 '113-5090.xml',
 '113-5104.xml',
 '113-544.xml',
 '113-722.xm

In [5]:
i_fund = list_files[0].replace('.xml','')
i_fund

'103-1081'

In [6]:
i_fund = '12-4252'

In [None]:
#Get in the Fund Display and go to the tab 'Clases de participaciones sin compartimentos'
soup = BeautifulSoup(requests.get(Global_df[Global_df['Fund ID'].isin([i_fund])]['Fund Global URL'].values[0]).content, 'lxml')
soup_link_fund = soup.find('div', {'class': 'NavegApdo'}).find_all('a', href=True)

for i_links in soup_link_fund:
    if i_links.text=='Clases de participaciones sin compartimentos':
        link_isins = i_links['href']

#SI SOLO HAY UNA CLASE PETA AQUI!!!!!!!!!!!!!!!!!!!!!!!!        
link_isins = '/'.join(Global_df[Global_df['Fund ID'].isin([i_fund])]['Global Data URL'].values[0].split('/')[:-1]) + '/' + link_isins        
        
#Once inside the tab 'Clases de participaciones sin compartimentos', get the ISIN of each class 
soup = BeautifulSoup(requests.get(link_isins).content, 'lxml')
list_classes = [i.text for i in soup.find_all('td', {'data-th': 'Denominación'})]
list_isins = [i.text for i in soup.find_all('td', {'data-th': 'ISIN'})]

In [None]:
list_isins