In [1]:
import pandas as pd
import numpy as np
import os
import io
import sqlite3
from bs4 import BeautifulSoup

In [2]:
def _tag_floats(bs_content, tag_name, context_id):
    """Return the float value of every ``tag_name`` element whose contextRef is ``context_id``.

    An empty list is returned when nothing matches or when any matched
    element's text cannot be parsed as a float, so callers can treat both
    situations as "value not available".
    """
    try:
        return [float(tag.text) for tag in bs_content.find_all(tag_name, {'contextRef': context_id})]
    except (TypeError, ValueError):
        return []


def extract_features(bs_content, contextRef, fund_id):
    """Build a one-row DataFrame of descriptive features for a single fund report.

    Parameters
    ----------
    bs_content : parsed document exposing ``find_all(name, attrs)`` whose
        results have a ``.text`` attribute (a BeautifulSoup object in this
        notebook).
    contextRef : str
        Base context identifier; the suffixes '_ia', '_daa' and '_da' are
        appended to select the elements read for each numeric feature.
    fund_id : str
        Identifier stored in the 'Fund ID' column.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with the columns 'Fund ID', 'Fund Type',
        'Fund Focus', 'Fund Risk', 'AUM', 'Fees', 'Rotation', 'Inv Policy'.
        Any feature that is missing or unparsable is NaN, including AUM
        and the investment policy.
    """
    # General description: lines 1-3 of the first CategoriaFIM element.
    # All three fields are set together, so a short or missing element
    # leaves all of them NaN.
    try:
        general = bs_content.find_all('CategoriaFIM')[0].text.split('\n')
        fund_type, fund_focus, fund_risk = general[1], general[2], general[3]
    except IndexError:
        fund_type = fund_focus = fund_risk = np.nan

    # AUM: total over every matching Patrimonio element. With no usable
    # element the result is NaN rather than sum([]) == 0, so that "missing"
    # cannot be mistaken for a real zero.
    aum_values = _tag_floats(bs_content, 'Patrimonio', contextRef + '_ia')
    aum = sum(aum_values) if aum_values else np.nan

    # NOTE: NumeroParticipes (shareholder count) is not extracted because the
    # returned frame has no column for it; adding one would change the
    # 8-column layout that the rest of the notebook relies on.

    # Fees: average over every matching element. The emptiness check avoids
    # numpy's "Mean of empty slice" RuntimeWarning while still yielding NaN.
    fee_values = _tag_floats(bs_content, 'ComisionGestionCobrada', contextRef + '_daa')
    fees = np.mean(fee_values) if fee_values else np.nan

    # Portfolio rotation: same averaging rule as fees.
    rotation_values = _tag_floats(bs_content, 'IndiceRotacionCartera', contextRef + '_da')
    port_rot = np.mean(rotation_values) if rotation_values else np.nan

    # Investment policy: text of the first PoliticaInversion element with
    # newlines removed. A missing element gives NaN instead of raising, so
    # the fund is not lost just because this one field is absent.
    policy_tags = bs_content.find_all('PoliticaInversion')
    if policy_tags:
        raw_policy = policy_tags[0].text.replace('\n', '')
        try:
            # Undo UTF-8 text that was decoded as latin-1 (mojibake).
            inv_pol = raw_policy.encode('latin-1').decode('utf-8')
        except UnicodeError:
            # The text was not mojibake; keep it as parsed.
            inv_pol = raw_policy
    else:
        inv_pol = np.nan

    df_features = pd.DataFrame(
        data={'Fund ID': fund_id, 'Fund Type': fund_type, 'Fund Focus': fund_focus, 'Fund Risk': fund_risk,
              'AUM': aum, 'Fees': fees, 'Rotation': port_rot, 'Inv Policy': inv_pol},
        index=[0])

    return df_features

In [3]:
# Build one feature row per fund report found in the data directory.
data_dir = 'Info Pública'

# 'Funds DB' sits in the same directory but is excluded from processing.
# Filtering (rather than list.remove) also avoids a ValueError when that
# entry is absent. Sorting makes the row order reproducible, because
# os.listdir returns entries in arbitrary order.
list_files = sorted(name for name in os.listdir(data_dir) if name != 'Funds DB')

feature_columns = ['Fund ID', 'Fund Type', 'Fund Focus', 'Fund Risk', 'AUM', 'Fees', 'Rotation', 'Inv Policy']

fund_frames = []     # one single-row frame per successfully parsed report
skipped_files = []   # (file name, error) for every report that could not be processed

for i_file in list_files:

    try:
        # Read the XML file and convert it to BeautifulSoup.
        with open(os.path.join(data_dir, i_file), 'rb') as xml_file:
            i_bs_content = BeautifulSoup(xml_file.read(), 'xml')

        # contextRef is an attribute of each file:
        # adding '_ia' makes reference to current value and '_ipp' makes reference to past value.
        # NOTE(review): str.replace removes '_da' wherever it occurs, so an id
        # ending in '_daa' would be left with a trailing 'a' -- confirm that the
        # first <context> id always ends in '_da'.
        i_contextRef = i_bs_content.find('context')['id'].replace('_da', '')

        i_port = extract_features(i_bs_content, i_contextRef, i_file.replace('.xml', ''))
        fund_frames.append(i_port)

    except Exception as exc:
        # Best effort: a broken report must not stop the run, but keep a record
        # of it instead of dropping it silently. Catching Exception (not a bare
        # except) lets KeyboardInterrupt / SystemExit stop the loop.
        skipped_files.append((i_file, repr(exc)))

# Concatenate once at the end: growing a frame inside the loop copies it on
# every iteration and mixes in an empty object-dtype frame.
if fund_frames:
    all_prts = pd.concat(fund_frames, ignore_index=True)
else:
    all_prts = pd.DataFrame(columns=feature_columns)

if skipped_files:
    print('Skipped {} of {} files; see skipped_files for details'.format(len(skipped_files), len(list_files)))

all_prts.to_pickle('portfolios_features.pickle')
all_prts.head(10)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,Fund ID,Fund Type,Fund Focus,Fund Risk,AUM,Fees,Rotation,Inv Policy
0,103-1081,8.0,7.0,Alto,52655880.0,1.5,0.7,FI RENTA VARIABLE MIXTA INTERNACIONALSe invert...
1,103-1230,8.0,9.0,Alto,1398579000.0,1.75,0.71,FI RENTA VARIABLE INTERNACIONALEl objetivo de ...
2,103-15,,,,228185400.0,7.54,,El objetivo del fondo es obtener rentabilidad ...
3,103-2269,8.0,4.0,4 en una escala del 1 al 7,127879700.0,1.0,0.29,FI RENTA FIJA MIXTA EUROSe invertirï¿½ hasta e...
4,103-377,8.0,9.0,Alto,1628275000.0,1.75,0.76,"FI, RENTA VARIABLE INTERNACIONAL.El objetivo d..."
5,103-4290,8.0,9.0,Alto,394530600.0,1.75,0.82,FI RENTA VARIABLE INTERNACIONALEl objetivo de ...
6,103-4422,8.0,9.0,Alto,100248600.0,1.5,0.39,FI RENTA VARIABLE INTERNACIONALAl menos el 75%...
7,103-502,8.0,8.0,Alto,158092400.0,1.75,1.18,"FI, RENTA VARIABLE EUROLas inversiones del fon..."
8,103-5172,8.0,9.0,6,12320870.0,2.55,1.78,La gestiï¿½n toma como referencia la rentabili...
9,103-5279,8.0,20.0,1,175638100.0,0.15,0.0,RENTA FIJA EUROEl fondo invierte el 100% de la...


In [4]:
# Persist the feature table to SQLite, replacing any previous copy of the table.

# Prepare the rows BEFORE touching the database, so a malformed frame fails
# here and the existing table is not dropped for nothing.
# - Columns are selected by name so each value lines up with the table schema.
# - astype(object) + where(...) turns missing values into None (stored as NULL).
# - itertuples yields plain tuples, which sqlite3 can bind directly.
feature_columns = ['Fund ID', 'Fund Type', 'Fund Focus', 'Fund Risk', 'AUM', 'Fees', 'Rotation', 'Inv Policy']
export_frame = all_prts[feature_columns].astype(object)
export_frame = export_frame.where(export_frame.notna(), None)
records = list(export_frame.itertuples(index=False, name=None))

con = sqlite3.connect('Portfolios_Features.db')
try:
    cur = con.cursor()
    cur.execute('DROP TABLE IF EXISTS Portfolios_Features')
    cur.execute('CREATE TABLE Portfolios_Features (Fund_ID TEXT, Fund_Type TEXT, Fund_Focus TEXT, Fund_Risk TEXT, AUM REAL, Fees REAL, Rotation REAL, Inv_Policy TEXT)')

    cur.executemany('INSERT INTO Portfolios_Features VALUES (?, ?, ?, ?, ?, ?, ?, ?)', records)

    con.commit()
finally:
    # Always release the database file, even when a statement above fails.
    con.close()