# Aula Interativa 1

1. Coletar histórico de cotas de fundos da CVM;

2. Armazenar os dados brutos no data lake;

3. Processar os dados brutos e armazenar os dados de interesse no banco de dados MySQL;

4. Ler os dados processados em Python;

5. Rodar uma análise de risco de fatores.

## 1. Coletar histórico de cotas de fundos da CVM

In [None]:
import pandas as pd

In [None]:
import requests_html

In [None]:
# Address of the monthly ZIP (reference month 2023-03) on CVM's open-data site.
url = "https://dados.cvm.gov.br/dados/FI/DOC/INF_DIARIO/DADOS/inf_diario_fi_202303.zip"

In [None]:
# Name of the CSV member that is opened from inside the downloaded ZIP.
nome_arquivo = "inf_diario_fi_202303.csv"

In [None]:
# Download the ZIP. Fail right here on an HTTP error status instead of
# handing an error page to zipfile in a later cell.
r = requests_html.HTMLSession().get(url)
r.raise_for_status()

In [None]:
import zipfile
import io

In [None]:
# Open the downloaded bytes as a ZIP archive directly from memory.
zf = zipfile.ZipFile(io.BytesIO(r.content))

In [None]:
# NOTE: `zf` is rebound from the archive to the opened CSV member, so the
# ZipFile object is no longer reachable by name after this cell.
zf = zf.open(nome_arquivo)

In [None]:
# Decode every raw line and break it into its ';'-separated fields.
# The line terminator stays attached to the last field of each line.
list_ = [linha.decode().split(';') for linha in zf.readlines()]

In [None]:
list_

In [None]:
list_[0]

In [None]:
# The first parsed line is the header; every following line is a data row.
cabecalho = list_[0]
linhas = list_[1:]
df_fundos = pd.DataFrame(linhas, columns=cabecalho)

In [None]:
df_fundos

In [None]:
# Keep only the first whitespace-delimited token of each column name. This
# removes the line terminator still attached to the last header field.
# A single rename call replaces the per-column loop, which rebuilt the whole
# DataFrame once for every column.
df_fundos = df_fundos.rename(columns=lambda nome: nome.split()[0])

In [None]:
df_fundos

In [None]:
# Keep the first whitespace-delimited token of every NR_COTST value
# (presumably to drop the line terminator left over from the raw lines).
df_fundos["NR_COTST"] = df_fundos["NR_COTST"].map(lambda valor: valor.split()[0])
df_fundos

In [None]:
# Save the cleaned table in the working directory. `to_csv` runs with its
# defaults, so the row index is written as an extra leading column.
df_fundos.to_csv("inf_diario_fi_202303.csv")

## 2. Armazenar os dados brutos no data lake

In [None]:
!pip install azure-storage-blob

In [None]:
from azure.storage.blob import BlobServiceClient

import os

# Credentials come from the environment so that secrets never have to be
# typed into (and saved with) the notebook. Unset variables fall back to the
# original empty strings.
storage_account_key = os.environ.get("AZURE_STORAGE_KEY", "")
storage_account_name = os.environ.get("AZURE_STORAGE_ACCOUNT", "")
connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING", "")
container_name = "xp-crv/bronze/cvm"

def uploadToBlobStorage(file_path, file_name, *, overwrite=False):
    """Upload a local file to the configured blob container.

    Args:
        file_path: Path of the local file to read.
        file_name: Name the blob receives inside ``container_name``.
        overwrite: Forwarded to ``upload_blob``. The default ``False`` keeps
            the previous behaviour (the upload is rejected when the blob
            already exists); pass ``True`` so that re-running the notebook
            replaces the blob instead.
    """
    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    blob_client = blob_service_client.get_blob_client(container=container_name, blob=file_name)
    with open(file_path, "rb") as data:
        blob_client.upload_blob(data, overwrite=overwrite)
    # Report only after the local file has been closed.
    print(f"Uploaded {file_name}.")

def downloadFromBlobStorage(file_path, file_name):
    """Download a blob from the configured container into a local file.

    Args:
        file_path: Destination path; the file is created or truncated.
        file_name: Name of the blob inside ``container_name``.
    """
    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    blob_client = blob_service_client.get_blob_client(container=container_name, blob=file_name)
    # Request the blob before touching the destination, so an error raised by
    # download_blob() cannot leave an empty or truncated local file behind.
    downloader = blob_client.download_blob()
    with open(file_path, "wb") as f:
        downloader.readinto(f)


In [None]:
def coleta_dados_fundos_cvm(dt_ref):
    """Download one monthly CVM file and save it as a cleaned CSV.

    The result is written to ``fundos/inf_diario_fi_<dt_ref>.csv``.

    Args:
        dt_ref: Reference month as a ``YYYYMM`` string, e.g. ``"202303"``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    import os

    url = f"https://dados.cvm.gov.br/dados/FI/DOC/INF_DIARIO/DADOS/inf_diario_fi_{dt_ref}.zip"
    nome_arquivo = f"inf_diario_fi_{dt_ref}.csv"

    r = requests_html.HTMLSession().get(url)
    # Stop on an HTTP error rather than passing an error page on to zipfile.
    r.raise_for_status()

    # The context managers close both the archive and the CSV member once
    # all lines have been read.
    with zipfile.ZipFile(io.BytesIO(r.content)) as arquivo_zip:
        with arquivo_zip.open(nome_arquivo) as csv_bruto:
            list_ = [x.decode().split(';') for x in csv_bruto.readlines()]

    df_fundos = pd.DataFrame(list_[1:], columns=list_[0])

    # Keep the first whitespace-delimited token of every column name (drops
    # the line terminator on the last header field) in a single rename call.
    df_fundos = df_fundos.rename(columns=lambda nome: nome.split()[0])

    # Same clean-up for the NR_COTST values.
    df_fundos["NR_COTST"] = [x.split()[0] for x in df_fundos["NR_COTST"]]

    # The output folder may not exist yet.
    os.makedirs("fundos", exist_ok=True)
    df_fundos.to_csv(f"fundos/inf_diario_fi_{dt_ref}.csv")

In [None]:
# Reference months from 2023-03 back to 2022-03 (13 entries, newest first),
# generated from a running month count instead of being typed out.
_mes_mais_recente = 2023 * 12 + (3 - 1)
list_dt_ref = [
    f"{contagem // 12}{contagem % 12 + 1:02d}"
    for contagem in range(_mes_mais_recente, _mes_mais_recente - 13, -1)
]

# For each month: build the local CSV, then upload it to the data lake
# under the same file name.
for dt_ref in list_dt_ref:
    nome_csv = f'inf_diario_fi_{dt_ref}.csv'
    coleta_dados_fundos_cvm(dt_ref)
    uploadToBlobStorage(f'fundos/{nome_csv}', nome_csv)

In [None]:
import os

# downloadFromBlobStorage opens its destination with open(..., "wb"), which
# fails when the folder is missing, so make sure it exists first.
os.makedirs('fundos_download', exist_ok=True)

for dt_ref in list_dt_ref:
    downloadFromBlobStorage(f'fundos_download/inf_diario_fi_{dt_ref}.csv', f'inf_diario_fi_{dt_ref}.csv')

## 3. Processar os dados brutos e armazenar os dados de interesse no banco de dados MySQL

In [None]:
# CNPJ of the single fund kept for the analysis.
CNPJ_FUNDO_ALVO = "11.145.320/0001-56"

# Filter each monthly file and concatenate once at the end. Growing the
# frame inside the loop re-copied all accumulated rows on every iteration.
partes = []
for dt_ref in list_dt_ref:
    df_fundos = pd.read_csv(f'fundos_download/inf_diario_fi_{dt_ref}.csv')
    partes.append(df_fundos[df_fundos["CNPJ_FUNDO"] == CNPJ_FUNDO_ALVO])

# An empty month list still yields an empty DataFrame, as before.
df_fundo_acoes = pd.concat(partes) if partes else pd.DataFrame()

In [None]:
df_fundo_acoes

In [None]:
# Sort the rows by the DT_COMPTC column; ascending is the sort_values default.
df_fundo_acoes = df_fundo_acoes.sort_values("DT_COMPTC")

In [None]:
df_fundo_acoes

In [None]:
# Relative change of VL_QUOTA from the previous row to the current one; the
# first row has no predecessor and therefore gets NaN.
df_fundo_acoes["pct_change"] = df_fundo_acoes["VL_QUOTA"].pct_change()

In [None]:
df_fundo_acoes

In [None]:
# Discard the row labels carried over from the monthly files and renumber
# the rows from 0.
df_fundo_acoes = df_fundo_acoes.reset_index(drop=True)

In [None]:
# Expose the DT_COMPTC column under the name "Date".
df_fundo_acoes = df_fundo_acoes.rename(columns={"DT_COMPTC": "Date"})

In [None]:
# Index the rows by the parsed dates; the "Date" column itself stays in the
# frame.
df_fundo_acoes.index = pd.to_datetime(df_fundo_acoes["Date"])

In [None]:
df_fundo_acoes

In [None]:
# Turn the datetime index into plain "YYYY-MM-DD" strings, the same
# representation later given to df_bvsp's index, so the two frames can be
# joined on equal labels.
df_fundo_acoes.index = [x.strftime("%Y-%m-%d") for x in df_fundo_acoes.index]

In [None]:
df_fundo_acoes

In [None]:
from sqlalchemy import create_engine

In [None]:
from getpass import getpass
from urllib.parse import quote

# `pwd` is not assigned anywhere in the visible notebook. Prompt for it only
# when it is missing, so an existing definition keeps working.
if "pwd" not in globals():
    pwd = getpass("MySQL root password: ")

# Percent-encode the password: characters such as '@', '/' or ':' would
# otherwise be parsed as part of the URL structure.
con = create_engine(
    f'mysql+pymysql://root:{quote(pwd, safe="")}@localhost:3306/dados_mercado'
)

In [None]:
# Write the frame to the `fundos_acoes` table, replacing the table if it
# already exists. The frame's index is not stored (index=False).
df_fundo_acoes.to_sql(name="fundos_acoes", con=con, if_exists='replace', index=False)

In [None]:
df_fundo_acoes

## 4. Ler os dados processados em Python

In [None]:
# Read the whole `fundos_acoes` table back through the same engine.
consulta = "SELECT * FROM fundos_acoes"
df_fundo_acoes_sql = pd.read_sql(consulta, con=con)

In [None]:
df_fundo_acoes_sql

## 5. Rodar uma análise de risco de fatores

In [None]:
import yfinance as yf

In [None]:
# Daily ^BVSP quotes for the same window as the fund files (2022-03 through
# 2023-03). A fixed start/end replaces period="1y", which is counted back
# from the day the notebook runs and so drifts away from the fund data as
# time passes. `end` is exclusive, hence the first day of the next month.
BVSP = yf.Ticker("^BVSP")
df_bvsp = BVSP.history(start="2022-03-01", end="2023-04-01", interval="1d")

In [None]:
df_bvsp

In [None]:
# Relative change of the closing value from the previous row to the current
# one; the first row gets NaN.
df_bvsp["pct_change_bvsp"] = df_bvsp["Close"].pct_change()

In [None]:
# Make sure the index holds datetime values before it is formatted in the
# next cell.
df_bvsp.index = pd.to_datetime(df_bvsp.index)

In [None]:
# Same "YYYY-MM-DD" string labels as df_fundo_acoes' index, so that the join
# below matches rows by date.
df_bvsp.index = [x.strftime("%Y-%m-%d") for x in df_bvsp.index]

In [None]:
df_bvsp

In [None]:
# DataFrame.join defaults to a left join on the index: every fund row is
# kept, and dates missing from df_bvsp get NaN in the df_bvsp columns.
# NOTE: this uses the in-memory df_fundo_acoes, not df_fundo_acoes_sql.
df_final = df_fundo_acoes.join(df_bvsp)

In [None]:
df_final

In [None]:
# Keep only the two return series needed for the regression.
colunas_retorno = ["pct_change", "pct_change_bvsp"]
df_final = df_final[colunas_retorno]

In [None]:
df_final

In [None]:
# Drop every row where either return is missing (row-wise is the default).
df_final = df_final.dropna()
df_final

In [None]:
!pip install statsmodels

In [None]:
# Importing libraries and packages
import statsmodels.api as sm
from statsmodels import regression

# Regression model
# Plain NumPy arrays: index returns as regressor, fund returns as response.
X = df_final["pct_change_bvsp"].to_numpy()
Y = df_final["pct_change"].to_numpy()

def linreg(x,y):
    """Fit ``y = alpha + beta * x`` by ordinary least squares.

    Args:
        x: Values of the explanatory variable.
        y: Values of the dependent variable, same length as ``x``.

    Returns:
        Tuple ``(alpha, beta)``: the first and second fitted parameters,
        i.e. the intercept and the slope.
    """
    # Prepend a constant column so the model also estimates an intercept.
    design = sm.add_constant(x)
    model = regression.linear_model.OLS(y, design).fit()
    # The original stripped the constant column from `x` again after the fit;
    # that value was never used, so the statement is dropped.
    return model.params[0], model.params[1]

# Fit the line and print both coefficients.
alpha, beta = linreg(X, Y)
for rotulo, valor in (('alpha: ', alpha), ('beta: ', beta)):
    print(rotulo + str(valor))

In [None]:
!pip install matplotlib

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Scatter of the observed returns with the fitted regression line on top.
# The line is evaluated on 100 evenly spaced points over the observed X range.
x_min, x_max = X.min(), X.max()
X2 = np.linspace(x_min, x_max, 100)
Y_hat = alpha + beta * X2

plt.figure(figsize=(10, 7))
plt.scatter(X, Y, alpha=0.3)  # raw data
plt.xlabel("BOVESPA Daily Return")
plt.ylabel("ATMOS Daily Return")
plt.plot(X2, Y_hat, 'r', alpha=0.9)  # fitted line
plt.show()

In [None]:
# Correlation-coefficient matrix of the two return series.
# NOTE: this reuses the name `r`, which held the HTTP response in section 1.
r = np.corrcoef(X, Y)

In [None]:
r