cvm.py
from io import BytesIO
from zipfile import ZipFile

import pandas as pd

from brdata.core.crawler import Crawler

# Document types published on the CVM open data portal.
VALID_PREFIXES = ["DFP", "FCA", "FRE", "IPE", "ITR"]


class CVMCrawler(Crawler):
    """Crawler for CVM data.

    Example:
        ```python
        import brdata

        crawler = brdata.CVMCrawler()
        crawler.get_documents("DFP", 2018, 2020)
        ```
    """

    def __init__(self):
        super().__init__("http://dados.cvm.gov.br/dados/")

    def _get_table_links(
        self, prefix: str, extension: str = ".zip", enable_cache: bool = True
    ):
        """Map each available year to its archive link under the given prefix."""
        if prefix not in VALID_PREFIXES:
            raise ValueError(f"{prefix} is not a valid prefix")

        page = self.get_page_soup(
            path=f"CIA_ABERTA/DOC/{prefix}/DADOS/", enable_cache=enable_cache
        )

        valid_links = []
        for link_elem in page.find_all("a"):
            link = link_elem.get("href")
            if link and link.endswith(extension):
                valid_links.append(link)

        # Archive names end in "_YYYY.zip", so the year doubles as the key.
        return {zip_url[-8:-4]: zip_url for zip_url in valid_links}

    def get_documents_by_year(self, prefix: str, year: str, enable_cache: bool = True):
        """Get all documents for a given year.

        Args:
            prefix (str): One of the valid prefixes. See `VALID_PREFIXES`.
            year (str): Year to get documents from.
            enable_cache (bool, optional): Whether to use the cache. Defaults to True.

        Raises:
            ValueError: If prefix is not valid.
            ValueError: If year is not valid.

        Returns:
            dict: Dictionary of pandas.DataFrame with the documents.
        """
        year = str(year)
        prefix = prefix.upper()
        links = self._get_table_links(prefix, enable_cache=enable_cache)

        if year not in links:
            raise ValueError(f"{year} is not a valid year")

        response = self.get_response(
            path=f"CIA_ABERTA/DOC/{prefix}/DADOS/{links[year]}",
            enable_cache=enable_cache,
        )

        all_dfs = {}
        with ZipFile(BytesIO(response.content)) as zip_file:
            for filename in zip_file.namelist():
                if filename.endswith(".csv"):
                    # Strip the "_YYYY.csv" suffix so each document keeps the
                    # same key across years.
                    final_name = filename[: -len("_XXXX.csv")]
                    try:
                        df = pd.read_csv(
                            zip_file.open(filename), delimiter=";", encoding="latin1"
                        )
                        df["year"] = year
                        all_dfs[final_name] = df
                    except Exception as e:
                        print("Invalid file", final_name, e)
        return all_dfs

    def get_documents(
        self, prefix: str, start_year: str, end_year: str, enable_cache: bool = True
    ):
        """Get all documents for a given period.

        Args:
            prefix (str): One of the valid prefixes. See `VALID_PREFIXES`.
            start_year (str): Year to start getting documents from.
            end_year (str): Year to end getting documents from.
            enable_cache (bool, optional): Whether to use the cache. Defaults to True.

        Raises:
            ValueError: If prefix is not valid.
            ValueError: If start_year is not valid.
            ValueError: If end_year is not valid.

        Returns:
            dict: Dictionary of pandas.DataFrame with the documents.
        """
        all_data = []
        for year in range(int(start_year), int(end_year) + 1):
            all_data.append(
                self.get_documents_by_year(prefix, year, enable_cache=enable_cache)
            )

        output = {}
        for data in all_data:
            for key, df in data.items():
                if key not in output:
                    output[key] = df
                else:
                    output[key] = pd.concat([output[key], df])
        return output
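

if __name__ == "__main__":
    # Minimal usage sketch, not part of the library API: it assumes brdata's
    # dependencies are installed and that the CVM open data portal is reachable.
    crawler = CVMCrawler()

    # DFP (annual standardized financial statements) for 2018 through 2020,
    # merged per document type across the three years.
    documents = crawler.get_documents("DFP", 2018, 2020)

    for name, df in documents.items():
        print(name, df.shape)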