In [14]:
import pandas as pd
import numpy as np
from pyesef.const import PATH_PROJECT_ROOT, CSV_SEPARATOR
from pyesef.helpers.read_facts import _get_statement_item_group, _get_is_total

In [15]:
def char_len(x, fixed_n):
    '''set string x to fixed_n character, prepend with 'xxx' if short'''
    if isinstance(x, float):
        return x

    if len(x) > fixed_n: 
        return x[:fixed_n] 
    elif len(x) < fixed_n: 
        return ' ' * (fixed_n - len(x)) + x 
    return x
 

### Load and cleanup data

Unnamed: 0,period_end,statement_type,has_resolved_group,is_extension,is_total,statement_item_group,xml_name,label,membership,currency,value,lei,legal_name
0,2021-12-31,Statementofcomprehensiveincomeprofitorlossbyfu...,True,False,False,Revenue,Revenue,Revenue,,EUR,3935000000,213800D9O7FUQDH83V62,Valmet Oyj
1,2020-12-31,Statementofcomprehensiveincomeprofitorlossbyfu...,True,False,False,Revenue,Revenue,Revenue,,EUR,3740000000,213800D9O7FUQDH83V62,Valmet Oyj
2,2021-12-31,Statementofcomprehensiveincomeprofitorlossbyfu...,True,False,False,CostOfSales,CostOfSales,Cost of sales,,EUR,-2943000000,213800D9O7FUQDH83V62,Valmet Oyj
3,2020-12-31,Statementofcomprehensiveincomeprofitorlossbyfu...,True,False,False,CostOfSales,CostOfSales,Cost of sales,,EUR,-2844000000,213800D9O7FUQDH83V62,Valmet Oyj
4,2021-12-31,Statementofcomprehensiveincomeprofitorlossbyfu...,False,False,True,,GrossProfit,Gross profit,,EUR,992000000,213800D9O7FUQDH83V62,Valmet Oyj
...,...,...,...,...,...,...,...,...,...,...,...,...,...
344,2020-12-31,StatementoffinancialpositioncurrentnoncurrentS...,False,False,True,,Equity,Equity,HedgeAndOtherReservesMember,EUR,21000000,213800D9O7FUQDH83V62,Valmet Oyj
345,2020-12-31,StatementoffinancialpositioncurrentnoncurrentS...,False,False,True,,Equity,Equity,RetainedEarningsMember,EUR,633000000,213800D9O7FUQDH83V62,Valmet Oyj
346,2020-12-31,StatementoffinancialpositioncurrentnoncurrentS...,False,False,True,,Equity,Equity,EquityAttributableToOwnersOfParentMember,EUR,1137000000,213800D9O7FUQDH83V62,Valmet Oyj
347,2020-12-31,StatementoffinancialpositioncurrentnoncurrentS...,False,False,True,,Equity,Equity,NoncontrollingInterestsMember,EUR,6000000,213800D9O7FUQDH83V62,Valmet Oyj


In [20]:
df = pd.read_csv(f"{PATH_PROJECT_ROOT}/output.csv", sep=CSV_SEPARATOR)

df['statement_item_group'] = df.apply(lambda row: _get_statement_item_group(xml_name=row['xml_name']), axis=1)
df['is_total'] = df.apply(lambda row: _get_is_total(xml_name=row['xml_name']), axis=1)
df['legal_name'] = df['legal_name'].apply(lambda x: char_len(x, 40))

# Remove non-unique values
df = df.drop_duplicates(subset=df.columns)

# Tidy up formatting of values
df["value"] = df["value"] / 1e6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['statement_item_group'] = df.apply(lambda row: _get_statement_item_group(xml_name=row['xml_name']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['is_total'] = df.apply(lambda row: _get_is_total(xml_name=row['xml_name']), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['leg

### Check column names to group

In [21]:
#df.query(
#    'statement_item_group.str.contains("Cost")'
#).statement_item_group.unique()

#### Filter

In [22]:
filtered_df = df.query(
    'period_end in ("2021-12-31")'
    #' & statement_item_group in ("Revenue", "CashAndCashEquivalents", "CostOfSales", "Assets")'
    ' & lei=="2138001H6FCSZBP26351"'
    ' & is_total == False'
    ' & statement_type != "other_comprehensive_income"'
    # We don't need this information
    ' & local_name not in ("ProfitLossAttributableToOwnersOfParent", "ProfitLossAttributableToNoncontrollingInterests")'
)


UndefinedVariableError: name 'local_name' is not defined

In [23]:
table = pd.pivot_table(
    filtered_df,
    values='value',
    index=['lei', "legal_name", "period_end", "statement_type", "statement_item_group", "local_name"],
    #columns=['statement_item_group'],
    aggfunc=np.sum
).reset_index()

# Ratios
#table["gross_margin"] = 1-table["CostOfSales"] / table["Revenue"]
#table["cash_to_ta"] = table["CashAndCashEquivalents"] / table["Assets"]
t = table.sort_values(by=['statement_type', "statement_item_group"], ascending=False, na_position='last')
print(t.to_markdown())


NameError: name 'filtered_df' is not defined

### Analyse count per item name

In [8]:
a = filtered_df.groupby(["statement_type", "local_name"])["lei"].count().reset_index(name='count').sort_values(['count'], ascending=False)
print(a.to_markdown())

|    | statement_type      | local_name                                                                |   count |
|---:|:--------------------|:--------------------------------------------------------------------------|--------:|
| 13 | balance_sheet       | EquipmentToolsAndInstallation                                             |       2 |
|  0 | balance_sheet       | AccrualsAndDeferredIncome                                                 |       1 |
| 25 | balance_sheet       | NoncurrentPortionOfNoncurrentLoansReceived                                |       1 |
| 27 | balance_sheet       | OtherCurrentBorrowingsAndCurrentPortionOfOtherNoncurrentBorrowings        |       1 |
| 28 | balance_sheet       | OtherCurrentFinancialAssets                                               |       1 |
| 29 | balance_sheet       | OtherCurrentFinancialLiabilities                                          |       1 |
| 30 | balance_sheet       | OtherLongtermProvisions                            