# Transforming Data

In [15]:
# Import Raw Data
import pandas as pd
scrapped_data = pd.read_csv('datasets/lq45_financial_data_raw.csv')
scrapped_data.head()

Unnamed: 0,symbol,account,type,2019,2020,2021,2022,2023
0,ACES,Accounts Payable,BS,,164227200000.0,126723800000.0,144350100000.0,125623100000.0
1,ACES,Accounts Receivable,BS,,143482700000.0,66777640000.0,85854930000.0,198609800000.0
2,ACES,Accumulated Depreciation,BS,,-839618000000.0,-1867399000000.0,-1921765000000.0,-2003892000000.0
3,ACES,Additional Paid In Capital,BS,,440574900000.0,440574900000.0,440574900000.0,440574900000.0
4,ACES,Allowance For Doubtful Accounts Receivable,BS,,-55110070.0,-259305600.0,-3957857.0,-17999640.0


## Drop unused rows and columns

In [16]:
scrapped_data = scrapped_data.drop(columns=['2019'])

### Choose relevant rows

Drop all rows from df account, except:
- Total Revenue
- Net Income
- Total Assets
- Total Liabilities Net Minority Interest
- Cash Flowsfromusedin Operating Activities Direct
- Investing Cash Flow
- Financing Cash Flow

In [17]:
kept_values = [
    'Total Revenue',
    'Net Income',
    'Total Assets',
    'Total Liabilities Net Minority Interest',
    'Cash Flowsfromusedin Operating Activities Direct',
    'Investing Cash Flow',
    'Financing Cash Flow'
]

cleaned_data = scrapped_data[scrapped_data['account'].isin(kept_values)]

In [18]:
cleaned_data['account'] = cleaned_data['account'].replace(
    'Cash Flowsfromusedin Operating Activities Direct', 
    'Operating Cash Flow'
)

cleaned_data['account'] = cleaned_data['account'].replace(
    'Total Liabilities Net Minority Interest', 
    'Total Liabilities'
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data['account'] = cleaned_data['account'].replace(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data['account'] = cleaned_data['account'].replace(


### Table Transformation

In [19]:
#  Melt table to long format
df_long = pd.melt(cleaned_data, id_vars=['symbol', 'account', 'type'], var_name='Year', value_name='Value')

# Pivot tabel to get desired format
df_pivot = df_long.pivot(index=['symbol', 'Year'], columns='account', values='Value').reset_index()

# Change column names
df_pivot.columns.name = None  # Delete col name
df_pivot = df_pivot.rename_axis(None, axis=1)

In [20]:
# Convert all numerical columns to float64
for col in ['Financing Cash Flow', 'Investing Cash Flow', 'Operating Cash Flow', 'Net Income', 'Total Revenue', 'Total Assets', 'Total Liabilities']:
    df_pivot[col] = pd.to_numeric(df_pivot[col], errors='coerce')

### Add additional columns

In [21]:
# Add Liabilities to Assets % Column
df_pivot['Liabilities to Assets %'] = (df_pivot['Total Liabilities'] / df_pivot['Total Assets']) * 100

# Add Net Margin % Column
df_pivot['Net Margin %'] = (df_pivot['Net Income'] / df_pivot['Total Revenue']) * 100

In [22]:
# Order Columns for better readibility
ordered_columns = ['symbol', 'Year', 'Financing Cash Flow', 'Investing Cash Flow', 'Operating Cash Flow', 'Total Revenue', 'Net Income', 'Net Margin %', 'Total Assets', 'Total Liabilities', 'Liabilities to Assets %']
df_pivot = df_pivot[ordered_columns]

In [23]:
df_pivot

Unnamed: 0,symbol,Year,Financing Cash Flow,Investing Cash Flow,Operating Cash Flow,Total Revenue,Net Income,Net Margin %,Total Assets,Total Liabilities,Liabilities to Assets %
0,ACES,2020,-4.050379e+11,-1.659800e+11,1.538615e+12,7.412767e+12,7.331955e+11,9.890982,7.216725e+12,1.873127e+12,25.955363
1,ACES,2021,-8.978434e+11,-8.337988e+10,1.305426e+12,6.543363e+12,6.907705e+11,10.556812,7.171138e+12,1.592158e+12,22.202307
2,ACES,2022,-9.394425e+11,-1.023613e+11,6.188384e+11,6.762803e+12,6.643429e+11,9.823483,7.249255e+12,1.315266e+12,18.143465
3,ACES,2023,-1.041742e+12,-1.306747e+11,1.356430e+12,7.611866e+12,7.635075e+11,10.030490,7.753269e+12,1.566872e+12,20.209172
4,ADRO,2020,-7.779670e+08,-3.616570e+08,7.387530e+08,2.534842e+09,1.469270e+08,5.796298,6.381566e+09,2.429852e+09,38.076109
...,...,...,...,...,...,...,...,...,...,...,...
175,UNTR,2023,-1.223004e+13,-3.343977e+13,2.634685e+13,1.285833e+14,2.061178e+13,16.029905,1.540282e+14,6.999268e+13,45.441460
176,UNVR,2020,-7.458509e+12,-6.902160e+11,8.363993e+12,4.297247e+13,7.163536e+12,16.670057,2.053463e+13,1.559726e+13,75.955897
177,UNVR,2021,-7.739754e+12,-6.812550e+11,7.902091e+12,3.954596e+13,5.758148e+12,14.560648,1.906853e+13,1.474726e+13,77.338219
178,UNVR,2022,-7.357788e+12,-5.260630e+11,8.061314e+12,4.121888e+13,5.364761e+12,13.015300,1.831811e+13,1.432086e+13,78.178671


### Transforming Data to Star Schema