# Transforming Data

In [6]:
# Import Raw Data
import pandas as pd
scrapped_data = pd.read_csv('datasets/lq45_financial_data_raw.csv')
scrapped_data.head()

Unnamed: 0,symbol,account,type,2019,2020,2021,2022,2023
0,ACES,Accounts Payable,BS,,164227200000.0,126723800000.0,144350100000.0,125623100000.0
1,ACES,Accounts Receivable,BS,,143482700000.0,66777640000.0,85854930000.0,198609800000.0
2,ACES,Accumulated Depreciation,BS,,-839618000000.0,-1867399000000.0,-1921765000000.0,-2003892000000.0
3,ACES,Additional Paid In Capital,BS,,440574900000.0,440574900000.0,440574900000.0,440574900000.0
4,ACES,Allowance For Doubtful Accounts Receivable,BS,,-55110070.0,-259305600.0,-3957857.0,-17999640.0


## Drop unused rows and columns

In [7]:
scrapped_data = scrapped_data.drop(columns=['2019'])

### Choose relevant rows

Drop all rows from df account, except:
- Total Revenue
- Net Income
- Total Assets
- Total Liabilities Net Minority Interest
- Cash Flowsfromusedin Operating Activities Direct
- Investing Cash Flow
- Financing Cash Flow

In [8]:
kept_values = [
    'Total Revenue',
    'Net Income',
    'Total Assets',
    'Total Liabilities Net Minority Interest',
    'Cash Flowsfromusedin Operating Activities Direct',
    'Investing Cash Flow',
    'Financing Cash Flow'
]

cleaned_data = scrapped_data[scrapped_data['account'].isin(kept_values)]

In [9]:
cleaned_data['account'] = cleaned_data['account'].replace(
    'Cash Flowsfromusedin Operating Activities Direct', 
    'Operating Cash Flow'
)

cleaned_data['account'] = cleaned_data['account'].replace(
    'Total Liabilities Net Minority Interest', 
    'Total Liabilities'
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data['account'] = cleaned_data['account'].replace(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data['account'] = cleaned_data['account'].replace(


### Table Transformation

In [10]:
#  Melt table to long format
df_long = pd.melt(cleaned_data, id_vars=['symbol', 'account', 'type'], var_name='Year', value_name='Value')

# Pivot tabel to get desired format
df_pivot = df_long.pivot(index=['symbol', 'Year'], columns='account', values='Value').reset_index()

# Change column names
df_pivot.columns.name = None  # Delete col name
df_pivot = df_pivot.rename_axis(None, axis=1)

In [11]:
# Convert all numerical columns to float64
for col in ['Financing Cash Flow', 'Investing Cash Flow', 'Operating Cash Flow', 'Net Income', 'Total Revenue', 'Total Assets', 'Total Liabilities']:
    df_pivot[col] = pd.to_numeric(df_pivot[col], errors='coerce')

### Add additional columns

In [12]:
# Add Liabilities to Assets % Column
df_pivot['Liabilities to Assets %'] = (df_pivot['Total Liabilities'] / df_pivot['Total Assets']) * 100

# Add Net Margin % Column
df_pivot['Net Margin %'] = (df_pivot['Net Income'] / df_pivot['Total Revenue']) * 100

In [14]:
# Order Columns for better readibility
ordered_columns = ['symbol', 'Year', 'Financing Cash Flow', 'Investing Cash Flow', 'Operating Cash Flow', 'Total Revenue', 'Net Income', 'Net Margin %', 'Total Assets', 'Total Liabilities', 'Liabilities to Assets %']
df_pivot = df_pivot[ordered_columns]

KeyError: "['Net Margin %Total Assets'] not in index"

In [None]:
df_pivot