# InvestBot: a bot for stock classification using fundamentalist analysis

<p align='center'>
    <img src='img/bot.jpg'>
</p>

# 0.0 Imports

In [1]:
import pandas as pd
import numpy  as np
import os

from pandas_datareader import data as web

## 0.1. Helper Functions

## 0.2. Loading Data

### 0.2.1. Balance Sheet and Cash Flow Statement

In [40]:
# Collect balance sheet and cash flow statement
fundamentals = {}
files = os.listdir( "Balances" )
companies = []

for file in files:
    name = file[:-4]
    
    # Balance Sheet
    balance = pd.read_excel( f'Balances/{file}', sheet_name=0 )
    
    # Name of the company as a title in the first column
    balance.iloc[0, 0] = name
    
    # Make the first line a header
    balance.columns = balance.iloc[0]
    balance = balance.iloc[1:]
    
    # Make the first column an index
    balance = balance.set_index( name )
    
    # Cash Flow Statement
    cfs = pd.read_excel( f'Balances/{file}', sheet_name=1 )
    
    # Name of the company as a title in the first column
    cfs.iloc[0, 0] = name
    
    # Make the first line a header
    cfs.columns = cfs.iloc[0]
    cfs = cfs.iloc[1:]
    
    # Make the first column an index
    cfs = cfs.set_index( name )
    
    try:
        fundamentals[name] = balance.append( cfs )
        companies.append( name )
    except:
        pass



































































### 0.2.2. Stock Quotes

In [None]:
# Collect stock quotes
quotes = {}

for company in companies:
    try:
        quotes[company] = web.DataReader( f'{company}.SA', data_source='yahoo', start="06/30/2011", end="03/31/2020" )
    except:
        pass

In [20]:
# Saving quotes
for company in quotes:
    quotes[company].to_csv( f'Quotes/{company}.csv' )

In [41]:
# Collect stock quotes
quotes = {}
files = os.listdir( "Quotes" )

for file in files:
    name = file[:-4]
    quotes[name] = pd.read_csv( f'Quotes/{file}' ) 

### 0.2.2. Remove companies from the fundamentals that do not have quotes for the period collected

In [42]:
for company in companies:
    if company not in quotes:
        fundamentals.pop( company )

companies = list( quotes.keys() )

### 0.2.3. Remove companies that have empty quotes

In [43]:
for company in companies:
    if quotes[company].isnull().values.any():
        quotes.pop( company )
        fundamentals.pop( company )

companies = list( quotes.keys() )

### 0.2.4. Joining the fundamentals and the quotes

In [44]:
for company in fundamentals:
    # Fundamentals
    # Turn rows into columns
    table = fundamentals[company].T
    
    # Transform index to date type
    table.index = pd.to_datetime( table.index, format="%d/%m/%Y" )
    
    # Quotes
    # Put date column as index
    quote_table = quotes[company].set_index( "Date" )
    
    # Select only the column Adj Close
    quote_table = quote_table[['Adj Close']]
    
    # Merge
    table = table.merge( quote_table, right_index=True, left_index=True )
    
    # Inserting name in index column
    table.index.name = company
    
    fundamentals[company] = table

### 0.2.5. Treat columns

- Catch only companies that have the same columns
- Columns with repeated names
- Missing values

**1. Catch only companies that have the same columns**

In [45]:
columns = list ( fundamentals['PETR4'].columns )

for company in companies:
    if set( columns ) != set( fundamentals[company].columns ):
        fundamentals.pop( company )

**2. Columns with repeated names**

In [46]:
text_columns = ";".join( columns )

modified_columns = []
for column in columns:
    if columns.count( column ) == 2 and column not in modified_columns:
        text_columns = text_columns.replace( ";" + column + ";", ";" + column + "_1;", 1 )
        modified_columns.append( column )
        
columns = text_columns.split( ';' )

In [48]:
for company in fundamentals:
    fundamentals[company] = fundamentals[company][columns]

KeyError: "['Tributos Diferidos_1', 'Ativos Biológicos_1', 'Adiantamento para Futuro Aumento Capital_1', 'Estoques_1', 'Contas a Receber_1', 'Outros_1', 'Empréstimos e Financiamentos_1', 'Passivos com Partes Relacionadas_1', 'Despesas Antecipadas_1', 'Passivos sobre Ativos Não-Correntes a Venda e Descontinuados_1', 'Provisões_1'] not in index"

### 0.2.4. Making everything a single dataframe

In [21]:
copy_fundamentals = fundamentals.copy()

In [30]:
df = pd.DataFrame()

for company in copy_fundamentals:
    pd.concat( [df, copy_fundamentals[company]], axis=0, ignore_index=True )

df.to_csv('Fundamentals.csv' )
df.head()

ValueError: Length mismatch: Expected axis has 0 elements, new values have 80 elements

In [34]:
copy_fundamentals['PETR4']

Unnamed: 0_level_0,Ativo Total,Ativo Circulante,Caixa e Equivalentes de Caixa,Aplicações Financeiras,Contas a Receber,Estoques,Ativos Biológicos,Tributos a Recuperar,Despesas Antecipadas,Outros Ativos Circulantes,...,Receitas,Despesas,Resultado Antes Tributação/Participações,Provisão para IR e Contribuição Social,IR Diferido,Participações/Contribuições Estatutárias,Reversão dos Juros sobre Capital Próprio,Part. de Acionistas Não Controladores,Lucro/Prejuízo do Período,Adj Close
PETR4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-03-31,971645000.0,163562000.0,80382000.0,3346000.0,15866000.0,31236000.0,0,13150000.0,0.0,19582000.0,...,,,-66618000.0,-597000.0,17491000.0,,,1201000.0,-48523000.0,11.370116
2019-09-30,924465000.0,147601000.0,54882000.0,5427000.0,17495000.0,31583000.0,0,10788000.0,0.0,27426000.0,...,,,3429000.0,758000.0,-4696000.0,,,247000.0,9087000.0,21.939379
2017-06-30,808054000.0,142435000.0,77970000.0,3317000.0,14477000.0,26621000.0,0,8361000.0,0.0,11689000.0,...,,,6770000.0,-2573000.0,-3905000.0,,,24000.0,316000.0,9.327338
2017-03-31,788046000.0,134058000.0,60874000.0,2909000.0,14042000.0,26172000.0,0,8167000.0,0.0,21894000.0,...,,,7127000.0,-826000.0,-1494000.0,,,-358000.0,4449000.0,10.92588
2016-09-30,803206000.0,144753000.0,70060000.0,2542000.0,16953000.0,27627000.0,0,8709000.0,0.0,18862000.0,...,,,-17294000.0,-1009000.0,1980000.0,,,-135000.0,-16458000.0,10.232175
2016-06-30,818332000.0,132625000.0,62940000.0,2430000.0,17047000.0,28508000.0,0,9285000.0,0.0,12415000.0,...,,,1521000.0,-1911000.0,1289000.0,,,-529000.0,370000.0,7.102953
2016-03-31,859160000.0,146243000.0,77778000.0,2729000.0,18865000.0,29098000.0,0,10612000.0,0.0,7161000.0,...,,,-157000.0,-1637000.0,1413000.0,,,-865000.0,-1246000.0,6.265982
2015-09-30,931562000.0,176380000.0,99870000.0,4379000.0,21155000.0,32585000.0,0,10172000.0,0.0,8219000.0,...,,,-5199000.0,-814000.0,988000.0,,,1266000.0,-3759000.0,5.45917
2015-06-30,859299000.0,160380000.0,81166000.0,10478000.0,20050000.0,33771000.0,0,9927000.0,0.0,4988000.0,...,,,3581000.0,-905000.0,-1768000.0,,,-377000.0,531000.0,9.568627
2015-03-31,831948000.0,137565000.0,34450000.0,33828000.0,20737000.0,32031000.0,0,9674000.0,0.0,6845000.0,...,,,7551000.0,-979000.0,-2044000.0,,,802000.0,5330000.0,7.283921


# 1.0. Data Description

# 2.0. Feature Engineering

# 3.0. Data Filtering

# 4.0.Exploratory Data Analysis

# 5.0. Data Preparation

# 6.0. Feature Selection

# 7.0. Machine Learning Modeling

# 8.0. Hyperparameter Fine Tuning

# 9.0. Machine Learning Performance

# 10.0. Deploy Model to Production

# 11.0. Conclusions

In [33]:
columns

['Ativo Total',
 'Ativo Circulante',
 'Caixa e Equivalentes de Caixa',
 'Aplicações Financeiras',
 'Contas a Receber_1',
 'Estoques_1',
 'Ativos Biológicos_1',
 'Tributos a Recuperar',
 'Despesas Antecipadas_1',
 'Outros Ativos Circulantes',
 'Ativo Realizável a Longo Prazo',
 'Aplicações Financeiras Avaliadas a Valor Justo',
 'Aplicações Financeiras Avaliadas ao Custo Amortizado',
 'Contas a Receber',
 'Estoques',
 'Ativos Biológicos',
 'Tributos Diferidos_1',
 'Despesas Antecipadas',
 'Créditos com Partes Relacionadas',
 'Outros Ativos Não Circulantes',
 'Investimentos',
 'Imobilizado',
 'Intangível',
 'Diferido',
 'Passivo Total',
 'Passivo Circulante',
 'Obrigações Sociais e Trabalhistas',
 'Fornecedores',
 'Obrigações Fiscais',
 'Empréstimos e Financiamentos_1',
 'Passivos com Partes Relacionadas_1',
 'Dividendos e JCP a Pagar',
 'Outros_1',
 'Provisões_1',
 'Passivos sobre Ativos Não-Correntes a Venda e Descontinuados_1',
 'Passivo Não Circulante',
 'Empréstimos e Financiamentos'

In [37]:
df = pd.DataFrame( { 'a': [1], 'b': [2], 'c': [3] } )

columns = ['b', 'c', 'a']
df = df[columns]
df

Unnamed: 0,b,c,a
0,2,3,1


In [39]:
text_columns.split( ';' )

['Ativo Total',
 'Ativo Circulante',
 'Caixa e Equivalentes de Caixa',
 'Aplicações Financeiras',
 'Contas a Receber_1',
 'Estoques_1',
 'Ativos Biológicos_1',
 'Tributos a Recuperar',
 'Despesas Antecipadas_1',
 'Outros Ativos Circulantes',
 'Ativo Realizável a Longo Prazo',
 'Aplicações Financeiras Avaliadas a Valor Justo',
 'Aplicações Financeiras Avaliadas ao Custo Amortizado',
 'Contas a Receber',
 'Estoques',
 'Ativos Biológicos',
 'Tributos Diferidos_1',
 'Despesas Antecipadas',
 'Créditos com Partes Relacionadas',
 'Outros Ativos Não Circulantes',
 'Investimentos',
 'Imobilizado',
 'Intangível',
 'Diferido',
 'Passivo Total',
 'Passivo Circulante',
 'Obrigações Sociais e Trabalhistas',
 'Fornecedores',
 'Obrigações Fiscais',
 'Empréstimos e Financiamentos_1',
 'Passivos com Partes Relacionadas_1',
 'Dividendos e JCP a Pagar',
 'Outros_1',
 'Provisões_1',
 'Passivos sobre Ativos Não-Correntes a Venda e Descontinuados_1',
 'Passivo Não Circulante',
 'Empréstimos e Financiamentos'