Planning for this cycle

- Cleaning the dataset
- Ranking the best stocks and choosing among them
- Creating a first solution quickly

# 1.0 - Packages

## 1.1 - Libraries

In [3]:
# standard library
import os
from datetime import datetime

# data manipulation
import pandas as pd
from IPython.display import clear_output

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# jupyter viewer
from IPython.core.display import display, HTML

## 1.2 - Helper Functions

In [4]:
def jupyter_settings():
    """Apply the notebook-wide display settings for matplotlib, seaborn and pandas.

    Takes no arguments and returns nothing: it only changes global library
    options and injects a CSS rule into the notebook page.
    """
    # figure size and font size shared by matplotlib and seaborn
    plot_rc = {'figure.figsize': [25, 12], 'font.size': 20}

    # matplotlib style and plot sizes
    plt.style.use('bmh')
    plt.rcParams.update(plot_rc)
    # show every column and row, and print floats with three decimals
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.options.display.float_format = '{:.3f}'.format
    pd.set_option('display.expand_frame_repr', False)
    # stretch the notebook to the full browser width; the CSS class is
    # `.container` (the previous `.conteiner` selector matched nothing)
    display(HTML('<style>.container{width:100% !important;}</style>'))

    # apply the seaborn theme; the sizes are passed again through `rc` because
    # the theme sets its own font sizes and would otherwise override them
    sns.set(rc=plot_rc)

# apply the display settings defined above
jupyter_settings()

# directory that holds the raw dataset; set the RAW_DATA_PATH environment
# variable to run the notebook on another machine (the default is the
# original local path)
raw_path = os.environ.get(
    'RAW_DATA_PATH',
    '/home/diegopmayer/Documents/projects/ideias/analise-acoes/notebooks/dataset/raw',
)

# 2.0 - Data Extraction

In [5]:
# load the dataset saved in step 2.2 of cycle 01 (Feather file stored under raw_path)
dataset = pd.read_feather(f'{raw_path}/dataset.ftr')

In [6]:
# preview the first rows to check the columns and values that were loaded
dataset.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Currency,symbol
0,2007-09-27,5.87,6.13,5.87,5.87,365422,BRL,ABCB4
1,2007-09-28,5.92,5.92,5.83,5.83,94941,BRL,ABCB4
2,2007-10-01,5.79,6.11,5.79,5.94,265772,BRL,ABCB4
3,2007-10-02,5.93,5.96,5.75,5.93,34930,BRL,ABCB4
4,2007-10-03,5.76,5.91,5.76,5.91,34964,BRL,ABCB4


In [7]:
# normalize every column name to lower case, then show the resulting names
dataset.columns = [column_name.lower() for column_name in dataset.columns]
dataset.columns

Index(['date', 'open', 'high', 'low', 'close', 'volume', 'currency', 'symbol'], dtype='object')

# 3.0 - Data Cleaning

## 3.1 - Dimension and NaN

In [8]:
# report the number of rows and columns, then the count of missing values per column
print('{} lines\n     {} Columns'.format(*dataset.shape))
print(dataset.isna().sum())

959655 lines
     8 Columns
date        0
open        0
high        0
low         0
close       0
volume      0
currency    0
symbol      0
dtype: int64


- Conclusion
    - The dataset has almost 1 million rows but only 8 columns, so it is easy to process
    - There are no null values, so no null cleaning is needed

## 3.2 - Data Type

In [9]:
# list the dtype of every column, then preview the last rows
print('Types of data:', dataset.dtypes, sep='\n')
dataset.tail()

Types of data:
date        datetime64[ns]
open               float64
high               float64
low                float64
close              float64
volume               int64
currency            object
symbol              object
dtype: object


Unnamed: 0,date,open,high,low,close,volume,currency,symbol
959650,2021-07-07,4.73,4.84,4.69,4.83,1399700,BRL,BMGB4
959651,2021-07-08,4.76,4.76,4.68,4.7,949100,BRL,BMGB4
959652,2021-07-12,4.74,4.86,4.71,4.84,1955700,BRL,BMGB4
959653,2021-07-13,4.84,4.9,4.75,4.89,1491600,BRL,BMGB4
959654,2021-07-14,4.89,4.91,4.83,4.86,333100,BRL,BMGB4


- Conclusion
    - All data types are correct

## 3.3 - Descriptive Statistics

In [10]:
# summary statistics for every column (numeric, datetime and categorical), transposed
# NOTE(review): `datetime_is_numeric` only exists in pandas 1.1-1.x; pandas 2.0 removed it
# (datetimes are always summarized as numeric there) - drop the argument when upgrading
dataset.describe(include="all", datetime_is_numeric=True).T

Unnamed: 0,count,unique,top,freq,mean,min,25%,50%,75%,max,std
date,959655.0,,,,2013-05-15 07:21:12.123419648,1995-01-02 00:00:00,2009-06-10 00:00:00,2014-07-25 00:00:00,2018-05-18 00:00:00,2021-07-14 00:00:00,
open,959655.0,,,,29.709,0.000,3.900,9.120,18.320,84410.360,346.521
high,959655.0,,,,30.156,0.000,3.990,9.280,18.640,20200.390,342.037
low,959655.0,,,,28.990,0.000,3.820,8.950,17.990,19468.930,326.774
close,959655.0,,,,29.573,0.000,3.900,9.110,18.310,20157.870,334.334
volume,959655.0,,,,12633708.334,0.000,11900.000,203418.000,1477400.000,77548822528.000,347732917.884
currency,959655.0,1.0,BRL,959655.0,,,,,,,
symbol,959655.0,374.0,BBDC3,13040.0,,,,,,,


Points of view

- there are prices equal to zero ("0")
    - ✅ dropped the 8 records with a zero closing price
- there are volumes equal to zero
    - ⚠️ not solved, left for another cycle
- the highest prices per stock in "max" are around 20 thousand (above 84 thousand for "open") — is that correct?

### 3.3.1 - Price equal zero

In [11]:
# zero quotes
# name of the weekday for each date; the vectorized `.dt.day_name()` replaces a
# row-by-row `strftime('%A')` call and always returns English day names
dataset['weekday'] = dataset['date'].dt.day_name()
# minimum of every column per weekday, to see on which days zero prices show up
dataset.groupby(by=['weekday']).min().reset_index()

Unnamed: 0,weekday,date,open,high,low,close,volume,currency,symbol
0,Friday,1995-01-06,0.0,0.0,0.0,0.0,0,BRL,AALR3
1,Monday,1995-01-02,0.01,0.01,0.01,0.01,0,BRL,AALR3
2,Thursday,1995-01-05,0.0,0.0,0.0,0.0,0,BRL,AALR3
3,Tuesday,1995-01-03,0.01,0.01,0.01,0.01,0,BRL,AALR3
4,Wednesday,1995-01-04,0.0,0.0,0.0,0.0,0,BRL,AALR3


In [12]:
# remove the rows whose closing price is zero, reporting how many are dropped
zero_close = dataset['close'].eq(0)
print(f"{zero_close.sum()} registers deleted")
dataset.drop(index=dataset.index[zero_close], inplace=True)

8 registers deleted


### 3.3.2 - Operation volume equal zero

In [13]:
# add a `year` column taken from `date`, then count the zero-volume rows per year
dataset['year'] = dataset['date'].dt.year
no_volume = dataset['volume'] == 0
dataset[no_volume].groupby('year').count().head()

Unnamed: 0_level_0,date,open,high,low,close,volume,currency,symbol,weekday
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1995,280,280,280,280,280,280,280,280,280
1996,385,385,385,385,385,385,385,385,385
1997,353,353,353,353,353,353,353,353,353
1998,225,225,225,225,225,225,225,225,225
1999,301,301,301,301,301,301,301,301,301
