## Extrair informação de uso de memória

In [1]:
import io
import re

def get_pandas_dataframe_info(df):
    buf = io.StringIO()
    df.info(buf=buf)
    
    info = buf.getvalue()
    
    result = re.search(r'.*memory usage:\s(.*)', info)
    return result.group(1)

## 1  -  Carregamento padrão (Sem otimizações):

In [2]:
import pandas as pd

In [3]:
data_source = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'

In [4]:
df = pd.read_csv(data_source)

In [5]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
rows, cols = df.shape

In [8]:
print(f'Rows: {rows}')
print(f'Cols: {cols}')
print(f'Memory Size: {get_pandas_dataframe_info(df)}')

Rows: 891
Cols: 12
Memory Size: 83.7+ KB


## 2 - Limitar quantidade de linhas

In [9]:
import pandas as pd

In [10]:
data_source = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
nrows = 100

In [11]:
df = pd.read_csv(data_source, nrows=nrows)

In [12]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [13]:
rows, cols = df.shape

In [14]:
print(f'Rows: {rows}')
print(f'Cols: {cols}')
print(f'Memory Size: {get_pandas_dataframe_info(df)}')

Rows: 100
Cols: 12
Memory Size: 9.5+ KB


## 3 - Selecionar colunas desejadas

### 3.1 - Seleção por nomes

In [15]:
import pandas as pd

In [16]:
data_source = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
usecols = ['PassengerId', 'Survived', 'Pclass', 'Sex', 'Age', 'Embarked']

In [17]:
df = pd.read_csv(data_source, usecols=usecols)

In [18]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Embarked
0,1,0,3,male,22.0,S
1,2,1,1,female,38.0,C
2,3,1,3,female,26.0,S
3,4,1,1,female,35.0,S
4,5,0,3,male,35.0,S


In [19]:
rows, cols = df.shape

In [20]:
print(f'Rows: {rows}')
print(f'Cols: {cols}')
print(f'Memory Size: {get_pandas_dataframe_info(df)}')

Rows: 891
Cols: 6
Memory Size: 41.9+ KB


### 3.2 - Seleção por índices

In [21]:
import pandas as pd

In [22]:
data_source = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
usecols = [0, 1, 2, 4, 5, 7]

In [23]:
df = pd.read_csv(data_source, usecols=usecols)

In [24]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,Parch
0,1,0,3,male,22.0,0
1,2,1,1,female,38.0,0
2,3,1,3,female,26.0,0
3,4,1,1,female,35.0,0
4,5,0,3,male,35.0,0


In [25]:
rows, cols = df.shape

In [26]:
print(f'Rows: {rows}')
print(f'Cols: {cols}')
print(f'Memory Size: {get_pandas_dataframe_info(df)}')

Rows: 891
Cols: 6
Memory Size: 41.9+ KB


## 4 - Definindo tipos de dados

### 4.1 - Definição manual

In [41]:
import pandas as pd

In [42]:
data_source = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'

In [43]:
df = pd.read_csv(data_source)

In [44]:
print(df.dtypes)

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [45]:
df.Sex = df.Sex.astype('category')
df.Embarked = df.Embarked.astype('category')
df.Survived = df.Survived.astype('category')
df.Pclass = df.Pclass.astype('category')
df.PassengerId = df.PassengerId.astype('int32')
df.Parch = df.Parch.astype('int32')
df.SibSp = df.SibSp.astype('int32')

In [46]:
print(df.dtypes)

PassengerId       int32
Survived       category
Pclass         category
Name             object
Sex            category
Age             float64
SibSp             int32
Parch             int32
Ticket           object
Fare            float64
Cabin            object
Embarked       category
dtype: object


In [47]:
print(f'Rows: {rows}')
print(f'Cols: {cols}')
print(f'Memory Size: {get_pandas_dataframe_info(df)}')

Rows: 891
Cols: 6
Memory Size: 49.4+ KB


### 4.2 - Definição em tempo de carregamento

In [48]:
import pandas as pd

In [49]:
data_source = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
dtypes = {
    'Sex': 'category',
    'Embarked': 'category',
    'Survived': 'category',
    'Pclass': 'category',
    'PassengerId': 'int32',
    'Parch': 'int32',
    'SibSp': 'int32'
}

In [50]:
df = pd.read_csv(data_source, dtype=dtypes)

In [51]:
print(df.dtypes)

PassengerId       int32
Survived       category
Pclass         category
Name             object
Sex            category
Age             float64
SibSp             int32
Parch             int32
Ticket           object
Fare            float64
Cabin            object
Embarked       category
dtype: object


In [53]:
print(f'Rows: {rows}')
print(f'Cols: {cols}')
print(f'Memory Size: {get_pandas_dataframe_info(df)}')

Rows: 891
Cols: 6
Memory Size: 49.2+ KB


#### Carregando por partes (lazy)

In [69]:
import pandas as pd

In [70]:
data_source = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'

In [71]:
df = pd.read_csv(data_source, chunksize=50)

In [72]:
print(f'{"Chunk":^15}|{"Memory Size":^15}|{"Rows":^15}|{"Cols":^15}')

for i, chunk in enumerate(df):
    chunk_memory_info = get_pandas_dataframe_info(chunk)
    chunk_rows, chunk_cols = chunk.shape
    print(f'{(i+1):^15}|{chunk_memory_info:^15}|{chunk_rows:^15}|{chunk_cols:^15}')

     Chunk     |  Memory Size  |     Rows      |     Cols      
       1       |    4.8+ KB    |      50       |      12       
       2       |    4.8+ KB    |      50       |      12       
       3       |    4.8+ KB    |      50       |      12       
       4       |    4.8+ KB    |      50       |      12       
       5       |    4.8+ KB    |      50       |      12       
       6       |    4.8+ KB    |      50       |      12       
       7       |    4.8+ KB    |      50       |      12       
       8       |    4.8+ KB    |      50       |      12       
       9       |    4.8+ KB    |      50       |      12       
      10       |    4.8+ KB    |      50       |      12       
      11       |    4.8+ KB    |      50       |      12       
      12       |    4.8+ KB    |      50       |      12       
      13       |    4.8+ KB    |      50       |      12       
      14       |    4.8+ KB    |      50       |      12       
      15       |    4.8+ KB    |      50