# DataFrame

## 构造器

```python
'''
表示二维、大小可变、可以异构的表格数据
'''
pandas.DataFrame(data, index, columns, dtype, copy)
```

In [33]:
import pandas as pd
import numpy as np
import io

In [9]:
# Build a DataFrame from a dict: each key becomes a column label.
print('------- Constructing DataFrame from a dictionary -------')
d = dict(col1=[1, 2], col2=[3, 4])
df = pd.DataFrame(d)
print(df)

# Build a DataFrame from a 4x3 ndarray, with explicit row/column labels
# and the values stored as int8.
print('\n------- Constructing DataFrame from numpy ndarray -------')
d = np.arange(12).reshape(4, 3)
row_labels = list('abcd')
column_labels = list('xyz')
df = pd.DataFrame(d, index=row_labels, columns=column_labels, dtype=np.int8)
print(df)
# Last expression of the cell: the notebook displays the per-column dtypes.
df.dtypes

------- Constructing DataFrame from a dictionary -------
   col1  col2
0     1     3
1     2     4

------- Constructing DataFrame from numpy ndarray -------
   x   y   z
a  0   1   2
b  3   4   5
c  6   7   8
d  9  10  11


x    int8
y    int8
z    int8
dtype: object

## 属性和基础数据

```python
'''
DataFrame 的属性
'''
DataFrame.index
DataFrame.columns
DataFrame.dtypes
DataFrame.values  # 以 Numpy 格式返回 DataFrame 的值
DataFrame.axes
DataFrame.ndim  # DataFrame 轴的个数或数组的维度
DataFrame.size  # DataFrame 中元素的个数
DataFrame.shape
DataFrame.empty
```

In [19]:
# Row labels (index) and column labels, shown together as a tuple.
(df.index, df.columns)

(Index(['a', 'b', 'c', 'd'], dtype='object'),
 Index(['x', 'y', 'z'], dtype='object'))

In [13]:
# Data type of each column.
df.dtypes

x    int8
y    int8
z    int8
dtype: object

In [14]:
# The DataFrame's values as a NumPy array.
df.values

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]], dtype=int8)

In [15]:
# List of the axes: the row index first, then the column index.
df.axes

[Index(['a', 'b', 'c', 'd'], dtype='object'),
 Index(['x', 'y', 'z'], dtype='object')]

In [16]:
# Number of axes (array dimensions) of the DataFrame.
df.ndim

2

In [21]:
# Total number of elements, and the (rows, columns) shape.
(df.size, df.shape)

(12, (4, 3))

In [22]:
# Whether the DataFrame is empty (False for this 4x3 frame).
df.empty

False

```python
'''
打印 DataFrame 概要
 - verbose     :bool, 是否打印完整概要
 - buf         :writable buffer, 默认为 sys.stdout
 - max_cols    :int, 列数超过 max_cols 时由完整输出切换为精简输出（不再逐列列出）；默认取 pandas.options.display.max_info_columns
 - memory_usage:bool/str, 是否显示内存占用大小；指定'deep'时计算真实的内存占用，否则根据列的数据类型以及行数估计内存占用
 - show_counts :bool, 是否显示 non-null 的数目
'''
DataFrame.info(verbose, buf, max_cols, memory_usage, show_counts)
```

In [35]:
# Demonstrates DataFrame.info() and its verbose / max_cols / show_counts /
# buf / memory_usage parameters.

# Small frame with one integer, one text and one float column.
int_values = [1, 2, 3, 4, 5]
text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
df = pd.DataFrame({"int_col": int_values, "text_col": text_values, "float_values": float_values})

line = "=========="
print(f'{line} DataFrame {line}')
print(df)

# NOTE: info() writes the summary to its buffer (sys.stdout by default) and
# returns None, so wrapping the call in print() also emits a "None" line
# after each summary.
print(f'{line} Info of all columns {line}')
print(df.info(verbose=True))

print(f'{line} A summary of columns count {line}')
print(df.info(verbose=False))

print(f'{line} Set "max_cols = 1" {line}')
print(df.info(verbose=True, max_cols=1))

print(f'{line} Do not show non-null counts {line}')
print(df.info(verbose=True, show_counts=False))

# Capture the summary in an in-memory text buffer, then save it to a file.
# The buffer must be a StringIO *instance*; passing the class itself makes
# both info(buf=...) and getvalue() fail.
print(f'{line} Write info to a text file {line}')
buffer = io.StringIO()
df.info(buf=buffer)
s = buffer.getvalue()
with open("df_info.txt", "w", encoding="utf-8") as f:
    f.write(s)

# Three string columns of 10 ** 6 rows each, to compare the default memory
# report with the deep one.
print(f'{line} Big DataFrame {line}')
df = pd.DataFrame({"column_1": np.random.choice(['a', 'b', 'c'], 10 ** 6),
                   "column_2": np.random.choice(['a', 'b', 'c'], 10 ** 6),
                   "column_3": np.random.choice(['a', 'b', 'c'], 10 ** 6)})
print(df.info())

# memory_usage='deep' measures the real memory consumption instead of
# estimating it from the column dtypes and the row count.
print(f'{line} Deep introspection mode {line}')
print(df.info(memory_usage='deep'))

   int_col text_col  float_values
0        1    alpha          0.00
1        2     beta          0.25
2        3    gamma          0.50
3        4    delta          0.75
4        5  epsilon          1.00
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   int_col       5 non-null      int64  
 1   text_col      5 non-null      object 
 2   float_values  5 non-null      float64
dtypes: float64(1), int64(1), object(1)
memory usage: 252.0+ bytes
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Columns: 3 entries, int_col to float_values
dtypes: float64(1), int64(1), object(1)
memory usage: 252.0+ bytes
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column        Dtype  
---  ------        -----  
 0   int_col       int64  
 1   text_col      object 
 2   float_values

```python
'''
根据列的数据类型选择 DataFrame 的子集
'''
DataFrame.select_dtypes(include, exclude)
```

In [37]:
# Demonstrates DataFrame.select_dtypes(): split a mixed-type frame into its
# numeric and non-numeric columns.
int_values = [1, 2, 3, 4, 5]
text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
column_data = {
    "int_col": int_values,
    "text_col": text_values,
    "float_values": float_values,
}
df = pd.DataFrame(column_data)

print(f'{line} DataFrame {line}')
print(df)

# Keep only the numeric columns.
print(f'{line} Include number {line}')
numeric_part = df.select_dtypes(include='number')
print(numeric_part)

# Keep everything except the numeric columns.
print(f'{line} Exclude number {line}')
non_numeric_part = df.select_dtypes(exclude='number')
print(non_numeric_part)

   int_col text_col  float_values
0        1    alpha          0.00
1        2     beta          0.25
2        3    gamma          0.50
3        4    delta          0.75
4        5  epsilon          1.00
   int_col  float_values
0        1          0.00
1        2          0.25
2        3          0.50
3        4          0.75
4        5          1.00
  text_col
0    alpha
1     beta
2    gamma
3    delta
4  epsilon
