#### Многомерный pandas

In [5]:
import pandas as pd
import numpy as np
#Если размерность данных больше 2, то используют иерархическую индексацию (мультииндекс)
index = [
    ('city_1', 2010),
    ('city_1', 2020),
    ('city_2', 2010),
    ('city_2', 2020),
    ('city_3', 2010),
    ('city_3', 2020),
]

population = [
    101,
    201,
    102,
    202,
    103,
    203
]

pop = pd.Series(population, index=index)
print(pop)

(city_1, 2010)    101
(city_1, 2020)    201
(city_2, 2010)    102
(city_2, 2020)    202
(city_3, 2010)    103
(city_3, 2020)    203
dtype: int64


In [7]:
print(pop[ [i for i in pop.index if i[1] == 2020] ])

(city_1, 2020)    201
(city_2, 2020)    202
(city_3, 2020)    203
dtype: int64


In [9]:
# MultiIndex
index = pd.MultiIndex.from_tuples(index)
pop = pop.reindex(index)
print(pop)

city_1  2010    101
        2020    201
city_2  2010    102
        2020    202
city_3  2010    103
        2020    203
dtype: int64


In [11]:
print(pop[:, 2020])

city_1    201
city_2    202
city_3    203
dtype: int64


In [13]:
pop_df = pop.unstack()
print(pop_df)

        2010  2020
city_1   101   201
city_2   102   202
city_3   103   203


In [15]:
print(pop_df.stack())

city_1  2010    101
        2020    201
city_2  2010    102
        2020    202
city_3  2010    103
        2020    203
dtype: int64


In [17]:
index = [
    ('city_1', 2010, 1),
    ('city_1', 2010, 2),
    
    ('city_1', 2020, 1),
    ('city_1', 2020, 2),
    
    ('city_2', 2010, 1),
    ('city_2', 2010, 2),
    
    ('city_2', 2020, 1),
    ('city_2', 2020, 2),
    
    ('city_3', 2010, 1),
    ('city_3', 2010, 2),
    
    ('city_3', 2020, 1),
    ('city_3', 2020, 2),
]

population = [
    101,
    1010,
    201,
    2010,
    102,
    1020,
    202,
    2020,
    103,
    1030,
    203,
    2030
]
pop = pd.Series(population, index=index)
print(pop)

(city_1, 2010, 1)     101
(city_1, 2010, 2)    1010
(city_1, 2020, 1)     201
(city_1, 2020, 2)    2010
(city_2, 2010, 1)     102
(city_2, 2010, 2)    1020
(city_2, 2020, 1)     202
(city_2, 2020, 2)    2020
(city_3, 2010, 1)     103
(city_3, 2010, 2)    1030
(city_3, 2020, 1)     203
(city_3, 2020, 2)    2030
dtype: int64


In [19]:
index = pd.MultiIndex.from_tuples(index)
pop = pop.reindex(index)
print(pop)

city_1  2010  1     101
              2    1010
        2020  1     201
              2    2010
city_2  2010  1     102
              2    1020
        2020  1     202
              2    2020
city_3  2010  1     103
              2    1030
        2020  1     203
              2    2030
dtype: int64


In [21]:
print(pop[:, 2010])

city_1  1     101
        2    1010
city_2  1     102
        2    1020
city_3  1     103
        2    1030
dtype: int64


In [23]:
print(pop[:, :, 2])

city_1  2010    1010
        2020    2010
city_2  2010    1020
        2020    2020
city_3  2010    1030
        2020    2030
dtype: int64


In [25]:
pop_df = pop.unstack()
print(pop_df)

               1     2
city_1 2010  101  1010
       2020  201  2010
city_2 2010  102  1020
       2020  202  2020
city_3 2010  103  1030
       2020  203  2030


In [27]:
print(pop_df.stack())

city_1  2010  1     101
              2    1010
        2020  1     201
              2    2010
city_2  2010  1     102
              2    1020
        2020  1     202
              2    2020
city_3  2010  1     103
              2    1030
        2020  1     203
              2    2030
dtype: int64


In [29]:
pop_df = pd.DataFrame(
    {
        'total': pop,
        'something': [
            11,
            12,
            13,
            14,
            15,
            16,
            17,
            18,
            19,
            20,
            21,
            22
        ]
    }
)
print(pop_df)

               total  something
city_1 2010 1    101         11
            2   1010         12
       2020 1    201         13
            2   2010         14
city_2 2010 1    102         15
            2   1020         16
       2020 1    202         17
            2   2020         18
city_3 2010 1    103         19
            2   1030         20
       2020 1    203         21
            2   2030         22


In [31]:
print(pop_df['something'])

city_1  2010  1    11
              2    12
        2020  1    13
              2    14
city_2  2010  1    15
              2    16
        2020  1    17
              2    18
city_3  2010  1    19
              2    20
        2020  1    21
              2    22
Name: something, dtype: int64


In [33]:
pop_df_1 = pop_df.loc(['city_1', 'something'])
print(pop_df_1)

TypeError: unhashable type: 'list'

In [35]:
#Как можно создававть мультииндексы:
#1) Список массивов, задающих значение индекса на каждом уровне
i1 = pd.MultiIndex.from_arrays(
    [
        ['a', 'a', 'b', 'b'], 
        [1, 2, 1, 2]
    ]
)
print(i1)

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )


In [37]:
#2) Список кортежей, задающих значение индекса в каждой точке
i2 = pd.MultiIndex.from_tuples(
    [
        ('a', 1),
        ('a', 2), 
        ('b', 1), 
        ('b', 2)
    ]
)
print(i2)

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )


In [39]:
#3) Через декартово произведение обычных индексов
i3 = pd.MultiIndex.from_product(
    [
    ['a', 'b'], 
    [1, 2]
    ]
)
print(i3)

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )


In [41]:
#4) Описание внутреннего представления: levels, codes
i4 = pd.MultiIndex(
    levels = [
        ['a', 'b'],
        [1, 2]
    ],
    codes = [
        [0, 0, 1, 1],
        [0, 1, 0, 1]
    ]
)
print(i4)

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )


In [43]:
#уровням можно задавть названия
data = {
    ('city_1', 2010): 100,
    ('city_1', 2020): 101,
    ('city_2', 2010): 102,
    ('city_2', 2020): 103
}

s = pd.Series(data)
print(s)

city_1  2010    100
        2020    101
city_2  2010    102
        2020    103
dtype: int64


In [45]:
s.index.names = ['city', 'year']
print(s)

city    year
city_1  2010    100
        2020    101
city_2  2010    102
        2020    103
dtype: int64


In [47]:
index = pd.MultiIndex.from_product(
    [
        ['city_1', 'city_2'], 
        [2010, 2020]
    ],
    names=['city', 'year']
)
print(index)

MultiIndex([('city_1', 2010),
            ('city_1', 2020),
            ('city_2', 2010),
            ('city_2', 2020)],
           names=['city', 'year'])


In [49]:
columns = pd.MultiIndex.from_product(
    [
        ['person_1', 'person_2', 'person_3'], 
        ['jpb_1', 'job_2']
    ],
    names=['worker', 'job']
)

In [51]:
rng = np.random.default_rng(1)
data = rng.random((4, 6))
print(data)

[[0.51182162 0.9504637  0.14415961 0.94864945 0.31183145 0.42332645]
 [0.82770259 0.40919914 0.54959369 0.02755911 0.75351311 0.53814331]
 [0.32973172 0.7884287  0.30319483 0.45349789 0.1340417  0.40311299]
 [0.20345524 0.26231334 0.75036467 0.28040876 0.48519097 0.9807372 ]]


In [53]:
data_df = pd.DataFrame(data, index=index, columns=columns)
print(data_df)

worker       person_1            person_2            person_3          
job             jpb_1     job_2     jpb_1     job_2     jpb_1     job_2
city   year                                                            
city_1 2010  0.511822  0.950464  0.144160  0.948649  0.311831  0.423326
       2020  0.827703  0.409199  0.549594  0.027559  0.753513  0.538143
city_2 2010  0.329732  0.788429  0.303195  0.453498  0.134042  0.403113
       2020  0.203455  0.262313  0.750365  0.280409  0.485191  0.980737


In [55]:
#Индексация и срезы (по мультииндексу)
data = {
    ('city_1', 2010): 100,
    ('city_1', 2020): 101,
    ('city_2', 2010): 102,
    ('city_2', 2020): 103,
    ('city_3', 2010): 104,
    ('city_3', 2020): 105,
}

s = pd.Series(data)
s.index.names = ['city', 'year']
print(s['city_1', 2010])

100


In [57]:
print(s['city_1'])

year
2010    100
2020    101
dtype: int64


In [59]:
print(s.loc['city_1':'city_2'])

city    year
city_1  2010    100
        2020    101
city_2  2010    102
        2020    103
dtype: int64


In [61]:
print(s[:, 2010])

city
city_1    100
city_2    102
city_3    104
dtype: int64


In [63]:
print(s[s > 2010])

Series([], dtype: int64)


In [65]:
#перегруппировка мультииндексов
index = pd.MultiIndex.from_product(
    [
        ['a', 'c', 'b'],
        [1, 2]
    ]
)
data = pd.Series(rng.random(6), index=index)
data.index.names = ['char', 'int']
print(data)
#print(data['a':'b'])
data = data.sort_index()
print(data)
print(data['a':'b'])

char  int
a     1      0.961657
      2      0.724790
c     1      0.541227
      2      0.276891
b     1      0.160652
      2      0.969925
dtype: float64
char  int
a     1      0.961657
      2      0.724790
b     1      0.160652
      2      0.969925
c     1      0.541227
      2      0.276891
dtype: float64
char  int
a     1      0.961657
      2      0.724790
b     1      0.160652
      2      0.969925
dtype: float64


In [67]:
index = [
    ('city_1', 2010, 1),
    ('city_1', 2010, 2),
    
    ('city_1', 2020, 1),
    ('city_1', 2020, 2),
    
    ('city_2', 2010, 1),
    ('city_2', 2010, 2),
    
    ('city_2', 2020, 1),
    ('city_2', 2020, 2),
    
    ('city_3', 2010, 1),
    ('city_3', 2010, 2),
    
    ('city_3', 2020, 1),
    ('city_3', 2020, 2),
]

population = [
    101,
    1010,
    201,
    2010,
    102,
    1020,
    202,
    2020,
    103,
    1030,
    203,
    2030
]
pop = pd.Series(population, index=index)
print(pop)

(city_1, 2010, 1)     101
(city_1, 2010, 2)    1010
(city_1, 2020, 1)     201
(city_1, 2020, 2)    2010
(city_2, 2010, 1)     102
(city_2, 2010, 2)    1020
(city_2, 2020, 1)     202
(city_2, 2020, 2)    2020
(city_3, 2010, 1)     103
(city_3, 2010, 2)    1030
(city_3, 2020, 1)     203
(city_3, 2020, 2)    2030
dtype: int64


In [69]:
i = pd.MultiIndex.from_tuples(index)
pop = pop.reindex(i)
print(pop)

city_1  2010  1     101
              2    1010
        2020  1     201
              2    2010
city_2  2010  1     102
              2    1020
        2020  1     202
              2    2020
city_3  2010  1     103
              2    1030
        2020  1     203
              2    2030
dtype: int64


In [71]:
print(pop.unstack(level=0))
print(pop.unstack(level=1))
print(pop.unstack(level=2))

        city_1  city_2  city_3
2010 1     101     102     103
     2    1010    1020    1030
2020 1     201     202     203
     2    2010    2020    2030
          2010  2020
city_1 1   101   201
       2  1010  2010
city_2 1   102   202
       2  1020  2020
city_3 1   103   203
       2  1030  2030
               1     2
city_1 2010  101  1010
       2020  201  2010
city_2 2010  102  1020
       2020  202  2020
city_3 2010  103  1030
       2020  203  2030


In [73]:
#Конкатенация
x = [[1, 2, 3]]
y = [[4, 5, 6]]
z = [[7, 8, 9]]
print(np.concatenate([x, y, z]))

[[1 2 3]
 [4 5 6]
 [7 8 9]]


In [75]:
print(np.concatenate([x, y, z], axis=0))
print(np.concatenate([x, y, z], axis=1))

[[1 2 3]
 [4 5 6]
 [7 8 9]]
[[1 2 3 4 5 6 7 8 9]]


In [77]:
ser1 = pd.Series(['a', 'b', 'c'], index=[1, 2, 3])
ser2 = pd.Series(['d', 'e', 'f'], index=[4, 5, 6])
print(pd.concat([ser1, ser2]))

1    a
2    b
3    c
4    d
5    e
6    f
dtype: object


In [79]:
ser1 = pd.Series(['a', 'b', 'c'], index=[1, 2, 3])
ser2 = pd.Series(['d', 'e', 'f'], index=[1, 2, 6])
print(pd.concat([ser1, ser2]))

1    a
2    b
3    c
1    d
2    e
6    f
dtype: object


In [81]:
print(pd.concat([ser1, ser2], verify_integrity=False))

1    a
2    b
3    c
1    d
2    e
6    f
dtype: object


In [83]:
print(pd.concat([ser1, ser2], verify_integrity=True))

ValueError: Indexes have overlapping values: Index([1, 2], dtype='int64')

In [85]:
print(pd.concat([ser1, ser2], ignore_index=False))

1    a
2    b
3    c
1    d
2    e
6    f
dtype: object


In [87]:
print(pd.concat([ser1, ser2], ignore_index=True))

0    a
1    b
2    c
3    d
4    e
5    f
dtype: object


In [89]:
print(pd.concat([ser1, ser2], keys=['x', 'y']))

x  1    a
   2    b
   3    c
y  1    d
   2    e
   6    f
dtype: object


In [91]:
ser1 = pd.Series(['a', 'b', 'c'], index=[1, 2, 3])
ser2 = pd.Series(['b', 'e', 'f'], index=[4, 5, 6])
print(pd.concat([ser1, ser2], join='outer'))

1    a
2    b
3    c
4    b
5    e
6    f
dtype: object


In [93]:
print(pd.concat([ser1, ser2], join='inner'))

1    a
2    b
3    c
4    b
5    e
6    f
dtype: object
