# Pandas

In [4]:
import numpy as np
import pandas as pd

Pandas - расширение numpy (структурированные массивы). Строки и столбцы индексируются метками, а не только числовыми значениями

Структуры: Series, DataFrame, Index
#### Series

In [9]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
print(data)
print(type(data))

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
<class 'pandas.core.series.Series'>


In [13]:
print(data.values, type(data.values))
print(data.index, type(data.index))

[0.25 0.5  0.75 1.  ] <class 'numpy.ndarray'>
RangeIndex(start=0, stop=4, step=1) <class 'pandas.core.indexes.range.RangeIndex'>


In [17]:
data = pd.Series([0.25, 0.5, 0.75, 1.0])
print(data[0])
print(data[1:3])

0.25
1    0.50
2    0.75
dtype: float64


In [23]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
print(data)
print(data['a'])
print(data['b':'d'])

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.25
b    0.50
c    0.75
d    1.00
dtype: float64


In [25]:
print(type(data.index))

<class 'pandas.core.indexes.base.Index'>


In [27]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[1, 10, 7, 'd']) #необязательно использовать 
                                                                #индексы одного типа
print(data)
print(data[1])
print(data[10:'d'])

1     0.25
10    0.50
7     0.75
d     1.00
dtype: float64
0.25
10    0.50
7     0.75
d     1.00
dtype: float64


In [29]:
population_dict = {
    'city 1': 1001,
    'city 2': 1002,
    'city 3': 1003,
    'city 4': 1004,
    'city 5': 1005,
}

population = pd.Series(population_dict)
print(population)
print(population['city 4' : 'city 5'])

city 1    1001
city 2    1002
city 3    1003
city 4    1004
city 5    1005
dtype: int64
city 4    1004
city 5    1005
dtype: int64


Для создания Series можно использовать:
1. Списки Python и массивы Numpy
2. Скалярные значения
3. Словари

#### DataFrame 
Двумерный массив с явно определнными индексами. На него можно смотреть как на последовательность "согласованных" объектов Series

In [34]:
population_dict = {
    'city 1': 1001,
    'city 2': 1002,
    'city 3': 1003,
    'city 4': 1004,
    'city 5': 1005,
}

area_dict = {
    'city 1': 9991,
    'city 2': 9992,
    'city 3': 9993,
    'city 4': 9994,
    'city 5': 9995,
}

population = pd.Series(population_dict)
area = pd.Series(area_dict)

print(population)
print(area)

city 1    1001
city 2    1002
city 3    1003
city 4    1004
city 5    1005
dtype: int64
city 1    9991
city 2    9992
city 3    9993
city 4    9994
city 5    9995
dtype: int64


In [36]:
states = pd.DataFrame({
    'population': population,
    'area': area
})
print(states)

        population  area
city 1        1001  9991
city 2        1002  9992
city 3        1003  9993
city 4        1004  9994
city 5        1005  9995


In [38]:
print(states.values)
print(states.index)
print(states.columns)

[[1001 9991]
 [1002 9992]
 [1003 9993]
 [1004 9994]
 [1005 9995]]
Index(['city 1', 'city 2', 'city 3', 'city 4', 'city 5'], dtype='object')
Index(['population', 'area'], dtype='object')


In [40]:
print(type(states.values))
print(type(states.index))
print(type(states.columns))

<class 'numpy.ndarray'>
<class 'pandas.core.indexes.base.Index'>
<class 'pandas.core.indexes.base.Index'>


In [44]:
print(states['area'])

city 1    9991
city 2    9992
city 3    9993
city 4    9994
city 5    9995
Name: area, dtype: int64


DataFrame - способы создания:
1. Через объекты Series
2. Списки словарей
3. Словари объектов Series
4. Структурированный массив Numpy
5. Двумерный массив Numpy

#### Index
Способ организации ссылки на данные объектов Series и DataFrame. Index - неизменяем, упорядочен, является мультимножеством (могут быть повторяющиеся значения)

In [52]:
ind = pd.Index([2, 3, 5, 7, 11])
print(ind[1])
print(ind[::2])

3
Index([2, 5, 11], dtype='int64')


Index следует соглашениям объекта set() в python

In [57]:
indA = pd.Index([1, 2, 3, 4, 5])
indB = pd.Index([2, 3, 4, 5, 6])
print(indA.intersection(indB))

Index([2, 3, 4, 5], dtype='int64')


#### Выборка данных
##### Из series

In [66]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
print('a' in data)
print('z' in data)
print(data.keys())
print(list(data.items()))

True
False
Index(['a', 'b', 'c', 'd'], dtype='object')
[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]


In [68]:
data['a'] = 100
data['z'] = 1000
print(data)

a     100.00
b       0.50
c       0.75
d       1.00
z    1000.00
dtype: float64


На Series можно смотреть как на словарь и как на одномерный массив

In [79]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])
print(data['a':'c'])
print(data[0:2])
print(data[(data > 0.5) & (data < 1)])
print(data[['a', 'd']])

a    0.25
b    0.50
c    0.75
dtype: float64
a    0.25
b    0.50
dtype: float64
c    0.75
dtype: float64
a    0.25
d    1.00
dtype: float64


In [83]:
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[1, 3, 10, 15])
print(data[1])
#атрибуты-индексаторы
print(data.loc[1])
print(data.iloc[1])

0.25
0.25
0.5


In [104]:
population_dict = {
    'city 1': 1001,
    'city 2': 1002,
    'city 3': 1003,
    'city 4': 1004,
    'city 5': 1005,
}

area_dict = {
    'city 1': 9991,
    'city 2': 9992,
    'city 3': 9993,
    'city 4': 9994,
    'city 5': 9995,
}

population = pd.Series(population_dict)
area = pd.Series(area_dict)

data = pd.DataFrame({
    'population': population,
    'area': area
})
print(data)

        population  area
city 1        1001  9991
city 2        1002  9992
city 3        1003  9993
city 4        1004  9994
city 5        1005  9995


##### Из DataFrame

In [96]:
print(data['area'])
print(data.area)

city 1    9991
city 2    9992
city 3    9993
city 4    9994
city 5    9995
Name: area, dtype: int64
city 1    9991
city 2    9992
city 3    9993
city 4    9994
city 5    9995
Name: area, dtype: int64


In [102]:
data['new'] = data['area'] / data['population']
print(data)

        population  area       new
city 1        1001  9991  9.981019
city 2        1002  9992  9.972056
city 3        1003  9993  9.963111
city 4        1004  9994  9.954183
city 5        1005  9995  9.945274


In [108]:
population = pd.Series(population_dict)
area = pd.Series(area_dict)

data = pd.DataFrame({
    'population': population,
    'area': area
})
print(data)
print(data.values)
print(data.T)
print(data['area'])
print(data.values[0:3])

        population  area
city 1        1001  9991
city 2        1002  9992
city 3        1003  9993
city 4        1004  9994
city 5        1005  9995
[[1001 9991]
 [1002 9992]
 [1003 9993]
 [1004 9994]
 [1005 9995]]
            city 1  city 2  city 3  city 4  city 5
population    1001    1002    1003    1004    1005
area          9991    9992    9993    9994    9995
city 1    9991
city 2    9992
city 3    9993
city 4    9994
city 5    9995
Name: area, dtype: int64
[[1001 9991]
 [1002 9992]
 [1003 9993]]


In [124]:
#атрибуты-индексаторы
data['pop'] = data['population']
print(data)
print(data.iloc[:3, 1:2])
print(data.loc['city 1', 'area':'pop'])
print(data.loc[data['pop'] > 1002, ['area', 'pop']])

data.iloc[0, 2] = 999999
print(data)

        population  area   pop
city 1        1001  9991  1001
city 2        1002  9992  1002
city 3        1003  9993  1003
city 4        1004  9994  1004
city 5        1005  9995  1005
        area
city 1  9991
city 2  9992
city 3  9993
area    9991
pop     1001
Name: city 1, dtype: int64
        area   pop
city 3  9993  1003
city 4  9994  1004
city 5  9995  1005
        population  area     pop
city 1        1001  9991  999999
city 2        1002  9992    1002
city 3        1003  9993    1003
city 4        1004  9994    1004
city 5        1005  9995    1005


In [132]:
rng = np.random.default_rng()
s = pd.Series(rng.integers(0, 10, 4))
print(s)

0    5
1    1
2    3
3    2
dtype: int64


In [134]:
print(np.exp(s))

0    148.413159
1      2.718282
2     20.085537
3      7.389056
dtype: float64


In [140]:
population_dict = {
    'city 1': 1001,
    'city 2': 1002,
    'city 3': 1003,
    'city 41': 1004,
    'city 51': 1005,
}

area_dict = {
    'city 1': 9991,
    'city 2': 9992,
    'city 3': 9993,
    'city 42': 9994,
    'city 52': 9995,
}

population = pd.Series(population_dict)
area = pd.Series(area_dict)

data = pd.DataFrame({
    'population': population,
    'area': area
})

print(data) #объединение индексов, NaN = not a number

         population    area
city 1       1001.0  9991.0
city 2       1002.0  9992.0
city 3       1003.0  9993.0
city 41      1004.0     NaN
city 42         NaN  9994.0
city 51      1005.0     NaN
city 52         NaN  9995.0


In [146]:
dfA = pd.DataFrame(rng.integers(0, 10, (2, 2)), columns=['a', 'b'])
dfB = pd.DataFrame(rng.integers(0, 10, (3, 3)), columns=['a', 'b', 'c'])
print(dfA)
print(dfB)
print(dfA + dfB)

   a  b
0  2  8
1  6  6
   a  b  c
0  4  4  1
1  5  1  3
2  0  4  5
      a     b   c
0   6.0  12.0 NaN
1  11.0   7.0 NaN
2   NaN   NaN NaN


In [152]:
rng = np.random.default_rng(1)
A =rng.integers(0, 10, (3, 4))
print(A)
print(A[0])
print(A - A[0]) #транслирование

[[4 5 7 9]
 [0 1 8 9]
 [2 3 8 4]]
[4 5 7 9]
[[ 0  0  0  0]
 [-4 -4  1  0]
 [-2 -2  1 -5]]


In [154]:
df = pd.DataFrame(A, columns=['a', 'b', 'c', 'd'])
print(df)
print(df.iloc[0])
print(df - df.iloc[0])

   a  b  c  d
0  4  5  7  9
1  0  1  8  9
2  2  3  8  4
a    4
b    5
c    7
d    9
Name: 0, dtype: int64
   a  b  c  d
0  0  0  0  0
1 -4 -4  1  0
2 -2 -2  1 -5


In [156]:
print(df.iloc[0, ::2])
print(df - df.iloc[0, ::2])

a    4
c    7
Name: 0, dtype: int64
     a   b    c   d
0  0.0 NaN  0.0 NaN
1 -4.0 NaN  1.0 NaN
2 -2.0 NaN  1.0 NaN


#### NA - значения
NaN, null

Два способа хранения отсутствующих значений:
1. Индикаторы Nan, None
2. null

None - объект(накладные расходы). Не работает с sum, min

In [162]:
val1 = np.array([1, 2, None, 3])
print(np.sum(val1))

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

In [168]:
val1 = np.array([1, 2, np.nan, 3])
print(np.sum(val1))
print(np.nansum(val1))

nan
6.0


In [170]:
x = pd.Series(range(10), dtype=int)
print(x)
x[0] = None
x[1] = np.nan

print(x)

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: int64
0    NaN
1    NaN
2    2.0
3    3.0
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64


In [172]:
x1 = pd.Series(['a', 'b', 'c'])
x1[0] = None
x1[1] = np.nan
print(x1)

0    None
1     NaN
2       c
dtype: object


In [178]:
x2 = pd.Series([1, 2, 3, np.nan, None, pd.NA], dtype='Int32')
print(x2)

0       1
1       2
2       3
3    <NA>
4    <NA>
5    <NA>
dtype: Int32


In [182]:
print(x2[x2.notnull()])

0    1
1    2
2    3
dtype: Int32


In [188]:
print(x2.dropna())

0    1
1    2
2    3
dtype: Int32


In [196]:
df = pd.DataFrame(
    [
        [1, 2, 3, np.nan, None, pd.NA],
        [1, 2, 3, 4, 5, 6],
        [1, np.nan, 3, 4, np.nan, 6]
    ]
)
print(df)
print(df.dropna())
print(df.dropna(axis=0))
print(df.dropna(axis=1))

   0    1  2    3    4     5
0  1  2.0  3  NaN  NaN  <NA>
1  1  2.0  3  4.0  5.0     6
2  1  NaN  3  4.0  NaN     6
   0    1  2    3    4  5
1  1  2.0  3  4.0  5.0  6
   0    1  2    3    4  5
1  1  2.0  3  4.0  5.0  6
   0  2
0  1  3
1  1  3
2  1  3


In [204]:
print(df.dropna(axis=0, how='all')) #все значения NA
print(df.dropna(axis=0, how='any')) #хотя бы одно
print(df.dropna(axis=0, thresh=2)) #минимум x

   0    1  2    3    4     5
0  1  2.0  3  NaN  NaN  <NA>
1  1  2.0  3  4.0  5.0     6
2  1  NaN  3  4.0  NaN     6
   0    1  2    3    4  5
1  1  2.0  3  4.0  5.0  6
   0    1  2    3    4     5
0  1  2.0  3  NaN  NaN  <NA>
1  1  2.0  3  4.0  5.0     6
2  1  NaN  3  4.0  NaN     6
