## 3.1 Первое знакомство с Series 

In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.__version__

'1.3.4'

In [3]:
s1 = pd.Series([4,3,-7,5])
s1

0    4
1    3
2   -7
3    5
dtype: int64

In [4]:
s1.values

array([ 4,  3, -7,  5], dtype=int64)

In [13]:
# Inspect the dtype of the NumPy array backing the Series.
# `s2` is only created in a later cell, so use `s1` here: the cell then runs on
# a fresh kernel and still reports the same int64 dtype.
s1.values.dtype

dtype('int64')

In [5]:
s1.index

RangeIndex(start=0, stop=4, step=1)

In [31]:
s2 = pd.Series([4,3,-7,5], index=['a','b','c','d'])
s2

a    4
b    3
c   -7
d    5
dtype: int64

In [7]:
s2.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [10]:
s2['c']

-7

In [17]:
# Overwrite the third element BY POSITION.
# `s2` has string labels, so a bare integer key (`s2[2]`) only works through the
# positional fallback, which newer pandas versions deprecate in favour of
# treating integers as labels. `.iloc` states the positional intent explicitly.
s2.iloc[2] = 7
s2

a    4
b    3
c    7
d    5
dtype: int64

In [12]:
s2[['b','d']]

b    3
d    5
dtype: int64

## 3.3 Поподробнее про серии: индексы, сложение, проверка на NaN 

### индексы 

In [18]:
s2[s2>4]

c    7
d    5
dtype: int64

In [19]:
s2+100

a    104
b    103
c    107
d    105
dtype: int64

In [20]:
d = {'a': 1, 'b': 2, 'c': 3}

In [21]:
'a' in d

True

In [22]:
'a' in s2

True

In [27]:
d = {'Moscow': 1400, 'Murmansk': 900, 'Kazan': 200, 'Yakutsk': 400, 'Ufa': 300}
d

{'Moscow': 1400, 'Murmansk': 900, 'Kazan': 200, 'Yakutsk': 400, 'Ufa': 300}

In [28]:
s1 = pd.Series(d)
s1

Moscow      1400
Murmansk     900
Kazan        200
Yakutsk      400
Ufa          300
dtype: int64

In [30]:
s1['Kazan']

200

### проверка на NaN  

In [34]:
# Build a Series from the dict `d`, but keep only the labels listed in `index`
# (in that order). 'Vladivostok' is not a key of `d`, so its value is NaN and
# the dtype becomes float64; 'Kazan' is not listed, so it is left out.
s2 = pd.Series(d, index= ['Yakutsk', 'Ufa', 'Moscow', 'Murmansk', 'Vladivostok'])
s2

Yakutsk         400.0
Ufa             300.0
Moscow         1400.0
Murmansk        900.0
Vladivostok       NaN
dtype: float64

In [37]:
res = pd.isnull(s2)
res.values

array([False, False, False, False,  True])

In [39]:
s2.isnull()

Yakutsk        False
Ufa            False
Moscow         False
Murmansk       False
Vladivostok     True
dtype: bool

In [38]:
pd.notnull(s2)

Yakutsk         True
Ufa             True
Moscow          True
Murmansk        True
Vladivostok    False
dtype: bool

In [40]:
s2.notnull()

Yakutsk         True
Ufa             True
Moscow          True
Murmansk        True
Vladivostok    False
dtype: bool

### сложение
Важно запомнить, что при сложении серий:

1) не обязательно, чтобы они были одинаковой длины

2) сложение будет происходить по индексам

3) если в одной из серий есть индекс, которого нет в другой серии, то индекс добавляется в результирующую серию, но значение присваивается NaN

In [44]:
s1

Moscow      1400
Murmansk     900
Kazan        200
Yakutsk      400
Ufa          300
dtype: int64

In [45]:
s2

Yakutsk         400.0
Ufa             300.0
Moscow         1400.0
Murmansk        900.0
Vladivostok       NaN
dtype: float64

In [46]:
s1+s2

Kazan             NaN
Moscow         2800.0
Murmansk       1800.0
Ufa             600.0
Vladivostok       NaN
Yakutsk         800.0
dtype: float64

## 3.5 Первое знакомство с DataFrame 

In [48]:
s1 = pd.Series(['Ivan', 'Andrey', 'Igor'])

In [49]:
s1.name = 'user_names'

In [52]:
df = pd.DataFrame(s1)
df

Unnamed: 0,user_names
0,Ivan
1,Andrey
2,Igor


In [53]:
df['user_names']

0      Ivan
1    Andrey
2      Igor
Name: user_names, dtype: object

In [46]:
# Column-oriented input for a DataFrame: each key is a column name and each
# list holds that column's values (three years for each of the two cities).
data = {
    "city": ["Moscow"] * 3 + ["Kazan"] * 3,
    "year": [2022, 2021, 2020] * 2,
    "visits": [500, 2000, 1500, 100, 230, 200],
}
data

{'city': ['Moscow', 'Moscow', 'Moscow', 'Kazan', 'Kazan', 'Kazan'],
 'year': [2022, 2021, 2020, 2022, 2021, 2020],
 'visits': [500, 2000, 1500, 100, 230, 200]}

In [47]:
df = pd.DataFrame(data)
df

Unnamed: 0,city,year,visits
0,Moscow,2022,500
1,Moscow,2021,2000
2,Moscow,2020,1500
3,Kazan,2022,100
4,Kazan,2021,230
5,Kazan,2020,200


In [10]:
df2 = pd.DataFrame(data, columns=['visits','city','year1'], index = ['a','b','c','d','e','f'])
df2

Unnamed: 0,visits,city,year1
a,500,Moscow,
b,2000,Moscow,
c,1500,Moscow,
d,100,Kazan,
e,230,Kazan,
f,200,Kazan,


In [11]:
df.head()

Unnamed: 0,city,year,visits
0,Moscow,2022,500
1,Moscow,2021,2000
2,Moscow,2020,1500
3,Kazan,2022,100
4,Kazan,2021,230


In [12]:
df['old_visits'] = [1,2,3,4,5,6]
df

Unnamed: 0,city,year,visits,old_visits
0,Moscow,2022,500,1
1,Moscow,2021,2000,2
2,Moscow,2020,1500,3
3,Kazan,2022,100,4
4,Kazan,2021,230,5
5,Kazan,2020,200,6


In [13]:
s1 = pd.Series([1,10,20], index = [0,3,5])
s1

0     1
3    10
5    20
dtype: int64

In [14]:
# Assigning a Series to a column aligns on index labels, not on position:
# `s1` only has labels 0, 3 and 5, so rows 1, 2 and 4 receive NaN and the
# column is shown as float.
df['old_visits'] = s1
df

Unnamed: 0,city,year,visits,old_visits
0,Moscow,2022,500,1.0
1,Moscow,2021,2000,
2,Moscow,2020,1500,
3,Kazan,2022,100,10.0
4,Kazan,2021,230,
5,Kazan,2020,200,20.0


In [15]:
df[df['old_visits'].isnull()].index

Int64Index([1, 2, 4], dtype='int64')

In [16]:
df['is_correct'] = df['visits']<1000
df

Unnamed: 0,city,year,visits,old_visits,is_correct
0,Moscow,2022,500,1.0,True
1,Moscow,2021,2000,,False
2,Moscow,2020,1500,,False
3,Kazan,2022,100,10.0,True
4,Kazan,2021,230,,True
5,Kazan,2020,200,20.0,True


## 3.6 Поподробнее про DataFrame: индексы, вложенные словари, del и .T 

In [20]:
df['old_visits'] = np.arange(6)
df

Unnamed: 0,city,year,visits,old_visits,is_correct
0,Moscow,2022,500,0,True
1,Moscow,2021,2000,1,False
2,Moscow,2020,1500,2,False
3,Kazan,2022,100,3,True
4,Kazan,2021,230,4,True
5,Kazan,2020,200,5,True


### del

In [21]:
# Delete the column from the DataFrame itself (no copy is returned).
del df['is_correct']
df

Unnamed: 0,city,year,visits,old_visits
0,Moscow,2022,500,0
1,Moscow,2021,2000,1
2,Moscow,2020,1500,2
3,Kazan,2022,100,3
4,Kazan,2021,230,4
5,Kazan,2020,200,5


### вложенные словари 

In [23]:
# Nested dict: outer keys are used as column labels and inner keys as row
# labels when this is handed to pd.DataFrame.
data2 = {
    "Moscow": {2020: 1500, 2021: 2000, 2022: 500},
    "Kazan": {2020: 200, 2021: 230, 2022: 100},
}
data2

{'Moscow': {2020: 1500, 2021: 2000, 2022: 500},
 'Kazan': {2020: 200, 2021: 230, 2022: 100}}

In [24]:
df2 = pd.DataFrame(data2)
df2

Unnamed: 0,Moscow,Kazan
2020,1500,200
2021,2000,230
2022,500,100


### .T 

In [25]:
df2.T

Unnamed: 0,2020,2021,2022
Moscow,1500,2000,500
Kazan,200,230,100


In [26]:
df.values

array([['Moscow', 2022, 500, 0],
       ['Moscow', 2021, 2000, 1],
       ['Moscow', 2020, 1500, 2],
       ['Kazan', 2022, 100, 3],
       ['Kazan', 2021, 230, 4],
       ['Kazan', 2020, 200, 5]], dtype=object)

In [28]:
df.index

RangeIndex(start=0, stop=6, step=1)

In [30]:
df.columns

Index(['city', 'year', 'visits', 'old_visits'], dtype='object')

### индексы 

In [32]:
# NOTE(review): the output recorded for this cell is the a/b/c/d Series from
# section 3.1, but on a top-to-bottom run `s2` has since been rebound to the
# city Series with a NaN. Re-run the notebook in order or recreate the
# intended Series before displaying it.
s2

a    4
b    3
c   -7
d    5
dtype: int64

In [34]:
index = s1.index
index

Int64Index([0, 3, 5], dtype='int64')

In [35]:
# An Index cannot be modified element by element: item assignment raises
# TypeError. The error is the point of this cell, so it is caught and printed
# rather than left to abort a "Restart Kernel -> Run All".
try:
    index[0] = 'h'
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: Index does not support mutable operations

In [37]:
index2 = pd.Index(['A', 'B', 'C'])
index2

Index(['A', 'B', 'C'], dtype='object')

In [38]:
'A' in index2

True

In [39]:
s2 = pd.Series([1,2,3], index = index2)
s2

A    1
B    2
C    3
dtype: int64

## 3.8 Функции reindex, drop и индексация в датафрейме 

### reindex 

In [48]:
# NOTE(review): the recorded output has no `old_visits` column, although the
# cells above add it and only delete `is_correct`. The output appears to come
# from an out-of-order run; re-execute in order to refresh it.
df

Unnamed: 0,city,year,visits
0,Moscow,2022,500
1,Moscow,2021,2000
2,Moscow,2020,1500
3,Kazan,2022,100
4,Kazan,2021,230
5,Kazan,2020,200


In [50]:
df = df.reindex(['city', 'year'], axis = 1)
df

Unnamed: 0,city,year
0,Moscow,2022
1,Moscow,2021
2,Moscow,2020
3,Kazan,2022
4,Kazan,2021
5,Kazan,2020


In [66]:
df = df.reindex(['city', 'year', 'new_col'], axis = 1)
df

Unnamed: 0,city,year,new_col
0,Moscow,2022,
1,Moscow,2021,
2,Moscow,2020,
3,Kazan,2022,
4,Kazan,2021,
5,Kazan,2020,


In [54]:
df.reindex([0,4,5])  # axis=0 (row labels) is the default

Unnamed: 0,city,year,new_col
0,Moscow,2022,
4,Kazan,2021,
5,Kazan,2020,


In [56]:
df.drop(3)  # returns a new DataFrame; assign the result to keep it, df itself is unchanged

Unnamed: 0,city,year,new_col
0,Moscow,2022,
1,Moscow,2021,
2,Moscow,2020,
4,Kazan,2021,
5,Kazan,2020,


In [57]:
df.drop([0,2])  # axis=0 (row labels) is the default

Unnamed: 0,city,year,new_col
1,Moscow,2021,
3,Kazan,2022,
4,Kazan,2021,
5,Kazan,2020,


In [67]:
# inplace=True removes the column from `df` itself instead of returning a
# modified copy, which is why nothing is displayed for this cell.
df.drop('new_col', axis = 1, inplace = True)

In [68]:
df

Unnamed: 0,city,year
0,Moscow,2022
1,Moscow,2021
2,Moscow,2020
3,Kazan,2022
4,Kazan,2021
5,Kazan,2020


### индексация в датафрейме 

In [82]:
# 4x4 frame holding 0..15 row by row, labelled with city names (rows) and
# col_1..col_4 (columns); used below to demonstrate DataFrame indexing.
df = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=["Moscow", "Vladivostok", "Ufa", "Kazan"],
    columns=["col_1", "col_2", "col_3", "col_4"],
)
df

Unnamed: 0,col_1,col_2,col_3,col_4
Moscow,0,1,2,3
Vladivostok,4,5,6,7
Ufa,8,9,10,11
Kazan,12,13,14,15


In [None]:


df['col_2']

In [74]:
df[['col_1', 'col_2']]

Unnamed: 0,col_1,col_2
Moscow,0,1
Vladivostok,4,5
Ufa,8,9
Kazan,12,13


In [75]:
# Plain [] with a scalar looks up a COLUMN label. This frame has no column
# named 0, so the lookup raises KeyError. The error is the demonstration, so it
# is caught and printed rather than left to abort a "Restart Kernel -> Run All".
try:
    df[0]
except KeyError as err:
    print(f"{type(err).__name__}: {err}")

KeyError: 0

In [76]:
df[:2]

Unnamed: 0,col_1,col_2,col_3,col_4
Moscow,0,1,2,3
Vladivostok,4,5,6,7


In [77]:
df[df['col_3']>6]

Unnamed: 0,col_1,col_2,col_3,col_4
Ufa,8,9,10,11
Kazan,12,13,14,15


In [78]:
df[df<10] = 0 
df

Unnamed: 0,col_1,col_2,col_3,col_4
Moscow,0,0,0,0
Vladivostok,0,0,0,0
Ufa,0,0,10,11
Kazan,12,13,14,15


## 3.10 Операторы loc, iloc, at, iat. Сложение нескольких датафреймов
 

In [2]:
# Row-oriented records: one inner list per user, in the column order given below.
data = [
    ["Ivan", 25, 4, 50, 1],
    ["Petr", 40, 9, 250, 8],
    ["Nikolay", 19, 12, 25, 1],
    ["Sergey", 33, 6, 115, 6],
    ["Andrey", 38, 2, 152, 4],
    ["Ilya", 20, 18, 15, 2],
    ["Igor", 19, 2, 10, 1],
]

# Rows are labelled 'a'..'g' so that label-based (.loc/.at) and position-based
# (.iloc/.iat) access can be told apart in the cells that follow.
_df = pd.DataFrame(
    data,
    columns=["name", "age", "clicks", "balance", "history"],
    index=list("abcdefg"),
)

In [3]:
_df['age']['c']

19

In [6]:
_df[['age', 'name']][2:3]

Unnamed: 0,age,name
c,19,Nikolay


### .loc 

In [9]:
_df.loc['e']

name       Andrey
age            38
clicks          2
balance       152
history         4
Name: e, dtype: object

In [11]:
_df.loc[['e', 'g'], 'name']

e    Andrey
g      Igor
Name: name, dtype: object

In [13]:
_df.loc[['e', 'g'], ['name', 'balance']]

Unnamed: 0,name,balance
e,Andrey,152
g,Igor,10


In [15]:
_df.loc[:, 'age']

a    25
b    40
c    19
d    33
e    38
f    20
g    19
Name: age, dtype: int64

In [18]:
_df.loc[_df['age'] > 30, ['name', 'age']]

Unnamed: 0,name,age
b,Petr,40
d,Sergey,33
e,Andrey,38


In [20]:
_df.loc['a' : 'd', ]

Unnamed: 0,name,age,clicks,balance,history
a,Ivan,25,4,50,1
b,Petr,40,9,250,8
c,Nikolay,19,12,25,1
d,Sergey,33,6,115,6


In [31]:
# Assign one scalar to a whole block of cells selected by row and column labels.
# The assignment is done on a copy: the cells below still expect the original
# names in `_df` (e.g. `_df.loc['a'] + _df.loc['d']` concatenates 'Ivan' and
# 'Sergey'), so mutating `_df` here would break a top-to-bottom run.
_df_masked = _df.copy()
_df_masked.loc[['a', 'c'], ['name', 'history']] = -1
_df_masked

Unnamed: 0,name,age,clicks,balance,history
a,-1,25,4,50,-1
b,Petr,40,9,250,8
c,-1,19,12,25,-1
d,Sergey,33,6,115,6
e,Andrey,38,2,152,4
f,Ilya,20,18,15,2
g,Igor,19,2,10,1


### .iloc 

In [21]:
_df.iloc[0]

name       Ivan
age          25
clicks        4
balance      50
history       1
Name: a, dtype: object

In [24]:
_df.iloc[[0, 3], [0, 3]]

Unnamed: 0,name,balance
a,Ivan,50
d,Sergey,115


### .at 

In [25]:
_df['name']['d']

'Sergey'

In [27]:
_df.at['d', 'name']

'Sergey'

In [29]:
_df.at['b', 'age'] = 40
_df

Unnamed: 0,name,age,clicks,balance,history
a,Ivan,25,4,50,1
b,Petr,40,9,250,8
c,Nikolay,19,12,25,1
d,Sergey,33,6,115,6
e,Andrey,38,2,152,4
f,Ilya,20,18,15,2
g,Igor,19,2,10,1


In [43]:
_df.loc['a'] + _df.loc['d']

name       IvanSergey
age                58
clicks             10
balance           165
history             7
dtype: object

### .iat 

In [28]:
_df.iat[3, 0]

'Sergey'

### Сложение датафреймов

In [37]:
# Two frames whose row and column labels only partly overlap: they share the
# rows Moscow/Kazan and the columns b/d, which is what the addition below
# aligns on.
df1 = pd.DataFrame(
    data=np.arange(9).reshape(3, 3),
    index=["Moscow", "Kazan", "Vladivostok"],
    columns=["b", "c", "d"],
)
df2 = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=["Yakutsk", "Moscow", "Kazan", "Ufa"],
    columns=["b", "d", "e"],
)

In [38]:
df1

Unnamed: 0,b,c,d
Moscow,0,1,2
Kazan,3,4,5
Vladivostok,6,7,8


In [39]:
df2

Unnamed: 0,b,d,e
Yakutsk,0,1,2
Moscow,3,4,5
Kazan,6,7,8
Ufa,9,10,11


In [40]:
df1 + df2

Unnamed: 0,b,c,d,e
Kazan,9.0,,12.0,
Moscow,3.0,,6.0,
Ufa,,,,
Vladivostok,,,,
Yakutsk,,,,


## 3.11 Сложение датафреймов, сортировки, арифметика с пропусками

### df + df with np.nan 

In [4]:
# Same row labels in both frames; df2 has a missing value for Vladivostok.
df1 = pd.DataFrame(
    data={"visits": [100, 200, 300]},
    index=["Kazan", "Vladivostok", "Moscow"],
)
df2 = pd.DataFrame(
    data={"visits": [400, np.nan, 230]},
    index=["Kazan", "Vladivostok", "Moscow"],
)


In [5]:
df1

Unnamed: 0,visits
Kazan,100
Vladivostok,200
Moscow,300


In [6]:
df2

Unnamed: 0,visits
Kazan,400.0
Vladivostok,
Moscow,230.0


In [7]:
df1 + df2  # Vladivostok's value is lost: df2 holds NaN there, so the sum is NaN

Unnamed: 0,visits
Kazan,500.0
Vladivostok,
Moscow,530.0


In [8]:
# fill_value=0 stands in for the missing operand, so Vladivostok keeps the
# 200 from df1 instead of turning into NaN.
df1.add(df2, fill_value = 0)

Unnamed: 0,visits
Kazan,500.0
Vladivostok,200.0
Moscow,530.0


### df + s 

In [9]:
# Frame with two numeric columns 'a' and 'b', one row per city; used below to
# show how a DataFrame and a Series are added.
df1 = pd.DataFrame(
    data={"a": [100, 200, 300], "b": [1, 2, 3]},
    index=["Kazan", "Vladivostok", "Moscow"],
)


In [10]:
df1

Unnamed: 0,a,b
Kazan,100,1
Vladivostok,200,2
Moscow,300,3


In [11]:
s1 = pd.Series([1, 2], index = ['a', 'b'])
s1

a    1
b    2
dtype: int64

In [12]:
df1 + s1

Unnamed: 0,a,b
Kazan,101,3
Vladivostok,201,4
Moscow,301,5


In [14]:
s2 = pd.Series([1], index = ['a'])
s2

a    1
dtype: int64

In [15]:
# `s2` only has the label 'a', so column 'b' has nothing to be added to and
# comes out as NaN for every row.
df1 + s2

Unnamed: 0,a,b
Kazan,101.0,
Vladivostok,201.0,
Moscow,301.0,


In [16]:
s3 = pd.Series([1, 2], index = ['Kazan', 'Moscow'])
s3

Kazan     1
Moscow    2
dtype: int64

In [17]:
# With `+`, the Series labels are matched against the DataFrame's COLUMNS.
# 'Kazan'/'Moscow' are not columns of df1, so the result is the union of
# labels, all NaN.
df1 + s3

Unnamed: 0,Kazan,Moscow,a,b
Kazan,,,,
Vladivostok,,,,
Moscow,,,,


In [21]:
# axis=0 matches the Series labels against the ROW index instead: Kazan and
# Moscow are summed, while Vladivostok (absent from s3) becomes NaN.
df1.add(s3, axis = 0)

Unnamed: 0,a,b
Kazan,101.0,2.0
Moscow,302.0,5.0
Vladivostok,,


### сортировка значений 

In [22]:
s1 = pd.Series([1,-2,3,-4], index = ['c', 'a', 'b', 'd'])
s1

c    1
a   -2
b    3
d   -4
dtype: int64

In [23]:
s1.sort_index()

a   -2
b    3
c    1
d   -4
dtype: int64

In [24]:
s1.sort_index(ascending = False)

d   -4
c    1
b    3
a   -2
dtype: int64

In [25]:
s1.sort_values()

d   -4
a   -2
c    1
b    3
dtype: int64

In [28]:
s1.sort_values(ascending = False, inplace = True)
s1

b    3
c    1
a   -2
d   -4
dtype: int64

In [34]:
# 4x3 frame holding 0..11 with deliberately unsorted row and column labels,
# so the sorting calls below have something to reorder.
df = pd.DataFrame(
    data=np.arange(12).reshape(4, 3),
    index=["a", "d", "c", "e"],
    columns=["e", "b", "d"],
)
df

Unnamed: 0,e,b,d
a,0,1,2
d,3,4,5
c,6,7,8
e,9,10,11


In [36]:
df.sort_index()  # axis=0: rows are ordered by their index labels

Unnamed: 0,e,b,d
a,0,1,2
c,6,7,8
d,3,4,5
e,9,10,11


In [41]:
df.sort_index(axis = 1)

Unnamed: 0,b,d,e
a,1,2,0
d,4,5,3
c,7,8,6
e,10,11,9


In [42]:
df.sort_values(by = 'e', ascending = False)

Unnamed: 0,e,b,d
e,9,10,11
c,6,7,8
d,3,4,5
a,0,1,2


In [44]:
s1.index.is_unique

True