In [111]:
import pandas as pd
import numpy as np

#### Создание Series 

In [112]:
a = [2, 5, 7, 8, 3]

In [113]:
b = pd.Series(a)

In [114]:
b

0    2
1    5
2    7
3    8
4    3
dtype: int64

In [115]:
b = pd.Series(a, index = ['a', 'b', 'c', 'd', 'e'])
b

a    2
b    5
c    7
d    8
e    3
dtype: int64

In [116]:
from datetime import date

In [117]:
ind = [date(y, m, d) for y, m, d in [(2018, 1, 15),(2018, 1, 18),(2018, 1, 20),(2018, 1, 25),(2018, 1, 28)]]

In [118]:
b = pd.Series(a, index = ind)
b

2018-01-15    2
2018-01-18    5
2018-01-20    7
2018-01-25    8
2018-01-28    3
dtype: int64

In [119]:
b.index

Index([2018-01-15, 2018-01-18, 2018-01-20, 2018-01-25, 2018-01-28], dtype='object')

In [120]:
b.index[0].year

2018

In [121]:
b.index[0].month

1

In [122]:
b.index[0].day

15

In [123]:
b.index = pd.to_datetime(b.index, format = '%Y-%m-%d')

In [124]:
b.index

DatetimeIndex(['2018-01-15', '2018-01-18', '2018-01-20', '2018-01-25',
               '2018-01-28'],
              dtype='datetime64[ns]', freq=None)

In [125]:
b.index.year

Int64Index([2018, 2018, 2018, 2018, 2018], dtype='int64')

In [126]:
b.index.month

Int64Index([1, 1, 1, 1, 1], dtype='int64')

In [127]:
b.index.day

Int64Index([15, 18, 20, 25, 28], dtype='int64')

In [128]:
# Индексы могут быть неуникальными
b = pd.Series(a, index = [0, 1, 0, 2, 1])
b

0    2
1    5
0    7
2    8
1    3
dtype: int64

In [129]:
# Можно поменять индексы
b.index = [10, 11, 12, 13, 14]
b

10    2
11    5
12    7
13    8
14    3
dtype: int64

In [130]:
# Задать тип данных
b = pd.Series(a, dtype = np.float64)
b

0    2.0
1    5.0
2    7.0
3    8.0
4    3.0
dtype: float64

In [131]:
# Или поменять тип дынных позде с помощью astype
b = pd.Series(a)
b = b.astype(np.float64)
b

0    2.0
1    5.0
2    7.0
3    8.0
4    3.0
dtype: float64

In [132]:
# Создать Series из словаря
d = {'1st':'a', '2nd':'b', '3rd':'c', '4th':'d', '5th':'e'}
b = pd.Series(d)
b

1st    a
2nd    b
3rd    c
4th    d
5th    e
dtype: object

#### Просмотр данных

In [133]:
b = pd.Series([5, 2, 6, 1, 6, 8, 7])

In [134]:
# Просмотреть индексы
b.index

RangeIndex(start=0, stop=7, step=1)

In [135]:
# Просмотреть данные
b.values

array([5, 2, 6, 1, 6, 8, 7], dtype=int64)

In [136]:
b

0    5
1    2
2    6
3    1
4    6
5    8
6    7
dtype: int64

In [137]:
b[0]

5

In [138]:
b[2]

6

In [139]:
b[[0, 6]]

0    5
6    7
dtype: int64

In [140]:
b.head()

0    5
1    2
2    6
3    1
4    6
dtype: int64

In [141]:
b.head(3)

0    5
1    2
2    6
dtype: int64

In [142]:
b.tail()

2    6
3    1
4    6
5    8
6    7
dtype: int64

In [143]:
b.tail(1)

6    7
dtype: int64

In [144]:
# Выборка по условию
b

0    5
1    2
2    6
3    1
4    6
5    8
6    7
dtype: int64

In [145]:
b[b > 5]

2    6
4    6
5    8
6    7
dtype: int64

In [146]:
b[(b == 7) | (b % 3 == 0)]

2    6
4    6
6    7
dtype: int64

#### Изменение элементов

In [147]:
b = pd.Series([5, 2, 6, 1, 6, 8, 7])

In [148]:
b

0    5
1    2
2    6
3    1
4    6
5    8
6    7
dtype: int64

In [149]:
b[0] = 4

In [150]:
b

0    4
1    2
2    6
3    1
4    6
5    8
6    7
dtype: int64

In [151]:
b[b < 5] = 0

In [152]:
b

0    0
1    0
2    6
3    0
4    6
5    8
6    7
dtype: int64

In [153]:
b[[0, 1, 2]] = 1

In [154]:
b

0    1
1    1
2    1
3    0
4    6
5    8
6    7
dtype: int64

#### Добавление данных

In [155]:
b = b.append(pd.Series({6:10, 7:15, 8:11, 9:14}))
b

0     1
1     1
2     1
3     0
4     6
5     8
6     7
6    10
7    15
8    11
9    14
dtype: int64

#### Удаление данных по индексу

In [156]:
b = b.drop([0, 1, 2])
b

3     0
4     6
5     8
6     7
6    10
7    15
8    11
9    14
dtype: int64

#### Запись и чтение данных из файла

In [157]:
b.to_pickle('b.pkl')

In [158]:
b2 = pd.read_pickle('b.pkl')
b2

3     0
4     6
5     8
6     7
6    10
7    15
8    11
9    14
dtype: int64

#### Создание DataFrame 

In [159]:
df = pd.DataFrame({'Col1':['a','b','c','d','e','f','g', 'h'],
                   'Col2':[1, 3, 5, 7, 9, 11, 13, 15]}, columns = ['Col1', 'Col2'])

In [160]:
df

Unnamed: 0,Col1,Col2
0,a,1
1,b,3
2,c,5
3,d,7
4,e,9
5,f,11
6,g,13
7,h,15


#### Просмотр информации о DataFrame 

In [161]:
df.shape

(8, 2)

In [162]:
df.columns

Index(['Col1', 'Col2'], dtype='object')

In [163]:
df.index

RangeIndex(start=0, stop=8, step=1)

In [164]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 2 columns):
Col1    8 non-null object
Col2    8 non-null int64
dtypes: int64(1), object(1)
memory usage: 208.0+ bytes


In [165]:
df.describe()

Unnamed: 0,Col2
count,8.0
mean,8.0
std,4.898979
min,1.0
25%,4.5
50%,8.0
75%,11.5
max,15.0


In [166]:
df.head()

Unnamed: 0,Col1,Col2
0,a,1
1,b,3
2,c,5
3,d,7
4,e,9


In [167]:
df.head(2)

Unnamed: 0,Col1,Col2
0,a,1
1,b,3


In [168]:
df['Col1'].head()

0    a
1    b
2    c
3    d
4    e
Name: Col1, dtype: object

In [169]:
df.Col1.head()

0    a
1    b
2    c
3    d
4    e
Name: Col1, dtype: object

In [170]:
df.tail()

Unnamed: 0,Col1,Col2
3,d,7
4,e,9
5,f,11
6,g,13
7,h,15


#### Изменение индекса 

In [171]:
df.index = [2, 4, 6, 8, 10, 12, 14, 16]

In [172]:
df

Unnamed: 0,Col1,Col2
2,a,1
4,b,3
6,c,5
8,d,7
10,e,9
12,f,11
14,g,13
16,h,15


#### Выбор данных по индексу 

In [173]:
df.loc[2, 'Col1']

'a'

In [174]:
df.loc[2, :]

Col1    a
Col2    1
Name: 2, dtype: object

In [175]:
df.loc[2:4, :]

Unnamed: 0,Col1,Col2
2,a,1
4,b,3


#### Выбор данных по позиции 

In [176]:
df.iloc[0, :]

Col1    a
Col2    1
Name: 2, dtype: object

In [177]:
df.iloc[0:2, :]

Unnamed: 0,Col1,Col2
2,a,1
4,b,3


In [178]:
df.iloc[0:2, 0]

2    a
4    b
Name: Col1, dtype: object

#### Выбор по условию 

In [179]:
df.loc[df['Col1'] == 'b', :]

Unnamed: 0,Col1,Col2
4,b,3


In [180]:
df.loc[df['Col1'] == 'b', 'Col2']

4    3
Name: Col2, dtype: int64

In [181]:
df.loc[df['Col1'] == 'b', 'Col2'].values

array([3], dtype=int64)

In [182]:
df.loc[df['Col2'] > 10, 'Col1']

12    f
14    g
16    h
Name: Col1, dtype: object

In [183]:
df.loc[(df['Col2'] > 10) & (df['Col1'] != 'g'), :]

Unnamed: 0,Col1,Col2
12,f,11
16,h,15


In [184]:
df.loc[df['Col2'].between(11, 13), :]

Unnamed: 0,Col1,Col2
12,f,11
14,g,13


In [185]:
df.loc[df['Col1'].isin(['a', 'b', 'c', 'd', 'e']), :]

Unnamed: 0,Col1,Col2
2,a,1
4,b,3
6,c,5
8,d,7
10,e,9


In [186]:
df.loc[~df['Col1'].isin(['a', 'b', 'c', 'd', 'e']), :]

Unnamed: 0,Col1,Col2
12,f,11
14,g,13
16,h,15


In [187]:
df

Unnamed: 0,Col1,Col2
2,a,1
4,b,3
6,c,5
8,d,7
10,e,9
12,f,11
14,g,13
16,h,15


In [188]:
df.query('Col1 == "b"')

Unnamed: 0,Col1,Col2
4,b,3


In [189]:
df.query('Col2 > 10')

Unnamed: 0,Col1,Col2
12,f,11
14,g,13
16,h,15


#### Столбец DataFrame в виде Series 

In [190]:
s = df['Col1']
s

2     a
4     b
6     c
8     d
10    e
12    f
14    g
16    h
Name: Col1, dtype: object

In [191]:
s.index

Int64Index([2, 4, 6, 8, 10, 12, 14, 16], dtype='int64')

In [192]:
s.values

array(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h'], dtype=object)

In [193]:
type(s)

pandas.core.series.Series

#### Получение DataFrame из Series 

In [194]:
df2 = pd.DataFrame(s)
df2

Unnamed: 0,Col1
2,a
4,b
6,c
8,d
10,e
12,f
14,g
16,h


#### Копирование DataFrame 

In [195]:
df_copy = df.copy()
df_copy

Unnamed: 0,Col1,Col2
2,a,1
4,b,3
6,c,5
8,d,7
10,e,9
12,f,11
14,g,13
16,h,15


#### Случайный выбор n-го количества строк 

In [196]:
df.sample(n = 2)

Unnamed: 0,Col1,Col2
16,h,15
12,f,11


#### Случайный выбор доли от исходного DataFrame 

In [197]:
df.sample(frac = 0.5)

Unnamed: 0,Col1,Col2
14,g,13
2,a,1
10,e,9
12,f,11


#### Случайный выбор с возвращением (строки могут повторяться) 

In [198]:
df.sample(frac = 0.5, replace = True)

Unnamed: 0,Col1,Col2
2,a,1
6,c,5
10,e,9
12,f,11


In [199]:
df.sample(frac = 1, random_state = 42)

Unnamed: 0,Col1,Col2
4,b,3
12,f,11
2,a,1
16,h,15
6,c,5
10,e,9
8,d,7
14,g,13


#### Запись и чтение DataFrame в csv 

In [200]:
df.to_csv('Test.scv', sep = ';', index = False)

In [201]:
df_new = pd.read_csv('Test.scv', sep = ';')

In [202]:
df_new

Unnamed: 0,Col1,Col2
0,a,1
1,b,3
2,c,5
3,d,7
4,e,9
5,f,11
6,g,13
7,h,15


####  Слияние данных

In [203]:
authors = pd.DataFrame({'author_id': [1, 2, 3], 'author_name': ['Pushkin','Tolstoy','Dostoevsky']}, columns = ['author_id', 'author_name'])

In [204]:
authors

Unnamed: 0,author_id,author_name
0,1,Pushkin
1,2,Tolstoy
2,3,Dostoevsky


In [205]:
books = pd.DataFrame({'author_id': [2, 3, 3, 4], 'book_title': ['War and Pease', 'The Idiot', 'Crime and Punishment', 'Fathers and Sons']})

In [206]:
books

Unnamed: 0,author_id,book_title
0,2,War and Pease
1,3,The Idiot
2,3,Crime and Punishment
3,4,Fathers and Sons


In [207]:
df1 = pd.merge(authors, books, on = 'author_id', how = 'inner')
df1

Unnamed: 0,author_id,author_name,book_title
0,2,Tolstoy,War and Pease
1,3,Dostoevsky,The Idiot
2,3,Dostoevsky,Crime and Punishment


In [208]:
df2 = pd.merge(authors, books, on = 'author_id', how = 'left')
df2

Unnamed: 0,author_id,author_name,book_title
0,1,Pushkin,
1,2,Tolstoy,War and Pease
2,3,Dostoevsky,The Idiot
3,3,Dostoevsky,Crime and Punishment


In [209]:
df3 = pd.merge(authors, books, on = 'author_id', how = 'right')
df3

Unnamed: 0,author_id,author_name,book_title
0,2,Tolstoy,War and Pease
1,3,Dostoevsky,The Idiot
2,3,Dostoevsky,Crime and Punishment
3,4,,Fathers and Sons


In [210]:
df4 = pd.merge(authors, books, on = 'author_id', how = 'outer')
df4

Unnamed: 0,author_id,author_name,book_title
0,1,Pushkin,
1,2,Tolstoy,War and Pease
2,3,Dostoevsky,The Idiot
3,3,Dostoevsky,Crime and Punishment
4,4,,Fathers and Sons


#### Работа с пропущенными данными 

In [211]:
df4.loc[df4['book_title'].isnull(), :]

Unnamed: 0,author_id,author_name,book_title
0,1,Pushkin,


In [212]:
df4.loc[df4['author_name'].notnull(), :]

Unnamed: 0,author_id,author_name,book_title
0,1,Pushkin,
1,2,Tolstoy,War and Pease
2,3,Dostoevsky,The Idiot
3,3,Dostoevsky,Crime and Punishment


In [213]:
df4['book_title'] = df4['book_title'].fillna('unknown')
df4

Unnamed: 0,author_id,author_name,book_title
0,1,Pushkin,unknown
1,2,Tolstoy,War and Pease
2,3,Dostoevsky,The Idiot
3,3,Dostoevsky,Crime and Punishment
4,4,,Fathers and Sons


In [214]:
df4['author_name'].fillna('unknown', inplace = True)
df4

Unnamed: 0,author_id,author_name,book_title
0,1,Pushkin,unknown
1,2,Tolstoy,War and Pease
2,3,Dostoevsky,The Idiot
3,3,Dostoevsky,Crime and Punishment
4,4,unknown,Fathers and Sons


#### Добавление столбцов 

In [215]:
df4.loc[(df4['author_name'] != 'unknown') & (df4['book_title'] != 'book_title'), 'quantity'] = 1
df4

Unnamed: 0,author_id,author_name,book_title,quantity
0,1,Pushkin,unknown,1.0
1,2,Tolstoy,War and Pease,1.0
2,3,Dostoevsky,The Idiot,1.0
3,3,Dostoevsky,Crime and Punishment,1.0
4,4,unknown,Fathers and Sons,


In [216]:
df4['quantity'].fillna(0, inplace = True)
df4

Unnamed: 0,author_id,author_name,book_title,quantity
0,1,Pushkin,unknown,1.0
1,2,Tolstoy,War and Pease,1.0
2,3,Dostoevsky,The Idiot,1.0
3,3,Dostoevsky,Crime and Punishment,1.0
4,4,unknown,Fathers and Sons,0.0


In [217]:
df4['quantity'] = df4['quantity'].astype(int)
df4

Unnamed: 0,author_id,author_name,book_title,quantity
0,1,Pushkin,unknown,1
1,2,Tolstoy,War and Pease,1
2,3,Dostoevsky,The Idiot,1
3,3,Dostoevsky,Crime and Punishment,1
4,4,unknown,Fathers and Sons,0


In [218]:
df4.set_index('author_id', inplace = True)
df4

Unnamed: 0_level_0,author_name,book_title,quantity
author_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Pushkin,unknown,1
2,Tolstoy,War and Pease,1
3,Dostoevsky,The Idiot,1
3,Dostoevsky,Crime and Punishment,1
4,unknown,Fathers and Sons,0


In [219]:
df4.reset_index(inplace = True)
df4

Unnamed: 0,author_id,author_name,book_title,quantity
0,1,Pushkin,unknown,1
1,2,Tolstoy,War and Pease,1
2,3,Dostoevsky,The Idiot,1
3,3,Dostoevsky,Crime and Punishment,1
4,4,unknown,Fathers and Sons,0


#### Удаление данных 

In [220]:
df4['price'] = 500
df4

Unnamed: 0,author_id,author_name,book_title,quantity,price
0,1,Pushkin,unknown,1,500
1,2,Tolstoy,War and Pease,1,500
2,3,Dostoevsky,The Idiot,1,500
3,3,Dostoevsky,Crime and Punishment,1,500
4,4,unknown,Fathers and Sons,0,500


In [221]:
# Удаление столбца
df4 = df4.drop('price', axis = 1)
df4

Unnamed: 0,author_id,author_name,book_title,quantity
0,1,Pushkin,unknown,1
1,2,Tolstoy,War and Pease,1
2,3,Dostoevsky,The Idiot,1
3,3,Dostoevsky,Crime and Punishment,1
4,4,unknown,Fathers and Sons,0


In [222]:
# Удаление строки
df4 = df4.drop(1, axis = 0)
df4

Unnamed: 0,author_id,author_name,book_title,quantity
0,1,Pushkin,unknown,1
2,3,Dostoevsky,The Idiot,1
3,3,Dostoevsky,Crime and Punishment,1
4,4,unknown,Fathers and Sons,0


#### Сортировка 

In [224]:
# Добавим удаленную строку
df4 = df4.append(pd.DataFrame({'author_id':[2], 
                               'author_name': ['Tolstoy'],
                               'book_title': ['War and Pease'],
                               'quantity': [1]},
                               columns = ['author_id','author_name','book_title','quantity']),
                               ignore_index = True)
df4

Unnamed: 0,author_id,author_name,book_title,quantity
0,1,Pushkin,unknown,1
1,3,Dostoevsky,The Idiot,1
2,3,Dostoevsky,Crime and Punishment,1
3,4,unknown,Fathers and Sons,0
4,2,Tolstoy,War and Pease,1


In [225]:
# Отсортируем по столбцу author_id
df4 = df4.sort_values(by = 'author_id')
df4

Unnamed: 0,author_id,author_name,book_title,quantity
0,1,Pushkin,unknown,1
4,2,Tolstoy,War and Pease,1
1,3,Dostoevsky,The Idiot,1
2,3,Dostoevsky,Crime and Punishment,1
3,4,unknown,Fathers and Sons,0


In [226]:
# Переустановим индекс
df4 = df4.reset_index(drop = True)
df4

Unnamed: 0,author_id,author_name,book_title,quantity
0,1,Pushkin,unknown,1
1,2,Tolstoy,War and Pease,1
2,3,Dostoevsky,The Idiot,1
3,3,Dostoevsky,Crime and Punishment,1
4,4,unknown,Fathers and Sons,0


#### Соединение датафреймов 

In [227]:
df5 = pd.DataFrame({'author_id':[3, 5], 'author_name':['Dostoevsky', 'Chekhov'], 
                    'book_title':['The Gambler', 'Three Sisters'], 'quantity': [2, 3]}, 
                  columns = ['author_id','author_name','book_title','quantity'])
df5

Unnamed: 0,author_id,author_name,book_title,quantity
0,3,Dostoevsky,The Gambler,2
1,5,Chekhov,Three Sisters,3


In [228]:
df6 = pd.concat([df4, df5], axis=0, ignore_index = True)
df6

Unnamed: 0,author_id,author_name,book_title,quantity
0,1,Pushkin,unknown,1
1,2,Tolstoy,War and Pease,1
2,3,Dostoevsky,The Idiot,1
3,3,Dostoevsky,Crime and Punishment,1
4,4,unknown,Fathers and Sons,0
5,3,Dostoevsky,The Gambler,2
6,5,Chekhov,Three Sisters,3


In [229]:
prices = pd.DataFrame({'price': [600, 500, 450, 550, 700]}, columns = ['price'], index = [1, 2, 3, 5, 6])
prices

Unnamed: 0,price
1,600
2,500
3,450
5,550
6,700


In [230]:
df7 = pd.concat([df6, prices], axis=1)
df7

Unnamed: 0,author_id,author_name,book_title,quantity,price
0,1,Pushkin,unknown,1,
1,2,Tolstoy,War and Pease,1,600.0
2,3,Dostoevsky,The Idiot,1,500.0
3,3,Dostoevsky,Crime and Punishment,1,450.0
4,4,unknown,Fathers and Sons,0,
5,3,Dostoevsky,The Gambler,2,550.0
6,5,Chekhov,Three Sisters,3,700.0


#### Применение функций и методов

In [232]:
df7['total'] = df7['quantity'] * df7['price']
df7

Unnamed: 0,author_id,author_name,book_title,quantity,price,total
0,1,Pushkin,unknown,1,,
1,2,Tolstoy,War and Pease,1,600.0,600.0
2,3,Dostoevsky,The Idiot,1,500.0,500.0
3,3,Dostoevsky,Crime and Punishment,1,450.0,450.0
4,4,unknown,Fathers and Sons,0,,
5,3,Dostoevsky,The Gambler,2,550.0,1100.0
6,5,Chekhov,Three Sisters,3,700.0,2100.0


In [233]:
df7['price'].max()

700.0

In [234]:
df7['price'].min()

450.0

In [235]:
df7['price'].mean()

560.0

In [236]:
df7['price'].median()

550.0

In [237]:
df7['price'].std()

96.17692030835673

In [238]:
df7['price'].var()

9250.0

In [240]:
df7.nlargest(3, 'price')

Unnamed: 0,author_id,author_name,book_title,quantity,price,total
6,5,Chekhov,Three Sisters,3,700.0,2100.0
1,2,Tolstoy,War and Pease,1,600.0,600.0
5,3,Dostoevsky,The Gambler,2,550.0,1100.0


In [241]:
df7['author_name'].unique()

array(['Pushkin', 'Tolstoy', 'Dostoevsky', 'unknown', 'Chekhov'],
      dtype=object)

In [242]:
df7['author_name'].nunique()

5

In [243]:
df7['author_name'].value_counts()

Dostoevsky    3
unknown       1
Pushkin       1
Chekhov       1
Tolstoy       1
Name: author_name, dtype: int64

In [245]:
df7['book_title'].apply(lambda x: x.upper())

0                 UNKNOWN
1           WAR AND PEASE
2               THE IDIOT
3    CRIME AND PUNISHMENT
4        FATHERS AND SONS
5             THE GAMBLER
6           THREE SISTERS
Name: book_title, dtype: object

#### Группировка данных 

In [246]:
# 1-й способ
df7.groupby('author_name')['price'].max()

author_name
Chekhov       700.0
Dostoevsky    550.0
Pushkin         NaN
Tolstoy       600.0
unknown         NaN
Name: price, dtype: float64

In [248]:
# 2-й способ
price_agg = df7.groupby('author_name').agg({'price':'max'})
price_agg

Unnamed: 0_level_0,price
author_name,Unnamed: 1_level_1
Chekhov,700.0
Dostoevsky,550.0
Pushkin,
Tolstoy,600.0
unknown,


In [250]:
price_agg = price_agg.reset_index()
price_agg = price_agg.rename(columns={'price':'max_price'})
price_agg

Unnamed: 0,index,author_name,max_price
0,0,Chekhov,700.0
1,1,Dostoevsky,550.0
2,2,Pushkin,
3,3,Tolstoy,600.0
4,4,unknown,
