In [1]:
import pandas as pd
import numpy as np

from datetime import datetime, date

# Plain-text (non-HTML) reprs, truncated to 8 columns / 10 rows and
# wrapped at 80 characters.
pd.options.display.notebook_repr_html = False
pd.options.display.max_columns = 8
pd.options.display.max_rows = 10
pd.options.display.width = 80

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Seniority labels.  The explicit dtype fixes the category order as
# senior, middle, junior rather than leaving it to be inferred.
l = ['junior', 'middle', 'senior', 'junior']
l_cat = pd.Categorical(
    l,
    dtype=pd.CategoricalDtype(categories=['senior', 'middle', 'junior']),
)

l_cat

['junior', 'middle', 'senior', 'junior']
Categories (3, object): ['senior', 'middle', 'junior']

In [3]:
# Category labels, in the order declared when l_cat was built.
l_cat.categories

Index(['senior', 'middle', 'junior'], dtype='object')

In [4]:
# Integer code of each element: its position within l_cat.categories.
l_cat.codes

array([2, 1, 0, 2], dtype=int8)

In [5]:
# Sorting follows the declared category order (senior, middle, junior),
# not alphabetical order.
l_cat.sort_values()

['senior', 'middle', 'junior', 'junior']
Categories (3, object): ['senior', 'middle', 'junior']

In [6]:
# l_cat is already categorical, so the Series takes over its dtype and
# category order without an explicit dtype argument.
s_cat = pd.Series(l_cat)

s_cat

0    junior
1    middle
2    senior
3    junior
dtype: category
Categories (3, object): ['senior', 'middle', 'junior']

In [7]:
# Build the categorical Series straight from the plain list; the
# categories are inferred from the data (see the printed result).
s_cat = pd.Series(l, dtype='category')

print(s_cat, s_cat.cat.categories, sep='\n')

0    junior
1    middle
2    senior
3    junior
dtype: category
Categories (3, object): ['junior', 'middle', 'senior']
Index(['junior', 'middle', 'senior'], dtype='object')


In [8]:
# Reproducible sample: five integers drawn from [0, 100).
np.random.seed(123456)
values = np.random.randint(low=0, high=100, size=5)
bins = pd.DataFrame(dict(Values=values))

bins

   Values
0      65
1      49
2      56
3      43
4      43

In [9]:
# Assign each value to a ten-wide, right-closed interval such as (60, 70].
# NOTE: with the default right-closed intervals the lowest edge (0) itself
# is not covered by any bin.
bins['Group'] = pd.cut(bins['Values'], bins=range(0, 101, 10))
bins

   Values     Group
0      65  (60, 70]
1      49  (40, 50]
2      56  (50, 60]
3      43  (40, 50]
4      43  (40, 50]

In [10]:
# Ordered categorical: junior < middle < senior.
cl_cat = pd.Categorical(l, categories=['junior', 'middle', 'senior']).as_ordered()

cl_cat

['junior', 'middle', 'senior', 'junior']
Categories (3, object): ['junior' < 'middle' < 'senior']

In [11]:
# Reversed, independent copy of cl_cat for an element-wise comparison.
cl_cat_revers = cl_cat[::-1].copy()

print('cl_cat', cl_cat.codes, sep='\n')
print('cl_cat_revers', cl_cat_revers.codes, sep='\n')

# Ordered categoricals compare by category rank, element by element.
cl_cat <= cl_cat_revers

cl_cat
[0 1 2 0]
cl_cat_revers
[0 2 1 0]


array([ True,  True, False,  True])

#### Переименование

In [12]:
# 'cooper' is not one of the declared categories, so it is stored as NaN.
s_cl_1 = pd.Categorical(
    values=['bronze', 'cooper'],
    categories=['bronze', 'silver', 'gold'],
)

s_cl_1

['bronze', NaN]
Categories (3, object): ['bronze', 'silver', 'gold']

In [14]:
# Relabel the categories positionally: junior -> bronze, middle -> silver,
# senior -> gold.  Direct assignment to `.categories` is rejected by
# pandas 2.0+, so rebind the name to the renamed copy instead.
cl_cat = cl_cat.rename_categories(['bronze', 'silver', 'gold'])

cl_cat

['bronze', 'silver', 'gold', 'bronze']
Categories (3, object): ['bronze' < 'silver' < 'gold']

In [15]:
# rename_categories returns a relabelled copy; here the result is only
# displayed, not stored.
cl_cat.rename_categories(['junior', 'middle', 'senior'])

['junior', 'middle', 'senior', 'junior']
Categories (3, object): ['junior' < 'middle' < 'senior']

In [16]:
# cl_cat still carries the bronze/silver/gold labels after the rename call.
cl_cat

['bronze', 'silver', 'gold', 'bronze']
Categories (3, object): ['bronze' < 'silver' < 'gold']

#### Добавление

In [17]:
# Returns a copy with 'platinum' appended as a new (unused) highest category.
cl_cat.add_categories('platinum')

['bronze', 'silver', 'gold', 'bronze']
Categories (4, object): ['bronze' < 'silver' < 'gold' < 'platinum']

In [18]:
# cl_cat itself still has only its three original categories.
cl_cat

['bronze', 'silver', 'gold', 'bronze']
Categories (3, object): ['bronze' < 'silver' < 'gold']

#### Удаление

In [19]:
# Returns a copy without 'bronze'; values that used it become NaN.
cl_cat.remove_categories('bronze')

[NaN, 'silver', 'gold', NaN]
Categories (2, object): ['silver' < 'gold']

In [20]:
# cl_cat itself still contains 'bronze'.
cl_cat

['bronze', 'silver', 'gold', 'bronze']
Categories (3, object): ['bronze' < 'silver' < 'gold']

In [23]:
# Add an unused 'platinum' category, then show that
# remove_unused_categories() strips it again.  The call is made on
# with_platinum, since cl_cat itself has no unused category to drop.
with_platinum = cl_cat.add_categories('platinum')
print(with_platinum)

with_platinum.remove_unused_categories()

['bronze', 'silver', 'gold', 'bronze']
Categories (4, object): ['bronze' < 'silver' < 'gold' < 'platinum']


['bronze', 'silver', 'gold', 'bronze']
Categories (3, object): ['bronze' < 'silver' < 'gold']

#### Установка

In [24]:
# Returns a copy with the whole category set replaced; values absent from
# the new set become NaN.
cl_cat.set_categories(['bottom', 'middle', 'top'])

[NaN, NaN, NaN, NaN]
Categories (3, object): ['bottom' < 'middle' < 'top']

In [25]:
# cl_cat itself keeps its bronze/silver/gold categories and values.
cl_cat

['bronze', 'silver', 'gold', 'bronze']
Categories (3, object): ['bronze' < 'silver' < 'gold']

#### Вычисление описательных статистик

In [31]:
# Per-category counts and relative frequencies.
cl_cat.describe()

            counts  freqs
categories               
bronze           2   0.50
silver           1   0.25
gold             1   0.25

In [32]:
# Number of occurrences of each category.
cl_cat.value_counts()

bronze    2
silver    1
gold      1
dtype: int64

#### Пример - обработка школьных оценок

In [33]:
# Reproducible sample data: one random integer grade in [50, 100] per student.
np.random.seed(123456)
names = [
    'Ivana', 'Norris', 'Ruth', 'Lane', 'Skye',
    'Sol', 'Dylan', 'Katina', 'Alissa', 'Marc',
]
grades = np.random.randint(low=50, high=101, size=len(names))
scores = pd.DataFrame(dict(name=names, grade=grades))

scores

     name  grade
0   Ivana     51
1  Norris     92
2    Ruth    100
3    Lane     99
4    Skye     93
5     Sol     97
6   Dylan     93
7  Katina     77
8  Alissa     82
9    Marc     73

In [34]:
# Edges of the letter-grade bands.  pd.cut builds right-closed intervals
# from consecutive edges: (0, 59] -> 'F', ..., (99, 100] -> 'A+'.
score_bins = [0, 59, 62, 66, 69, 72, 76, 79, 82, 86, 89, 92, 99, 100]
letter_grades = ['F', 'D-', 'D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A', 'A+']

# include_lowest=True closes the first band on the left so that a grade of
# exactly 0 is labelled 'F' instead of falling outside every bin (NaN).
letter_cats = pd.cut(
    scores['grade'],
    score_bins,
    labels=letter_grades,
    include_lowest=True,
)
scores['letter'] = letter_cats

scores

     name  grade letter
0   Ivana     51      F
1  Norris     92     A-
2    Ruth    100     A+
3    Lane     99      A
4    Skye     93      A
5     Sol     97      A
6   Dylan     93      A
7  Katina     77     C+
8  Alissa     82     B-
9    Marc     73      C

In [36]:
# The letter column is an ordered categorical, so a descending sort puts
# the best grade (A+) first rather than sorting alphabetically.
scores.sort_values('letter', ascending=False)

     name  grade letter
2    Ruth    100     A+
3    Lane     99      A
4    Skye     93      A
5     Sol     97      A
6   Dylan     93      A
1  Norris     92     A-
8  Alissa     82     B-
7  Katina     77     C+
9    Marc     73      C
0   Ivana     51      F