In [1]:
import numpy as np
import pandas as pd

In [2]:
# NOTE(review): the `from` import on the second line rebinds the name
# `datetime` to the class, so the module imported on the first line is no
# longer reachable under that name.
import datetime
from datetime import datetime, date

In [3]:
# Plain-text reprs and compact display limits for every output in this notebook.
display_settings = {
    'display.notebook_repr_html': False,
    'display.max_columns': 8,
    'display.max_rows': 10,
    'display.width': 65,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

In [4]:
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# sp500 = pd.read_csv('Data/sp500.csv', index_col='Symbol', usecols=[0, 2, 3, 7])

In [11]:
# Build a categorical from plain strings. No category list is given, so the
# categories are inferred from the data (and come out sorted).
lmh_values = [
    'low',
    'high',
    'medium',
    'medium',
    'high',
]
lmh_cat = pd.Categorical(values=lmh_values)
lmh_cat

['low', 'high', 'medium', 'medium', 'high']
Categories (3, object): ['high', 'low', 'medium']

In [15]:
# The category labels as a plain NumPy array.
lmh_cat.categories.values

array(['high', 'low', 'medium'], dtype=object)

In [17]:
# Integer code of each value, i.e. its position in lmh_cat.categories.
lmh_cat.codes

array([1, 0, 2, 2, 0], dtype=int8)

In [19]:
# Rebuild with an explicit category list so the categories keep this order
# (low, medium, high) instead of the inferred sorted order.
lmh_cat = pd.Categorical(lmh_values, categories=['low', 'medium', 'high'])

In [23]:
# Categories in the explicitly supplied order.
lmh_cat.categories

Index(['low', 'medium', 'high'], dtype='object')

In [24]:
# Codes are positions within the explicit category list.
lmh_cat.codes

array([0, 2, 1, 1, 2], dtype=int8)

In [25]:
# Sorting follows the category order (low, medium, high), not alphabetical order.
lmh_cat.sort_values()

['low', 'medium', 'medium', 'high', 'high']
Categories (3, object): ['low', 'medium', 'high']

In [28]:
# Create a categorical Series directly by requesting dtype='category'.
cat_series = pd.Series(lmh_values, dtype='category')
cat_series

0       low
1      high
2    medium
3    medium
4      high
dtype: category
Categories (3, object): ['high', 'low', 'medium']

In [30]:
# Alternative route: build a regular Series first, then convert it with
# astype('category').
s = pd.Series(lmh_values)
as_cat = s.astype('category')
as_cat

0       low
1      high
2    medium
3    medium
4      high
dtype: category
Categories (3, object): ['high', 'low', 'medium']

In [31]:
# Categorical Series expose a .cat accessor object.
cat_series.cat

<pandas.core.arrays.categorical.CategoricalAccessor object at 0x000001DE7C2EB730>

In [32]:
# The Series converted with astype('category') has the same kind of accessor.
as_cat.cat

<pandas.core.arrays.categorical.CategoricalAccessor object at 0x000001DE7B4A4B50>

In [36]:
# Categories of the Series, reached through the .cat accessor.
cat_series.cat.categories

Index(['high', 'low', 'medium'], dtype='object')

In [41]:
# Seed NumPy's global RNG so the five sampled integers are reproducible.
np.random.seed(123456)
values = np.random.randint(low=0, high=100, size=5)  # upper bound is exclusive
bins = pd.DataFrame(data={'Values': values})
bins

   Values
0      65
1      49
2      56
3      43
4      43

In [42]:
# Bin each value into right-closed intervals of width 10 with edges 0, 10, ..., 100.
# NOTE(review): because the intervals are right-closed, a value of exactly 0
# would fall outside the first bin (0, 10].
bins['Group'] = pd.cut(values, range(0, 101, 10))
bins

   Values     Group
0      65  (60, 70]
1      49  (40, 50]
2      56  (50, 60]
3      43  (40, 50]
4      43  (40, 50]

In [43]:
# The new column is categorical; its categories are the ordered intervals.
bins['Group']

0    (60, 70]
1    (40, 50]
2    (50, 60]
3    (40, 50]
4    (40, 50]
Name: Group, dtype: category
Categories (10, interval[int64, right]): [(0, 10] < (10, 20] < (20, 30] < (30, 40] ... (60, 70] < (70, 80] < (80, 90] < (90, 100]]

In [45]:
# Ordered categorical: the category list defines the ranking
# bronze < silver < gold.
metal_values = ['bronze', 'gold', 'silver', 'bronze']
metal_categories = ['bronze', 'silver', 'gold']
metals = pd.Categorical(
    values=metal_values,
    categories=metal_categories,
    ordered=True,
)
metals

['bronze', 'gold', 'silver', 'bronze']
Categories (3, object): ['bronze' < 'silver' < 'gold']

In [56]:
# The same values in reverse order, built with the same categories and
# ordering as `metals`.
metals_reversed_values = pd.Categorical(metals[::-1], categories=metals.categories, ordered=True)
metals_reversed_values

['bronze', 'silver', 'gold', 'bronze']
Categories (3, object): ['bronze' < 'silver' < 'gold']

In [57]:
# Element-wise comparison that uses the category order (bronze < silver < gold).
metals <= metals_reversed_values

array([ True, False,  True,  True])

In [63]:
# A value that is not among the categories ('copper') becomes NaN.
pd.Categorical(['bronze', 'copper'], categories=metal_categories)

['bronze', NaN]
Categories (3, object): ['bronze', 'silver', 'gold']

In [7]:
# Only 'a' and 'c' are valid categories; every other letter becomes NaN.
pd.Categorical(list('addcb'), categories=['a', 'c'])

['a', NaN, NaN, 'c', NaN]
Categories (2, object): ['a', 'c']

In [10]:
# Categorical over the explicit categories 'a', 'b', 'c'.
cat = pd.Categorical(['a', 'b', 'c', 'a'], categories=['a', 'b', 'c'])

In [11]:
# Show the categorical with its original category labels.
cat

['a', 'b', 'c', 'a']
Categories (3, object): ['a', 'b', 'c']

In [12]:
# Rename the categories positionally ('a' -> 'bronze', 'b' -> 'silver',
# 'c' -> 'gold'). Assigning to `.categories` directly is deprecated and is
# rejected by pandas 2.x, so rebind `cat` to the renamed copy returned by
# rename_categories() instead.
cat = cat.rename_categories(['bronze', 'silver', 'gold'])

In [13]:
# The values are now shown under the renamed categories.
cat

['bronze', 'silver', 'gold', 'bronze']
Categories (3, object): ['bronze', 'silver', 'gold']

In [16]:
# rename_categories returns a Categorical whose categories are relabelled x, y, z.
cat.rename_categories(['x', 'y', 'z'])

['x', 'y', 'z', 'x']
Categories (3, object): ['x', 'y', 'z']

In [19]:
# ordered=True without an explicit category list: the order is the inferred
# sorted one (bronze < gold < silver), not a ranking by value of the metal.
metals = pd.Categorical(['bronze', 'silver', 'gold'], ordered=True)

In [20]:
# Show the values together with the inferred category order.
metals

['bronze', 'silver', 'gold']
Categories (3, object): ['bronze' < 'gold' < 'silver']

In [21]:
# add_categories returns a Categorical with 'platinum' appended as the last
# (highest) category; no value uses it yet.
metals.add_categories(['platinum'])

['bronze', 'silver', 'gold']
Categories (4, object): ['bronze' < 'gold' < 'silver' < 'platinum']

In [22]:
# `metals` itself still has only its three original categories.
metals

['bronze', 'silver', 'gold']
Categories (3, object): ['bronze' < 'gold' < 'silver']

In [24]:
# Removing a category turns the values that used it into NaN.
metals.remove_categories(['bronze'])

[NaN, 'silver', 'gold']
Categories (2, object): ['gold' < 'silver']

In [25]:
# Categorical Series of number words; the categories are inferred and sorted.
s = pd.Series(
    data=['one', 'two', 'four', 'five'],
    dtype='category',
)
s

0     one
1     two
2    four
3    five
dtype: category
Categories (4, object): ['five', 'four', 'one', 'two']

In [27]:
# Restrict the categories to 'one' and 'four'; values outside the new set
# become NaN.
s = s.cat.set_categories(['one', 'four'])

In [28]:
# Show the Series after its categories were replaced.
s

0     one
1     NaN
2    four
3     NaN
dtype: category
Categories (2, object): ['one', 'four']

In [38]:
# Ordered categorical with a repeated value; the category order is inferred
# (sorted), giving bronze < gold < silver.
metals = pd.Categorical(
    values=['bronze', 'silver', 'gold', 'bronze'],
    ordered=True,
)
metals

['bronze', 'silver', 'gold', 'bronze']
Categories (3, object): ['bronze' < 'gold' < 'silver']

In [39]:
# Per-category counts and relative frequencies.
metals.describe()

            counts  freqs
categories               
bronze           2   0.50
gold             1   0.25
silver           1   0.25

In [40]:
# Number of occurrences of each category.
metals.value_counts()

bronze    2
gold      1
silver    1
dtype: int64

In [41]:
# Smallest and largest value by category order, plus the most frequent value.
# Categorical.mode() is deprecated (and gone in pandas 2.x); go through
# Series.mode() and take `.array` so the result is still a Categorical.
(metals.min(), metals.max(), pd.Series(metals).mode().array)

  (metals.min(), metals.max(), metals.mode())


('bronze',
 'silver',
 ['bronze']
 Categories (3, object): ['bronze' < 'gold' < 'silver'])

In [43]:
# Wrap the categorical in a Series.
metals_ser = pd.Series(metals)

In [44]:
# The same statistics on the Series; here mode() returns a categorical Series.
(metals_ser.min(), metals_ser.max(), metals_ser.mode())

('bronze',
 'silver',
 0    bronze
 dtype: category
 Categories (3, object): ['bronze' < 'gold' < 'silver'])

In [5]:
# Reproducible random grades from 50 to 100 inclusive, one per student.
np.random.seed(123456)
names = [
    'Ivana', 'Norris', 'Ruth', 'Lane', 'Skye',
    'Sol', 'Dylan', 'Ratina', 'Alissa', 'Marc',
]
grades = np.random.randint(low=50, high=101, size=len(names))  # upper bound is exclusive
scores = pd.DataFrame(data={'Name': names, 'Grade': grades})
scores

     Name  Grade
0   Ivana     51
1  Norris     92
2    Ruth    100
3    Lane     99
4    Skye     93
5     Sol     97
6   Dylan     93
7  Ratina     77
8  Alissa     82
9    Marc     73

In [6]:
# Bin edges for the letter grades: 14 edges delimit 13 consecutive grade bands.
score_bins = [0, 59, 62, 66, 69, 72, 76, 79, 82, 86, 89, 92, 99, 100]
score_bins

[0, 59, 62, 66, 69, 72, 76, 79, 82, 86, 89, 92, 99, 100]

In [7]:
# One label per grade band, listed from the lowest band to the highest.
letter_grades = ['F', 'D-', 'D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A', 'A+']
letter_grades

['F', 'D-', 'D', 'D+', 'C-', 'C', 'C+', 'B-', 'B', 'B+', 'A-', 'A', 'A+']

In [8]:
# Map each numeric grade to its letter. The bins are right-closed, so without
# include_lowest=True a grade equal to the first edge (0) would fall outside
# every bin and come back as NaN instead of 'F'.
letter_cats = pd.cut(
    scores['Grade'],
    bins=score_bins,
    labels=letter_grades,
    include_lowest=True,
)
letter_cats

0     F
1    A-
2    A+
3     A
4     A
5     A
6     A
7    C+
8    B-
9     C
Name: Grade, dtype: category
Categories (13, object): ['F' < 'D-' < 'D' < 'D+' ... 'B+' < 'A-' < 'A' < 'A+']

In [9]:
# Attach the letter grades to the scores table as a new column.
scores['Letter'] = letter_cats
scores

     Name  Grade Letter
0   Ivana     51      F
1  Norris     92     A-
2    Ruth    100     A+
3    Lane     99      A
4    Skye     93      A
5     Sol     97      A
6   Dylan     93      A
7  Ratina     77     C+
8  Alissa     82     B-
9    Marc     73      C

In [10]:
# The column keeps its ordered categorical dtype inside the DataFrame.
scores['Letter']

0     F
1    A-
2    A+
3     A
4     A
5     A
6     A
7    C+
8    B-
9     C
Name: Letter, dtype: category
Categories (13, object): ['F' < 'D-' < 'D' < 'D+' ... 'B+' < 'A-' < 'A' < 'A+']

In [11]:
# Students per letter grade; categories nobody received are listed with a count of 0.
scores['Letter'].value_counts()

A     4
F     1
C     1
C+    1
B-    1
     ..
D     0
D+    0
C-    0
B     0
B+    0
Name: Letter, Length: 13, dtype: int64

In [13]:
# Sort by letter grade, best first; the sort uses the category order
# (F < ... < A+), not alphabetical order.
scores.sort_values(by=['Letter'], ascending=False)

     Name  Grade Letter
2    Ruth    100     A+
3    Lane     99      A
4    Skye     93      A
5     Sol     97      A
6   Dylan     93      A
1  Norris     92     A-
8  Alissa     82     B-
7  Ratina     77     C+
9    Marc     73      C
0   Ivana     51      F