# Categorical Data


In [1]:
import pandas as pd
import numpy as np

In [2]:
fruit=pd.Series(['apple', 'orange','apple','apple','banana']*2)

In [3]:
fruit

0     apple
1    orange
2     apple
3     apple
4    banana
5     apple
6    orange
7     apple
8     apple
9    banana
dtype: object

In [4]:
pd.unique(fruit)

array(['apple', 'orange', 'banana'], dtype=object)

In [5]:
pd.value_counts(fruit)

  pd.value_counts(fruit)


apple     6
orange    2
banana    2
Name: count, dtype: int64

In [6]:
value=pd.Series([0,1,0,3,0]*2)

In [7]:
dim=pd.Series(['apple','orange','banana','abcd'])

In [8]:
dim.take(value)

0     apple
1    orange
0     apple
3      abcd
0     apple
0     apple
1    orange
0     apple
3      abcd
0     apple
dtype: object

In [16]:
n=len(fruit)

In [17]:
n

10

In [19]:
df=pd.DataFrame({'fruit':fruit,'b_id':np.arange(n),'count':np.random.randint(3,15,size=n),'weight':np.random.uniform(0,4,size=n)},columns=['b_id','fruit','count','weight'])

In [20]:
df

Unnamed: 0,b_id,fruit,count,weight
0,0,apple,14,0.152782
1,1,orange,5,3.873476
2,2,apple,4,1.706438
3,3,apple,8,0.155002
4,4,banana,11,0.690921
5,5,apple,11,0.627894
6,6,orange,12,3.289663
7,7,apple,13,2.812984
8,8,apple,13,0.282454
9,9,banana,5,0.872299


In [21]:
fruit_cat=df['fruit'].astype('category')

In [71]:
fruit_cat

0     apple
1    orange
2     apple
3     apple
4    banana
5     apple
6    orange
7     apple
8     apple
9    banana
Name: fruit, dtype: category
Categories (3, object): [apple, banana, orange]

In [72]:
c=fruit_cat.values


In [64]:
type(c)

pandas.core.arrays.categorical.Categorical

In [73]:
c.categories

Index(['apple', 'banana', 'orange'], dtype='object')

In [74]:
c.codes

array([0, 2, 0, 0, 1, 0, 2, 0, 0, 1], dtype=int8)

# covert dataframe column to categorical

In [75]:
df['fruit']=df['fruit'].astype('category')

In [76]:
df.fruit

0     apple
1    orange
2     apple
3     apple
4    banana
5     apple
6    orange
7     apple
8     apple
9    banana
Name: fruit, dtype: category
Categories (3, object): [apple, banana, orange]

In [21]:
my_categories=pd.Categorical(['foo','bar','baz','foo','bar'])

In [22]:
my_categories

[foo, bar, baz, foo, bar]
Categories (3, object): [bar, baz, foo]

In [23]:
catgo=['foo','bar','baz']

In [24]:
code=[0,1,2,0,0,1]

In [25]:
catego1=pd.Categorical.from_codes(code,catgo)

In [26]:
catego1

[foo, bar, baz, foo, foo, bar]
Categories (3, object): [foo, bar, baz]

In [27]:
ordered_catgo=pd.Categorical.from_codes(code,catgo,ordered=True)

In [28]:
ordered_catgo

[foo, bar, baz, foo, foo, bar]
Categories (3, object): [foo < bar < baz]

In [29]:
catego1

[foo, bar, baz, foo, foo, bar]
Categories (3, object): [foo, bar, baz]

In [30]:
catego1.as_ordered()  #adding ordering

[foo, bar, baz, foo, foo, bar]
Categories (3, object): [foo < bar < baz]

#performance with categoricals

In [46]:
n=1000000

In [41]:
dra=pd.Series(np.random.randn(n))

In [43]:
dra.size

10

In [47]:
label=pd.Series(['foo','bar','baz','qux']*(n//4))

In [48]:
catgo=label.astype('category')

In [49]:
label.memory_usage()

8000128

In [50]:
catgo.memory_usage()

1000320

In [56]:
label.memory_usage(deep=True)

60000128

In [53]:
catgo.memory_usage(deep=True)

1000528

In [52]:
%time _ =label.astype('category')

Wall time: 67.3 ms


In [54]:
%timeit label.value_counts()

166 ms ± 28.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [55]:
%timeit catgo.value_counts()

12.8 ms ± 1.48 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [42]:
t=pd.Series(['a','b','c','d']*2)

In [43]:
cat_t=t.astype('category')

In [44]:
cat_t

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): [a, b, c, d]

In [45]:
cat_t.cat.codes #cat method to provide access to categorical methods

0    0
1    1
2    2
3    3
4    0
5    1
6    2
7    3
dtype: int8

In [46]:
cat_t.cat.categories

Index(['a', 'b', 'c', 'd'], dtype='object')

In [47]:
actual_cat = ['a', 'b', 'c', 'd', 'e']


In [48]:
cat_t1=cat_t.cat.set_categories(actual_cat)

In [49]:
cat_t1

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (5, object): [a, b, c, d, e]

In [50]:
cat_t.value_counts()

d    2
c    2
b    2
a    2
dtype: int64

In [51]:
cat_t1.value_counts()

d    2
c    2
b    2
a    2
e    0
dtype: int64

In [52]:
cat_t3 = cat_t[cat_t.isin(['a', 'b'])]

In [53]:
cat_t3

0    a
1    b
4    a
5    b
dtype: category
Categories (4, object): [a, b, c, d]

In [54]:
cat_t3.cat.remove_unused_categories()

0    a
1    b
4    a
5    b
dtype: category
Categories (2, object): [a, b]

In [55]:
cat_s = pd.Series(['a', 'b', 'c', 'd'] * 2, dtype='category')

In [56]:
pd.get_dummies(cat_s)


Unnamed: 0,a,b,c,d
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,1,0,0,0
5,0,1,0,0
6,0,0,1,0
7,0,0,0,1


 # group by

In [60]:
df=pd.DataFrame({'key':['a','b','d']*4,'value':np.arange(12)})

In [61]:
df

Unnamed: 0,key,value
0,a,0
1,b,1
2,d,2
3,a,3
4,b,4
5,d,5
6,a,6
7,b,7
8,d,8
9,a,9


In [62]:
grp=df.groupby('key').value

In [63]:
grp

<pandas.core.groupby.generic.SeriesGroupBy object at 0x00000203B3747850>

In [64]:
grp.mean()

key
a    4.5
b    5.5
d    6.5
Name: value, dtype: float64

In [65]:
grp.transform(lambda x:x.mean())

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [66]:
grp.transform('mean')

0     4.5
1     5.5
2     6.5
3     4.5
4     5.5
5     6.5
6     4.5
7     5.5
8     6.5
9     4.5
10    5.5
11    6.5
Name: value, dtype: float64

In [67]:
grp.transform(lambda x:x*3)

0      0
1      3
2      6
3      9
4     12
5     15
6     18
7     21
8     24
9     27
10    30
11    33
Name: value, dtype: int32

In [68]:
grp.transform(lambda x:x.rank())

0     1
1     1
2     1
3     2
4     2
5     2
6     3
7     3
8     3
9     4
10    4
11    4
Name: value, dtype: int32

In [92]:
def normalize(x):
    return(x-x.mean())/x.std()

In [93]:
grp.transform(normalize)

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

In [94]:
grp.apply(normalize)

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

In [95]:
norm_df=(df['value']-grp.transform('mean'))/grp.transform('std')

In [96]:
norm_df

0    -1.161895
1    -1.161895
2    -1.161895
3    -0.387298
4    -0.387298
5    -0.387298
6     0.387298
7     0.387298
8     0.387298
9     1.161895
10    1.161895
11    1.161895
Name: value, dtype: float64

# method chaining 

In [3]:
import pandas as pd
df=pd.read_csv('Documents/ex1.csv')



In [4]:
df

Unnamed: 0,a,b,c,d,message
0,1,2,3,4,hello
1,5,6,7,8,world
2,9,10,11,12,foo


In [105]:
df1=df[df['b']>5]

In [107]:
df1

Unnamed: 0,a,b,c,d,message
1,5,6,7,8,world
2,9,10,11,12,foo


In [111]:
df1['col1_chan'] = df1['a']- df1['a'].mean()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df1['col1_chan'] = df1['a']- df1['a'].mean()


In [110]:
df1['a']- df1['a'].mean()

1   -2.0
2    2.0
Name: a, dtype: float64

In [112]:
df1=df.copy()

In [113]:
df1['a1']=df['a'].mean()

In [114]:
df1

Unnamed: 0,a,b,c,d,message,a1
0,1,2,3,4,hello,5.0
1,5,6,7,8,world,5.0
2,9,10,11,12,foo,5.0


In [118]:
df1=df.assign('a1'= df['a'].mean())

SyntaxError: expression cannot contain assignment, perhaps you meant "=="? (<ipython-input-118-37491e28d86f>, line 1)

In [119]:
result=(pd.read_csv('Documents/ex1.csv')
        lambda x: x.'b'>5])


SyntaxError: invalid syntax (<ipython-input-119-d5bc696220aa>, line 2)

In [127]:
import pandas as pd
bins=[0, 3,5, 7]
labels=['Short', 'medium', 'long']
res = (
  pd.read_csv('documents/iris.csv')
    .query('species == "setosa"')
    .assign(petal_length = lambda df: pd.cut(df['sepal_length'], bins=bins, labels=labels))
)
res.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
0,5.1,3.5,long,0.2,setosa,,,,,
1,4.9,3.0,medium,0.2,setosa,,,,,
2,4.7,3.2,medium,0.2,setosa,,,,,
3,4.6,3.1,medium,0.2,setosa,,,,,
4,5.0,3.6,medium,0.2,setosa,,5.2,4.1,1.8,0.3


In [5]:
(df.dropna(subset=['dep_time', 'unique_carrier'])
   .loc[df['unique_carrier']
       .isin(df['unique_carrier'].value_counts().index[:5])]
   .set_index('dep_time')
   # TimeGrouper to resample & groupby at once
   .groupby(['unique_carrier', pd.TimeGrouper("H")])
   .fl_num.count()
   .unstack(0)
   .fillna(0)
   .rolling(24)
   .sum()
   .rename_axis("Flights per Day", axis=1)
   .plot()
)

KeyError: ['dep_time', 'unique_carrier']

In [128]:
 pd.read_csv('documents/iris.csv')
 

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
0,5.1,3.5,1.4,0.2,setosa,,,,,
1,4.9,3.0,1.4,0.2,setosa,,,,,
2,4.7,3.2,1.3,0.2,setosa,,,,,
3,4.6,3.1,1.5,0.2,setosa,,,,,
4,5.0,3.6,1.4,0.2,setosa,,5.2,4.1,1.8,0.3
...,...,...,...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,,,,,
146,6.3,2.5,5.0,1.9,virginica,,,,,
147,6.5,3.0,5.2,2.0,virginica,,,,,
148,6.2,3.4,5.4,2.3,virginica,,,,,
