In [1]:
import numpy as np
import pandas as pd

In [3]:
# Eight rows of standard-normal draws under three upper-case column labels.
# No seed is set, so the values differ from run to run.
df = pd.DataFrame(
    data=np.random.randn(8, 3),
    columns=['A', 'B', 'C'],
)

In [4]:
# Normalise the column labels to lower case, then display the frame.
df.columns = df.columns.str.lower()
df

Unnamed: 0,a,b,c
0,1.908265,0.179872,0.512636
1,-0.327163,-1.444348,0.255928
2,-0.923171,-1.600893,-0.39435
3,-0.055932,1.452695,1.430201
4,-0.146612,-1.594281,1.741658
5,0.199626,1.947924,0.635331
6,0.012215,-1.322598,1.656495
7,-0.76177,-0.733743,2.097798


In [5]:
# The `.array` attribute of column 'a'; the output below shows it as a
# PandasArray of float64 values.
df.a.array

<PandasArray>
[  1.9082648945918925,  -0.3271629042754731,  -0.9231714709916108,
 -0.05593221740226437, -0.14661213282529156,  0.19962616239869746,
  0.01221456747248198,  -0.7617697525375365]
Length: 8, dtype: float64

In [6]:
# Draw 50 integers from the half-open range [0, 7).
data = np.random.randint(low=0, high=7, size=50)

In [7]:
# Echo the sampled integer array.
data

array([5, 0, 1, 5, 1, 5, 0, 1, 2, 5, 2, 3, 5, 2, 3, 6, 4, 5, 5, 0, 6, 2,
       2, 4, 1, 6, 2, 5, 2, 4, 2, 6, 2, 5, 4, 3, 0, 5, 5, 4, 5, 4, 5, 0,
       6, 1, 6, 4, 3, 5])

In [8]:
# Wrap the sampled integers in a Series.
s = pd.Series(data)

In [10]:
# Tally how often each distinct value occurs; the output below is ordered
# by count, largest first.
s.value_counts()

5    14
2     9
4     7
6     6
0     5
1     5
3     4
dtype: int64

In [11]:
# 3 and 7 each occur three times (more than any other value), so mode()
# returns both of them.
s5 = pd.Series(data=[1, 1, 3, 3, 3, 5, 5, 7, 7, 7])
s5.mode()

0    3
1    7
dtype: int64

In [12]:
# Two independently drawn integer columns of 50 rows each:
# 'a' from [0, 7) and 'B' from [-10, 15).
df5 = pd.DataFrame(
    {
        'a': np.random.randint(low=0, high=7, size=50),
        'B': np.random.randint(low=-10, high=15, size=50),
    }
)

In [13]:
# Column-wise modes. A column with fewer modes than another is padded with
# NaN, as column 'a' is in the output below.
df5.mode()

Unnamed: 0,a,B
0,4.0,-3
1,,3
2,,4
3,,10


In [14]:
# Reindexing: labels that have no data are assigned NaN.
# Start from five random values labelled 'a' through 'e'.
s = pd.Series(
    data=np.random.randn(5),
    index=['a', 'b', 'c', 'd', 'e'],
)


In [15]:
# Show the labelled Series before it is reindexed.
s

a    0.075309
b    0.043612
c   -0.256028
d    1.186242
e   -1.181105
dtype: float64

In [16]:
# Conform the Series to the requested labels, in the requested order.
# 'f' has no data in `s`, so it appears as NaN in the output below.
s.reindex(['e', 'b', 'f', 'd'])

e   -1.181105
b    0.043612
f         NaN
d    1.186242
dtype: float64

In [17]:
# Reindexing DataFrames.
# The three columns come from Series whose indexes only partly overlap, so
# the resulting frame has rows 'a' through 'd' with NaN where a column has
# no value for a label.
df = pd.DataFrame(
    {
        'one': pd.Series(data=np.random.randn(3), index=['a', 'b', 'c']),
        'two': pd.Series(data=np.random.randn(4), index=['a', 'b', 'c', 'd']),
        'three': pd.Series(data=np.random.randn(3), index=['b', 'c', 'd']),
    }
)

In [18]:
# Reindex both axes at once: rows c, f, b and columns three, two, one.
# Row 'f' does not exist in `df`, so it is all NaN in the output below.
df.reindex(index=['c', 'f', 'b'], columns=['three', 'two', 'one'])

Unnamed: 0,three,two,one
c,-1.937599,0.006493,-0.019842
f,,,
b,0.217714,0.030172,-0.042156


In [19]:
# dropping labels from an axis
# (first, the full frame for comparison)
df

Unnamed: 0,one,two,three
a,0.662815,0.110054,
b,-0.042156,0.030172,0.217714
c,-0.019842,0.006493,-1.937599
d,,1.290206,-0.4426


In [20]:
# axis=0 targets the row labels: rows 'a' and 'd' are removed from the result.
df.drop(['a', 'd'], axis=0)

Unnamed: 0,one,two,three
b,-0.042156,0.030172,0.217714
c,-0.019842,0.006493,-1.937599


In [21]:
# axis=1 targets the column labels: column 'one' is removed from the result.
df.drop(['one'], axis=1)

Unnamed: 0,two,three
a,0.110054,
b,0.030172,0.217714
c,0.006493,-1.937599
d,1.290206,-0.4426


In [22]:
# renaming
# (the Series as it looks before its labels are renamed)
s

a    0.075309
b    0.043612
c   -0.256028
d    1.186242
e   -1.181105
dtype: float64

In [23]:
# rename accepts a function: str.upper is applied to every index label,
# giving A..E in the output below.
s.rename(str.upper)

A    0.075309
B    0.043612
C   -0.256028
D    1.186242
E   -1.181105
dtype: float64

In [24]:
# A dict or a Series can serve as the rename mapping.
# Labels missing from a mapping (row 'c', column 'three') are left as they are.
df.rename(
    index={'a': 'apple', 'b': 'banana', 'd': 'durian'},
    columns={'one': 'foo', 'two': 'bar'},
)

Unnamed: 0,foo,bar,three
apple,0.662815,0.110054,
banana,-0.042156,0.030172,0.217714
c,-0.019842,0.006493,-1.937599
durian,,1.290206,-0.4426


In [25]:
# Mapping-plus-axis form: axis='columns' applies the dict to the column labels.
df.rename({'one': 'foo', 'two': 'bar'}, axis='columns')

Unnamed: 0,foo,bar,three
a,0.662815,0.110054,
b,-0.042156,0.030172,0.217714
c,-0.019842,0.006493,-1.937599
d,,1.290206,-0.4426


In [26]:
# Mapping-plus-axis form: axis='index' applies the dict to the row labels.
df.rename({'a': 'apple', 'b': 'banana', 'd': 'durian'}, axis='index')

Unnamed: 0,one,two,three
apple,0.662815,0.110054,
banana,-0.042156,0.030172,0.217714
c,-0.019842,0.006493,-1.937599
durian,,1.290206,-0.4426


In [27]:
# .dt and .str accessors
# The .dt accessor exposes datetime-like properties of a Series' values.
# Example datetime Series: four consecutive days, each at 09:10:12.
s = pd.Series(data=pd.date_range(start='20130101 09:10:12', periods=4))
s

0   2013-01-01 09:10:12
1   2013-01-02 09:10:12
2   2013-01-03 09:10:12
3   2013-01-04 09:10:12
dtype: datetime64[ns]

In [28]:
# Hour component of each timestamp.
s.dt.hour

0    9
1    9
2    9
3    9
dtype: int64

In [29]:
# Seconds component of each timestamp.
s.dt.second

0    12
1    12
2    12
3    12
dtype: int64

In [31]:
# Day-of-month component of each timestamp.
s.dt.day

0    1
1    2
2    3
3    4
dtype: int64

In [32]:
# Day of the week as an integer. pandas counts Monday as 0, so 2013-01-01
# (a Tuesday) is 1 in the output below.
s.dt.dayofweek

0    1
1    2
2    3
3    4
dtype: int64

In [33]:
# timezone setting
# tz_localize attaches a zone to the naive timestamps; the wall-clock time
# stays 09:10:12 and gains a -05:00 offset in the output below.
stz = s.dt.tz_localize('US/Eastern')
stz

0   2013-01-01 09:10:12-05:00
1   2013-01-02 09:10:12-05:00
2   2013-01-03 09:10:12-05:00
3   2013-01-04 09:10:12-05:00
dtype: datetime64[ns, US/Eastern]

In [34]:
# The timezone object attached to the localized Series.
stz.dt.tz

<DstTzInfo 'US/Eastern' LMT-1 day, 19:04:00 STD>

In [35]:
# can also chain timezone operations:
# label the naive times as UTC, then convert to US/Eastern
# (09:10:12 UTC is shown as 04:10:12-05:00 in the output below)
s.dt.tz_localize('UTC').dt.tz_convert('US/Eastern')

0   2013-01-01 04:10:12-05:00
1   2013-01-02 04:10:12-05:00
2   2013-01-03 04:10:12-05:00
3   2013-01-04 04:10:12-05:00
dtype: datetime64[ns, US/Eastern]

In [36]:
# The .str accessor is the set of string-processing methods.
# The missing entry (np.nan) is shown as <NA> in the lower-cased result.
s = pd.Series(
    data=['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog'],
    dtype='string',
)
s.str.lower()


0       a
1       b
2       c
3    aaba
4    baca
5    <NA>
6    caba
7     dog
dtype: string

In [37]:
# Sorting by index.
# Fresh frame for the sorting examples: three random columns built from
# Series whose indexes only partly overlap.
df = pd.DataFrame(
    {
        'one': pd.Series(data=np.random.randn(3), index=['a', 'b', 'c']),
        'two': pd.Series(data=np.random.randn(4), index=['a', 'b', 'c', 'd']),
        'three': pd.Series(data=np.random.randn(3), index=['b', 'c', 'd']),
    }
)

In [38]:
# Put both the rows and the columns into a non-sorted order.
unsorted_df = df.reindex(
    columns=['three', 'two', 'one'],
    index=['a', 'd', 'c', 'b'],
)

In [39]:
# Show the frame in its shuffled row and column order.
unsorted_df

Unnamed: 0,three,two,one
a,,0.329328,-0.94084
d,-1.042015,-1.547693,
c,0.606485,-0.806758,-0.590694
b,2.179099,-0.004951,-0.816333


In [40]:
# Sort the rows by index label; with no arguments the order is ascending
# (a, b, c, d in the output below).
unsorted_df.sort_index()

Unnamed: 0,three,two,one
a,,0.329328,-0.94084
b,2.179099,-0.004951,-0.816333
c,0.606485,-0.806758,-0.590694
d,-1.042015,-1.547693,


In [41]:
# Sort the rows by index label in descending order (d, c, b, a).
unsorted_df.sort_index(ascending=False)

Unnamed: 0,three,two,one
d,-1.042015,-1.547693,
c,0.606485,-0.806758,-0.590694
b,2.179099,-0.004951,-0.816333
a,,0.329328,-0.94084


In [42]:
# axis=1 sorts the column labels instead (one, three, two); the row order
# is left as it was.
unsorted_df.sort_index(axis=1)

Unnamed: 0,one,three,two
a,-0.94084,,0.329328
d,,-1.042015,-1.547693
c,-0.590694,0.606485,-0.806758
b,-0.816333,2.179099,-0.004951


In [44]:
# sort_index also works on a single column (a Series) selected from the frame.
unsorted_df['three'].sort_index()

a         NaN
b    2.179099
c    0.606485
d   -1.042015
Name: three, dtype: float64