In [2]:
import numpy as np
import pandas as pd

## DataFrame Practice

In [5]:
# Sample Datas
data = {"names": ["sunyoung", "sunyoung", "sunyoung", "jungjae", "jungjae"],
           "year": [2018, 2017, 2016, 2015, 2018],
           "points": [2.9, 3.7, 4.6, 5.4, 6.9]}
df = pd.DataFrame(data, columns=["year", "names", "points", "penalty"],
                          index=["one", "two", "three", "four", "five"])

In [9]:
print(data)
print("This is now DataFrame based on data")
print(df)

{'names': ['sunyoung', 'sunyoung', 'sunyoung', 'jungjae', 'jungjae'], 'year': [2018, 2017, 2016, 2015, 2018], 'points': [2.9, 3.7, 4.6, 5.4, 6.9]}
This is now DataFrame based on data
       year     names  points penalty
one    2018  sunyoung     2.9     NaN
two    2017  sunyoung     3.7     NaN
three  2016  sunyoung     4.6     NaN
four   2015   jungjae     5.4     NaN
five   2018   jungjae     6.9     NaN


In [11]:
# Selecting Column
df['year']

one      2018
two      2017
three    2016
four     2015
five     2018
Name: year, dtype: int64

In [13]:
# Selecting certain column
df.names

one      sunyoung
two      sunyoung
three    sunyoung
four      jungjae
five      jungjae
Name: names, dtype: object

In [14]:
# Selecting multiple columns
df[['names','year']]

Unnamed: 0,names,year
one,sunyoung,2018
two,sunyoung,2017
three,sunyoung,2016
four,jungjae,2015
five,jungjae,2018


In [15]:
# Making new column
df['penalty'] = 0.2

In [16]:
df

Unnamed: 0,year,names,points,penalty
one,2018,sunyoung,2.9,0.2
two,2017,sunyoung,3.7,0.2
three,2016,sunyoung,4.6,0.2
four,2015,jungjae,5.4,0.2
five,2018,jungjae,6.9,0.2


In [17]:
# Making new column by list
df['penalty'] = [0.1,0.2,0.3,0.4,0.5]

In [18]:
df

Unnamed: 0,year,names,points,penalty
one,2018,sunyoung,2.9,0.1
two,2017,sunyoung,3.7,0.2
three,2016,sunyoung,4.6,0.3
four,2015,jungjae,5.4,0.4
five,2018,jungjae,6.9,0.5


In [19]:
df['rankl'] = np.arange(5)

In [21]:
df

Unnamed: 0,year,names,points,penalty,rankl
one,2018,sunyoung,2.9,0.1,0
two,2017,sunyoung,3.7,0.2,1
three,2016,sunyoung,4.6,0.3,2
four,2015,jungjae,5.4,0.4,3
five,2018,jungjae,6.9,0.5,4


In [26]:
df.rankl

one      0
two      1
three    2
four     3
five     4
Name: rankl, dtype: int32

In [31]:
# Erroneous way to delete column
del df.rankl
# del df['year','points']
# del df[1:3]

AttributeError: rankl

In [29]:
# Deleting by using del keyword
del df['rankl']

In [30]:
df

Unnamed: 0,year,names,points,penalty
one,2018,sunyoung,2.9,0.1
two,2017,sunyoung,3.7,0.2
three,2016,sunyoung,4.6,0.3
four,2015,jungjae,5.4,0.4
five,2018,jungjae,6.9,0.5


In [32]:
# Inserting new data
val = pd.Series([-1.2,-1.3,-1.4], index=['two','four','five'])
df['debt'] = val

In [33]:
df #values not exisiting filled with NaN

Unnamed: 0,year,names,points,penalty,debt
one,2018,sunyoung,2.9,0.1,
two,2017,sunyoung,3.7,0.2,-1.2
three,2016,sunyoung,4.6,0.3,
four,2015,jungjae,5.4,0.4,-1.3
five,2018,jungjae,6.9,0.5,-1.4


In [34]:
# Making new columns with existing column
df['net_point'] = df['points'] - df['penalty']

In [35]:
df

Unnamed: 0,year,names,points,penalty,debt,net_point
one,2018,sunyoung,2.9,0.1,,2.8
two,2017,sunyoung,3.7,0.2,-1.2,3.5
three,2016,sunyoung,4.6,0.3,,4.3
four,2015,jungjae,5.4,0.4,-1.3,5.0
five,2018,jungjae,6.9,0.5,-1.4,6.4


In [37]:
df['high_point'] = df['net_point'] > 3.0

In [38]:
df

Unnamed: 0,year,names,points,penalty,debt,net_point,high_point
one,2018,sunyoung,2.9,0.1,,2.8,False
two,2017,sunyoung,3.7,0.2,-1.2,3.5,True
three,2016,sunyoung,4.6,0.3,,4.3,True
four,2015,jungjae,5.4,0.4,-1.3,5.0,True
five,2018,jungjae,6.9,0.5,-1.4,6.4,True


In [39]:
del df['high_point']
del df['net_point']

In [40]:
df

Unnamed: 0,year,names,points,penalty,debt
one,2018,sunyoung,2.9,0.1,
two,2017,sunyoung,3.7,0.2,-1.2
three,2016,sunyoung,4.6,0.3,
four,2015,jungjae,5.4,0.4,-1.3
five,2018,jungjae,6.9,0.5,-1.4


In [41]:
df.columns

Index(['year', 'names', 'points', 'penalty', 'debt'], dtype='object')

In [44]:
df.index.name="Order"

In [45]:
df.columns.name = "Info"

In [46]:
df

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2018,sunyoung,2.9,0.1,
two,2017,sunyoung,3.7,0.2,-1.2
three,2016,sunyoung,4.6,0.3,
four,2015,jungjae,5.4,0.4,-1.3
five,2018,jungjae,6.9,0.5,-1.4


In [47]:
# Erronous way to indexing
df[0]

KeyError: 0

In [48]:
# But indexing multiple rows is available using colon (:)... thus not recommended
df[0:3]

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2018,sunyoung,2.9,0.1,
two,2017,sunyoung,3.7,0.2,-1.2
three,2016,sunyoung,4.6,0.3,


In [50]:
# Indexing using loc, iloc (for integer indexing)
df.loc['two']

Info
year           2017
names      sunyoung
points          3.7
penalty         0.2
debt           -1.2
Name: two, dtype: object

In [51]:
df.loc['two':'four']

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
two,2017,sunyoung,3.7,0.2,-1.2
three,2016,sunyoung,4.6,0.3,
four,2015,jungjae,5.4,0.4,-1.3


In [52]:
df.loc['two':'four', 'points']

Order
two      3.7
three    4.6
four     5.4
Name: points, dtype: float64

In [53]:
df.loc[:,'year']

Order
one      2018
two      2017
three    2016
four     2015
five     2018
Name: year, dtype: int64

In [54]:
df.loc[:,['year','names']]

Info,year,names
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
one,2018,sunyoung
two,2017,sunyoung
three,2016,sunyoung
four,2015,jungjae
five,2018,jungjae


In [55]:
# Indexing by its name includes the specified 'end' (not like number indexing)
df.loc['two':'four',['year','names']]

Info,year,names
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
two,2017,sunyoung
three,2016,sunyoung
four,2015,jungjae


In [56]:
df.loc['two':'four','year':'names']

Info,year,names
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
two,2017,sunyoung
three,2016,sunyoung
four,2015,jungjae


In [58]:
df.loc['six',:] = [2019,'psy',9.9, 10, 9.8]

In [59]:
df

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2018.0,sunyoung,2.9,0.1,
two,2017.0,sunyoung,3.7,0.2,-1.2
three,2016.0,sunyoung,4.6,0.3,
four,2015.0,jungjae,5.4,0.4,-1.3
five,2018.0,jungjae,6.9,0.5,-1.4
six,2019.0,psy,9.9,10.0,9.8


In [60]:
# Indexing with iloc
df.iloc[3]

Info
year          2015
names      jungjae
points         5.4
penalty        0.4
debt          -1.3
Name: four, dtype: object

In [61]:
# Indexing with integer does not include last index
df.iloc[3:5]

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
four,2015.0,jungjae,5.4,0.4,-1.3
five,2018.0,jungjae,6.9,0.5,-1.4


In [62]:
df.iloc[3:5,0:2]

Info,year,names
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
four,2015.0,jungjae
five,2018.0,jungjae


In [63]:
df.iloc[[1,3,5],[1,2]]

Info,names,points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
two,sunyoung,3.7
four,jungjae,5.4
six,psy,9.9


In [64]:
df.iloc[:,1:4]

Info,names,points,penalty
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,sunyoung,2.9,0.1
two,sunyoung,3.7,0.2
three,sunyoung,4.6,0.3
four,jungjae,5.4,0.4
five,jungjae,6.9,0.5
six,psy,9.9,10.0


In [65]:
df.iloc[1,1]

'sunyoung'

In [66]:
df['year'] > 2017

Order
one       True
two      False
three    False
four     False
five      True
six       True
Name: year, dtype: bool

In [67]:
# Indexing with boolean
df.loc[df['year'] > 2017, :]

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2018.0,sunyoung,2.9,0.1,
five,2018.0,jungjae,6.9,0.5,-1.4
six,2019.0,psy,9.9,10.0,9.8


In [70]:
print(df)
df.loc[df['names']=='sunyoung', ['names','points']]

Info     year     names  points  penalty  debt
Order                                         
one    2018.0  sunyoung     2.9      0.1   NaN
two    2017.0  sunyoung     3.7      0.2  -1.2
three  2016.0  sunyoung     4.6      0.3   NaN
four   2015.0   jungjae     5.4      0.4  -1.3
five   2018.0   jungjae     6.9      0.5  -1.4
six    2019.0       psy     9.9     10.0   9.8


Info,names,points
Order,Unnamed: 1_level_1,Unnamed: 2_level_1
one,sunyoung,2.9
two,sunyoung,3.7
three,sunyoung,4.6


In [72]:
df.loc[(df['points'] > 2) & (df['points'] <3),:]

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2018.0,sunyoung,2.9,0.1,


In [73]:
df.loc[df['points']>5, 'penalty'] = 0

In [74]:
df

Info,year,names,points,penalty,debt
Order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
one,2018.0,sunyoung,2.9,0.1,
two,2017.0,sunyoung,3.7,0.2,-1.2
three,2016.0,sunyoung,4.6,0.3,
four,2015.0,jungjae,5.4,0.0,-1.3
five,2018.0,jungjae,6.9,0.0,-1.4
six,2019.0,psy,9.9,0.0,9.8


## Another Pandas practice

In [75]:
df = pd.DataFrame(np.random.randn(5,4))
df.columns = ["A", "B", "C", "D"]

In [77]:
df

Unnamed: 0,A,B,C,D
0,-0.349752,0.235295,-0.127951,-0.719857
1,-1.460104,0.874766,-0.108095,-0.829834
2,0.666836,0.00719,0.260138,-1.361385
3,0.007669,0.584588,-0.161515,0.857565
4,0.00486,-0.360874,3.030764,-0.621454


In [78]:
df.index

RangeIndex(start=0, stop=5, step=1)

In [80]:
df.index = pd.date_range('20180214',periods=5)

In [81]:
df

Unnamed: 0,A,B,C,D
2018-02-14,-0.349752,0.235295,-0.127951,-0.719857
2018-02-15,-1.460104,0.874766,-0.108095,-0.829834
2018-02-16,0.666836,0.00719,0.260138,-1.361385
2018-02-17,0.007669,0.584588,-0.161515,0.857565
2018-02-18,0.00486,-0.360874,3.030764,-0.621454


In [82]:
df.index

DatetimeIndex(['2018-02-14', '2018-02-15', '2018-02-16', '2018-02-17',
               '2018-02-18'],
              dtype='datetime64[ns]', freq='D')

In [83]:
df["E"] = [10,np.nan, 9.9, 9.8, np.nan]

In [84]:
df

Unnamed: 0,A,B,C,D,E
2018-02-14,-0.349752,0.235295,-0.127951,-0.719857,10.0
2018-02-15,-1.460104,0.874766,-0.108095,-0.829834,
2018-02-16,0.666836,0.00719,0.260138,-1.361385,9.9
2018-02-17,0.007669,0.584588,-0.161515,0.857565,9.8
2018-02-18,0.00486,-0.360874,3.030764,-0.621454,


In [85]:
df.dropna(how='any') # if there is na in any colum, delete row
# df.dropna(how='any', inplace=True) # apply (commit) to data immediately

Unnamed: 0,A,B,C,D,E
2018-02-14,-0.349752,0.235295,-0.127951,-0.719857,10.0
2018-02-16,0.666836,0.00719,0.260138,-1.361385,9.9
2018-02-17,0.007669,0.584588,-0.161515,0.857565,9.8


In [86]:
df

Unnamed: 0,A,B,C,D,E
2018-02-14,-0.349752,0.235295,-0.127951,-0.719857,10.0
2018-02-15,-1.460104,0.874766,-0.108095,-0.829834,
2018-02-16,0.666836,0.00719,0.260138,-1.361385,9.9
2018-02-17,0.007669,0.584588,-0.161515,0.857565,9.8
2018-02-18,0.00486,-0.360874,3.030764,-0.621454,


In [87]:
df.dropna(how='all') # if there is na in all column, delete row

Unnamed: 0,A,B,C,D,E
2018-02-14,-0.349752,0.235295,-0.127951,-0.719857,10.0
2018-02-15,-1.460104,0.874766,-0.108095,-0.829834,
2018-02-16,0.666836,0.00719,0.260138,-1.361385,9.9
2018-02-17,0.007669,0.584588,-0.161515,0.857565,9.8
2018-02-18,0.00486,-0.360874,3.030764,-0.621454,


In [88]:
df.fillna(value=9.9) # Replace NaN to specified 'value'

Unnamed: 0,A,B,C,D,E
2018-02-14,-0.349752,0.235295,-0.127951,-0.719857,10.0
2018-02-15,-1.460104,0.874766,-0.108095,-0.829834,9.9
2018-02-16,0.666836,0.00719,0.260138,-1.361385,9.9
2018-02-17,0.007669,0.584588,-0.161515,0.857565,9.8
2018-02-18,0.00486,-0.360874,3.030764,-0.621454,9.9


In [89]:
df

Unnamed: 0,A,B,C,D,E
2018-02-14,-0.349752,0.235295,-0.127951,-0.719857,10.0
2018-02-15,-1.460104,0.874766,-0.108095,-0.829834,
2018-02-16,0.666836,0.00719,0.260138,-1.361385,9.9
2018-02-17,0.007669,0.584588,-0.161515,0.857565,9.8
2018-02-18,0.00486,-0.360874,3.030764,-0.621454,


In [90]:
df.fillna(value = 9.9, inplace = True)

In [91]:
df

Unnamed: 0,A,B,C,D,E
2018-02-14,-0.349752,0.235295,-0.127951,-0.719857,10.0
2018-02-15,-1.460104,0.874766,-0.108095,-0.829834,9.9
2018-02-16,0.666836,0.00719,0.260138,-1.361385,9.9
2018-02-17,0.007669,0.584588,-0.161515,0.857565,9.8
2018-02-18,0.00486,-0.360874,3.030764,-0.621454,9.9


In [92]:
df.isnull() # Checking whether there is any NaN

Unnamed: 0,A,B,C,D,E
2018-02-14,False,False,False,False,False
2018-02-15,False,False,False,False,False
2018-02-16,False,False,False,False,False
2018-02-17,False,False,False,False,False
2018-02-18,False,False,False,False,False


In [98]:
pd.to_datetime('20180214235959')

pandas._libs.tslib.Timestamp

In [95]:
df.index

DatetimeIndex(['2018-02-14', '2018-02-15', '2018-02-16', '2018-02-17',
               '2018-02-18'],
              dtype='datetime64[ns]', freq='D')

In [99]:
df.drop(pd.to_datetime('20180215'))

Unnamed: 0,A,B,C,D,E
2018-02-14,-0.349752,0.235295,-0.127951,-0.719857,10.0
2018-02-16,0.666836,0.00719,0.260138,-1.361385,9.9
2018-02-17,0.007669,0.584588,-0.161515,0.857565,9.8
2018-02-18,0.00486,-0.360874,3.030764,-0.621454,9.9


In [100]:
df.drop([pd.to_datetime('20180215'), pd.to_datetime('20180217')])

Unnamed: 0,A,B,C,D,E
2018-02-14,-0.349752,0.235295,-0.127951,-0.719857,10.0
2018-02-16,0.666836,0.00719,0.260138,-1.361385,9.9
2018-02-18,0.00486,-0.360874,3.030764,-0.621454,9.9


In [101]:
# Does not allow deleting index range
df.drop(pd.to_datetime('20180215'):pd.to_datetime('20180217'))

SyntaxError: invalid syntax (<ipython-input-101-fa9bf4bdde8f>, line 1)

In [102]:
df.drop('E',axis=1)

Unnamed: 0,A,B,C,D
2018-02-14,-0.349752,0.235295,-0.127951,-0.719857
2018-02-15,-1.460104,0.874766,-0.108095,-0.829834
2018-02-16,0.666836,0.00719,0.260138,-1.361385
2018-02-17,0.007669,0.584588,-0.161515,0.857565
2018-02-18,0.00486,-0.360874,3.030764,-0.621454


In [104]:
df.drop(['B','D'], axis=1)

Unnamed: 0,A,C,E
2018-02-14,-0.349752,-0.127951,10.0
2018-02-15,-1.460104,-0.108095,9.9
2018-02-16,0.666836,0.260138,9.9
2018-02-17,0.007669,-0.161515,9.8
2018-02-18,0.00486,3.030764,9.9


In [106]:
# Erronous way to use drop()
df.drop('B':'D', axis=1)
df.drop(1, axis =1)

ValueError: labels [1] not contained in axis

In [123]:
# Possible, but can output unexpected
print(df.drop(df.iloc[:,1:3], axis = 1))
print('\n')
print(df.drop(df.iloc[1:3], axis = 1))

                   A         D     E
2018-02-14 -0.349752 -0.719857  10.0
2018-02-15 -1.460104 -0.829834   9.9
2018-02-16  0.666836 -1.361385   9.9
2018-02-17  0.007669  0.857565   9.8
2018-02-18  0.004860 -0.621454   9.9


Empty DataFrame
Columns: []
Index: [2018-02-14 00:00:00, 2018-02-15 00:00:00, 2018-02-16 00:00:00, 2018-02-17 00:00:00, 2018-02-18 00:00:00]


In [113]:
df.iloc[1:3]

Unnamed: 0,A,B,C,D,E
2018-02-15,-1.460104,0.874766,-0.108095,-0.829834,9.9
2018-02-16,0.666836,0.00719,0.260138,-1.361385,9.9


In [114]:
df.drop(df.index[[2,4]])

Unnamed: 0,A,B,C,D,E
2018-02-14,-0.349752,0.235295,-0.127951,-0.719857,10.0
2018-02-15,-1.460104,0.874766,-0.108095,-0.829834,9.9
2018-02-17,0.007669,0.584588,-0.161515,0.857565,9.8


In [119]:
df.drop(df.index[2:4])

Unnamed: 0,A,B,C,D,E
2018-02-14,-0.349752,0.235295,-0.127951,-0.719857,10.0
2018-02-15,-1.460104,0.874766,-0.108095,-0.829834,9.9
2018-02-18,0.00486,-0.360874,3.030764,-0.621454,9.9


## Pandas practice for statistical method

In [3]:
data = [[9.9, 8.8], [np.nan, 6.6],[7.7, np.nan],  [0.99, 9.8]]
df = pd.DataFrame(data, columns=["first", "second"], index=["a", "b", "c", "d"])

In [4]:
df

Unnamed: 0,first,second
a,9.9,8.8
b,,6.6
c,7.7,
d,0.99,9.8


In [5]:
df.sum(axis=0)

first     18.59
second    25.20
dtype: float64

In [6]:
df.sum(axis=1)

a    18.70
b     6.60
c     7.70
d    10.79
dtype: float64

In [7]:
df['first'].sum()

18.59

In [9]:
df.loc['a'].sum()

18.700000000000003

In [10]:
df.mean?

In [11]:
df.mean(axis=1, skipna=False)

a    9.350
b      NaN
c      NaN
d    5.395
dtype: float64

In [12]:
df

Unnamed: 0,first,second
a,9.9,8.8
b,,6.6
c,7.7,
d,0.99,9.8


In [13]:
first_mean = df.mean(axis=0)['first']

In [14]:
second_min = df.min(axis=0)['second']

In [15]:
print(first_mean)
print(second_min)

6.19666666667
6.6


In [16]:
df['first'] = df['first'].fillna(value=first_mean)

In [17]:
df

Unnamed: 0,first,second
a,9.9,8.8
b,6.196667,6.6
c,7.7,
d,0.99,9.8


In [19]:
df['second'] = df['second'].fillna(value=second_min)

In [20]:
df

Unnamed: 0,first,second
a,9.9,8.8
b,6.196667,6.6
c,7.7,6.6
d,0.99,9.8


In [34]:
# Using date_range as index
df2 = pd.DataFrame(np.random.randn(6, 4),
                   columns=["A", "B", "C", "D"],
                   index=pd.date_range("20180220", periods=6))

In [22]:
df2

Unnamed: 0,A,B,C,D
2018-02-20,-0.168376,-0.463294,-0.63766,0.159245
2018-02-21,-0.173026,1.423109,-0.913551,-0.958292
2018-02-22,-0.538118,0.159989,0.137701,0.168037
2018-02-23,-1.350316,-1.978794,-0.836944,0.265519
2018-02-24,0.649279,1.075311,1.187389,0.741568
2018-02-25,0.286663,3.001361,-0.409323,-0.098799


In [23]:
df2['A'].corr(df2['B'])

0.78907491735310586

In [25]:
df2['A'].corr(df2['C'])

0.60081782656755867

In [26]:
df2['A'].corr(df2['D'])

0.10402766434905919

In [27]:
df2.corr()

Unnamed: 0,A,B,C,D
A,1.0,0.789075,0.600818,0.104028
B,0.789075,1.0,0.221778,-0.328582
C,0.600818,0.221778,1.0,0.695982
D,0.104028,-0.328582,0.695982,1.0


In [29]:
df2['A'].cov(df2['B'])

0.93402465207523455

In [30]:
df2.cov()

Unnamed: 0,A,B,C,D
A,0.479185,0.934025,0.331513,0.040599
B,0.934025,2.923997,0.302282,-0.316773
C,0.331513,0.302282,0.635348,0.312767
D,0.040599,-0.316773,0.312767,0.317858


## Sort practice

In [35]:
df2

Unnamed: 0,A,B,C,D
2018-02-20,-0.211988,-1.522578,0.52047,-1.061424
2018-02-21,0.469553,0.245697,0.5772,1.031103
2018-02-22,-0.770935,-0.536438,-0.766135,-0.424059
2018-02-23,-0.889999,1.378636,0.622979,0.838524
2018-02-24,-0.221713,-1.080251,0.748889,2.244788
2018-02-25,1.385897,-1.177887,-1.027915,-0.576146


In [36]:
dates = df2.index
# permutation 은 치환
random_dates = np.random.permutation(dates)
# 무작위로 섞어봄. index 순서와 컬럼의 순서가 불규칙하게 변함
df2 = df2.reindex(index=random_dates, columns=["D", "B", "C", "A"])

In [37]:
df2

Unnamed: 0,D,B,C,A
2018-02-25,-0.576146,-1.177887,-1.027915,1.385897
2018-02-22,-0.424059,-0.536438,-0.766135,-0.770935
2018-02-24,2.244788,-1.080251,0.748889,-0.221713
2018-02-20,-1.061424,-1.522578,0.52047,-0.211988
2018-02-23,0.838524,1.378636,0.622979,-0.889999
2018-02-21,1.031103,0.245697,0.5772,0.469553


In [39]:
df2.sort_index(axis = 0)

Unnamed: 0,D,B,C,A
2018-02-20,-1.061424,-1.522578,0.52047,-0.211988
2018-02-21,1.031103,0.245697,0.5772,0.469553
2018-02-22,-0.424059,-0.536438,-0.766135,-0.770935
2018-02-23,0.838524,1.378636,0.622979,-0.889999
2018-02-24,2.244788,-1.080251,0.748889,-0.221713
2018-02-25,-0.576146,-1.177887,-1.027915,1.385897


In [40]:
df2.sort_index(axis=1)

Unnamed: 0,A,B,C,D
2018-02-25,1.385897,-1.177887,-1.027915,-0.576146
2018-02-22,-0.770935,-0.536438,-0.766135,-0.424059
2018-02-24,-0.221713,-1.080251,0.748889,2.244788
2018-02-20,-0.211988,-1.522578,0.52047,-1.061424
2018-02-23,-0.889999,1.378636,0.622979,0.838524
2018-02-21,0.469553,0.245697,0.5772,1.031103


In [41]:
df2.sort_index(axis=0).sort_index(axis=1)

Unnamed: 0,A,B,C,D
2018-02-20,-0.211988,-1.522578,0.52047,-1.061424
2018-02-21,0.469553,0.245697,0.5772,1.031103
2018-02-22,-0.770935,-0.536438,-0.766135,-0.424059
2018-02-23,-0.889999,1.378636,0.622979,0.838524
2018-02-24,-0.221713,-1.080251,0.748889,2.244788
2018-02-25,1.385897,-1.177887,-1.027915,-0.576146


In [42]:
df2.sort_index?

In [44]:
df2.sort_index(axis=0,ascending=False)

Unnamed: 0,D,B,C,A
2018-02-25,-0.576146,-1.177887,-1.027915,1.385897
2018-02-24,2.244788,-1.080251,0.748889,-0.221713
2018-02-23,0.838524,1.378636,0.622979,-0.889999
2018-02-22,-0.424059,-0.536438,-0.766135,-0.770935
2018-02-21,1.031103,0.245697,0.5772,0.469553
2018-02-20,-1.061424,-1.522578,0.52047,-0.211988


In [45]:
df2.sort_index(axis=1,ascending=False)

Unnamed: 0,D,C,B,A
2018-02-25,-0.576146,-1.027915,-1.177887,1.385897
2018-02-22,-0.424059,-0.766135,-0.536438,-0.770935
2018-02-24,2.244788,0.748889,-1.080251,-0.221713
2018-02-20,-1.061424,0.52047,-1.522578,-0.211988
2018-02-23,0.838524,0.622979,1.378636,-0.889999
2018-02-21,1.031103,0.5772,0.245697,0.469553


In [46]:
df2.sort_values(by='D')

Unnamed: 0,D,B,C,A
2018-02-20,-1.061424,-1.522578,0.52047,-0.211988
2018-02-25,-0.576146,-1.177887,-1.027915,1.385897
2018-02-22,-0.424059,-0.536438,-0.766135,-0.770935
2018-02-23,0.838524,1.378636,0.622979,-0.889999
2018-02-21,1.031103,0.245697,0.5772,0.469553
2018-02-24,2.244788,-1.080251,0.748889,-0.221713


In [47]:
df2.sort_values(by='D',ascending=False)

Unnamed: 0,D,B,C,A
2018-02-24,2.244788,-1.080251,0.748889,-0.221713
2018-02-21,1.031103,0.245697,0.5772,0.469553
2018-02-23,0.838524,1.378636,0.622979,-0.889999
2018-02-22,-0.424059,-0.536438,-0.766135,-0.770935
2018-02-25,-0.576146,-1.177887,-1.027915,1.385897
2018-02-20,-1.061424,-1.522578,0.52047,-0.211988


In [None]:
# 열 추가
df2["E"] = np.random.randint(0, 6, size=6)
df2["F"] = ["first", "second", "first", "third", "first", "second"]

df3 = pd.DataFrame(np.random.randn(4, 3), columns=["b", "d", "e"],
                   index=["Seoul", "Incheon", "Busan", "Daegu"])

df4 = pd.DataFrame(np.random.rand(4, 3), columns=["b", "d", "e"],
                   index=["Seoul", "Incheon", "Busan", "Daegu"])

