In [1]:
import numpy as np
import pandas as pd

In [2]:
# One-dimensional labelled array with a default integer index; the NaN
# entry makes pandas store the values as float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
# Six consecutive calendar days beginning 2013-01-01, used below as a row index.
dates = pd.date_range(start='20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# Build a 6x4 frame of standard-normal draws, indexed by `dates`, with columns A-D.
# A seeded Generator makes the values identical on every Restart & Run All;
# the previous unseeded np.random.randn call produced new numbers on each run.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.33471,0.182589,-1.040492,-0.174351
2013-01-02,0.053463,-3.42812,1.245824,-0.814897
2013-01-03,0.317707,1.413521,-1.81006,-1.178618
2013-01-04,-0.307419,0.839363,-0.723073,-1.404684
2013-01-05,-1.926522,-0.943379,-1.000761,0.141018
2013-01-06,2.165504,-0.712453,-0.375344,0.203907


In [6]:
# Frame built from a mapping of column name -> values. Scalars ("A", "B", "F")
# are repeated for every row; the array-like entries each supply four values.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        D=np.full(4, 3, dtype="int32"),
        E=pd.Categorical(["test", "train"] * 2),
        F="foo",
    )
)

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [7]:
# Show the dtype pandas assigned to each column of df2.
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [8]:
# Row labels of df (the DatetimeIndex it was constructed with).
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [9]:
# Column labels of df.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [11]:
# Display the frame again for reference before converting it.
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.33471,0.182589,-1.040492,-0.174351
2013-01-02,0.053463,-3.42812,1.245824,-0.814897
2013-01-03,0.317707,1.413521,-1.81006,-1.178618
2013-01-04,-0.307419,0.839363,-0.723073,-1.404684
2013-01-05,-1.926522,-0.943379,-1.000761,0.141018
2013-01-06,2.165504,-0.712453,-0.375344,0.203907


In [10]:
# Convert df to a plain NumPy array; the result holds only the values,
# without the date index or the column labels.
df.to_numpy()

array([[-1.33471023,  0.18258865, -1.04049216, -0.17435102],
       [ 0.05346267, -3.42812021,  1.24582354, -0.81489699],
       [ 0.31770662,  1.41352056, -1.81006049, -1.17861815],
       [-0.30741945,  0.83936318, -0.72307275, -1.40468431],
       [-1.92652247, -0.94337852, -1.00076054,  0.14101822],
       [ 2.16550408, -0.71245311, -0.37534404,  0.20390659]])

In [12]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.171996,-0.441413,-0.617318,-0.537938
std,1.428506,1.715599,1.028569,0.690208
min,-1.926522,-3.42812,-1.81006,-1.404684
25%,-1.077888,-0.885647,-1.030559,-1.087688
50%,-0.126978,-0.264932,-0.861917,-0.494624
75%,0.251646,0.67517,-0.462276,0.062176
max,2.165504,1.413521,1.245824,0.203907


In [13]:
# Swap rows and columns: the dates become column headers, A-D become the rows.
df.transpose()

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-1.33471,0.053463,0.317707,-0.307419,-1.926522,2.165504
B,0.182589,-3.42812,1.413521,0.839363,-0.943379,-0.712453
C,-1.040492,1.245824,-1.81006,-0.723073,-1.000761,-0.375344
D,-0.174351,-0.814897,-1.178618,-1.404684,0.141018,0.203907


In [14]:
# Sort along the column axis by label, in descending order (D, C, B, A).
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,-0.174351,-1.040492,0.182589,-1.33471
2013-01-02,-0.814897,1.245824,-3.42812,0.053463
2013-01-03,-1.178618,-1.81006,1.413521,0.317707
2013-01-04,-1.404684,-0.723073,0.839363,-0.307419
2013-01-05,0.141018,-1.000761,-0.943379,-1.926522
2013-01-06,0.203907,-0.375344,-0.712453,2.165504


In [15]:
# Sort along the row axis by index label, newest date first.
df.sort_index(axis="index", ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,2.165504,-0.712453,-0.375344,0.203907
2013-01-05,-1.926522,-0.943379,-1.000761,0.141018
2013-01-04,-0.307419,0.839363,-0.723073,-1.404684
2013-01-03,0.317707,1.413521,-1.81006,-1.178618
2013-01-02,0.053463,-3.42812,1.245824,-0.814897
2013-01-01,-1.33471,0.182589,-1.040492,-0.174351


In [16]:
# Sort the rows by the values in column B, smallest first.
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2013-01-02,0.053463,-3.42812,1.245824,-0.814897
2013-01-05,-1.926522,-0.943379,-1.000761,0.141018
2013-01-06,2.165504,-0.712453,-0.375344,0.203907
2013-01-01,-1.33471,0.182589,-1.040492,-0.174351
2013-01-04,-0.307419,0.839363,-0.723073,-1.404684
2013-01-03,0.317707,1.413521,-1.81006,-1.178618


In [17]:
# Sort by column B first, then by column A to order rows that tie on B.
df.sort_values(by=["B", "A"])

Unnamed: 0,A,B,C,D
2013-01-02,0.053463,-3.42812,1.245824,-0.814897
2013-01-05,-1.926522,-0.943379,-1.000761,0.141018
2013-01-06,2.165504,-0.712453,-0.375344,0.203907
2013-01-01,-1.33471,0.182589,-1.040492,-0.174351
2013-01-04,-0.307419,0.839363,-0.723073,-1.404684
2013-01-03,0.317707,1.413521,-1.81006,-1.178618


## Selection

In [19]:
# Display the full frame as the reference for the selections that follow.
df

Unnamed: 0,A,B,C,D
2013-01-01,-1.33471,0.182589,-1.040492,-0.174351
2013-01-02,0.053463,-3.42812,1.245824,-0.814897
2013-01-03,0.317707,1.413521,-1.81006,-1.178618
2013-01-04,-0.307419,0.839363,-0.723073,-1.404684
2013-01-05,-1.926522,-0.943379,-1.000761,0.141018
2013-01-06,2.165504,-0.712453,-0.375344,0.203907


In [18]:
# Passing a list of labels to [] selects those columns and returns a DataFrame.
df[['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-1.33471,0.182589
2013-01-02,0.053463,-3.42812
2013-01-03,0.317707,1.413521
2013-01-04,-0.307419,0.839363
2013-01-05,-1.926522,-0.943379
2013-01-06,2.165504,-0.712453


In [20]:
# Select the row at zero-based position 2 (2013-01-03) as a one-row DataFrame.
# .iloc makes the positional row slice explicit; plain df[2:3] returns the same
# row but is easy to misread, since df[...] with labels selects columns.
df.iloc[2:3]

Unnamed: 0,A,B,C,D
2013-01-03,0.317707,1.413521,-1.81006,-1.178618


In [23]:
# Positional selection: the first three rows and the first two columns.
df.iloc[:3, :2]

Unnamed: 0,A,B
2013-01-01,-1.33471,0.182589
2013-01-02,0.053463,-3.42812
2013-01-03,0.317707,1.413521


In [27]:
# Label-based selection: every row (":"), columns A and B only.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-1.33471,0.182589
2013-01-02,0.053463,-3.42812
2013-01-03,0.317707,1.413521
2013-01-04,-0.307419,0.839363
2013-01-05,-1.926522,-0.943379
2013-01-06,2.165504,-0.712453


In [28]:
# Get the 4th row: .iloc is zero-based, so position 3 is the 2013-01-04 row.
# The result is a Series whose index is the column labels.
df.iloc[3]

A   -0.307419
B    0.839363
C   -0.723073
D   -1.404684
Name: 2013-01-04 00:00:00, dtype: float64

## Data wrangling with the Titanic dataset

In [29]:
import seaborn as sns

# Load the 'titanic' dataset through seaborn's load_dataset helper and
# preview its first five rows.
titanic = sns.load_dataset('titanic')
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [30]:
# Display the whole frame; the notebook abbreviates the middle rows with "...".
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [31]:
# Draw 100 random rows. random_state pins the draw so re-running the notebook
# shows the same sample instead of a different one each time.
titanic.sample(100, random_state=42)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
414,1,3,male,44.0,0,0,7.9250,S,Third,man,True,,Southampton,yes,True
881,0,3,male,33.0,0,0,7.8958,S,Third,man,True,,Southampton,no,True
791,0,2,male,16.0,0,0,26.0000,S,Second,man,True,,Southampton,no,True
660,1,1,male,50.0,2,0,133.6500,S,First,man,True,,Southampton,yes,False
847,0,3,male,35.0,0,0,7.8958,C,Third,man,True,,Cherbourg,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
270,0,1,male,,0,0,31.0000,S,First,man,True,,Southampton,no,True
464,0,3,male,,0,0,8.0500,S,Third,man,True,,Southampton,no,True
24,0,3,female,8.0,3,1,21.0750,S,Third,child,False,,Southampton,no,False
246,0,3,female,25.0,0,0,7.7750,S,Third,woman,False,,Southampton,no,True


In [32]:
# Filtering with a boolean mask: build a True/False Series from the
# comparison, then index the frame with it to keep only the matching rows.
under_five = titanic['age'] < 5
titanic[under_five]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
10,1,3,female,4.0,1,1,16.7,S,Third,child,False,G,Southampton,yes,False
16,0,3,male,2.0,4,1,29.125,Q,Third,child,False,,Queenstown,no,False
43,1,2,female,3.0,1,2,41.5792,C,Second,child,False,,Cherbourg,yes,False
63,0,3,male,4.0,3,2,27.9,S,Third,child,False,,Southampton,no,False
78,1,2,male,0.83,0,2,29.0,S,Second,child,False,,Southampton,yes,False
119,0,3,female,2.0,4,2,31.275,S,Third,child,False,,Southampton,no,False
164,0,3,male,1.0,4,1,39.6875,S,Third,child,False,,Southampton,no,False
171,0,3,male,4.0,4,1,29.125,Q,Third,child,False,,Queenstown,no,False
172,1,3,female,1.0,1,1,11.1333,S,Third,child,False,,Southampton,yes,False


In [33]:
# Keep only passengers whose recorded age is below one year.
titanic[titanic['age'] < 1]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
78,1,2,male,0.83,0,2,29.0,S,Second,child,False,,Southampton,yes,False
305,1,1,male,0.92,1,2,151.55,S,First,child,False,C,Southampton,yes,False
469,1,3,female,0.75,2,1,19.2583,C,Third,child,False,,Cherbourg,yes,False
644,1,3,female,0.75,2,1,19.2583,C,Third,child,False,,Cherbourg,yes,False
755,1,2,male,0.67,1,1,14.5,S,Second,child,False,,Southampton,yes,False
803,1,3,male,0.42,0,1,8.5167,C,Third,child,False,,Cherbourg,yes,False
831,1,2,male,0.83,1,1,18.75,S,Second,child,False,,Southampton,yes,False


In [34]:
# Keep only passengers whose recorded fare is exactly zero.
titanic[titanic['fare'] == 0]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
179,0,3,male,36.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
263,0,1,male,40.0,0,0,0.0,S,First,man,True,B,Southampton,no,True
271,1,3,male,25.0,0,0,0.0,S,Third,man,True,,Southampton,yes,True
277,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
302,0,3,male,19.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
413,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
466,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
481,0,2,male,,0,0,0.0,S,Second,man,True,,Southampton,no,True
597,0,3,male,49.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
633,0,1,male,,0,0,0.0,S,First,man,True,,Southampton,no,True
