In [1]:
import pandas as pd
import numpy as np

## Setting

In [2]:
# Six consecutive calendar days starting 2013-01-01; used as the row index below.
dates = pd.date_range(start="20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [3]:
# Seed NumPy's global random state so the sampled values are the same on every
# fresh run of the notebook (without this the frame changes on each execution).
np.random.seed(0)
# 6x4 frame of standard-normal samples: one row per entry in `dates`,
# four named columns.
dataframe = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=["First", "Second", "Third", "Fourth"],
)
dataframe

Unnamed: 0,First,Second,Third,Fourth
2013-01-01,-0.70422,-1.144567,0.149726,1.647093
2013-01-02,-1.196463,-0.617337,0.182617,0.227575
2013-01-03,2.260746,-1.718749,2.137634,0.434502
2013-01-04,-0.675446,1.683793,1.028951,-1.03016
2013-01-05,0.913936,-0.301017,-0.826613,1.05408
2013-01-06,0.303504,0.099094,-1.340719,-1.458552


In [10]:
# When a Series is set as a new column, pandas aligns it to the frame by index.
# Built without an explicit index, the Series gets the default positions 0..5.
s1 = pd.Series(list(range(1, 7)))
s1

0    1
1    2
2    3
3    4
4    5
5    6
dtype: int64

In [14]:
# Rebuild s1 on a daily index that starts on 2013-01-02 and runs for six days.
s1_index = pd.date_range("20130102", periods=6)
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=s1_index)

In [17]:
# Display s1 to confirm it now carries the date index (2013-01-02 onwards).
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [18]:
# Add s1 as a new column. Values are matched to rows by index label, so dates
# present in the frame but not in s1 get NaN, and s1 labels that are not in
# the frame's index are not added.
dataframe["Fifth"] = s1
dataframe

Unnamed: 0,First,Second,Third,Fourth,Fifth
2013-01-01,0.0,0.0,0.149726,1.647093,
2013-01-02,-1.196463,-0.617337,0.182617,0.227575,1.0
2013-01-03,2.260746,-1.718749,2.137634,0.434502,2.0
2013-01-04,-0.675446,1.683793,1.028951,-1.03016,3.0
2013-01-05,0.913936,-0.301017,-0.826613,1.05408,4.0
2013-01-06,0.303504,0.099094,-1.340719,-1.458552,5.0


In [19]:
# Set a single cell by label: row label is the first date, column is "First".
first_date = dates[0]
dataframe.at[first_date, "First"] = 0

In [20]:
# Set a single cell by integer position: first row, second column.
row_pos, col_pos = 0, 1
dataframe.iat[row_pos, col_pos] = 0

In [21]:
# Overwrite the whole "Fourth" column with a NumPy array of 5s,
# one entry per row of the frame.
fives = np.full(len(dataframe), 5)
dataframe.loc[:, "Fourth"] = fives

In [22]:
# Display the frame after the label, position and array assignments above.
dataframe

Unnamed: 0,First,Second,Third,Fourth,Fifth
2013-01-01,0.0,0.0,0.149726,5,
2013-01-02,-1.196463,-0.617337,0.182617,5,1.0
2013-01-03,2.260746,-1.718749,2.137634,5,2.0
2013-01-04,-0.675446,1.683793,1.028951,5,3.0
2013-01-05,0.913936,-0.301017,-0.826613,5,4.0
2013-01-06,0.303504,0.099094,-1.340719,5,5.0


In [31]:
# Work on an independent deep copy so the "where"-style assignment that follows
# does not modify `dataframe`.
dataframe2 = dataframe.copy(deep=True)

In [33]:
# Wherever a value is positive, overwrite it with its negation;
# cells that fail the comparison are left as they are.
is_positive = dataframe2 > 0
dataframe2[is_positive] = -dataframe2
dataframe2

Unnamed: 0,First,Second,Third,Fourth,Fifth
1,0.0,0.0,-0.142361,-5,-2
2,-0.632863,-0.645186,-1.670928,-5,-3
3,-1.121981,-1.741132,-0.931055,-5,-4
4,-0.894768,-1.359805,-0.405814,-5,-5
5,-0.09658,-1.957094,-0.621693,-5,-6
6,-0.055922,-0.797635,-0.299835,-5,-7


## Missing Data

In [25]:
# reindex lets you change, add or delete labels along a chosen axis and
# returns a copy of the data. Here: keep the first four dates and append a
# new column "E".
columns_with_e = [*dataframe.columns, "E"]
dataframe1 = dataframe.reindex(index=dates[:4], columns=columns_with_e)
# Fill "E" for the first two dates only; the remaining rows stay missing.
dataframe1.loc[dates[0]:dates[1], "E"] = 1
dataframe1

Unnamed: 0,First,Second,Third,Fourth,Fifth,E
2013-01-01,0.0,0.0,0.149726,5,,1.0
2013-01-02,-1.196463,-0.617337,0.182617,5,1.0,1.0
2013-01-03,2.260746,-1.718749,2.137634,5,2.0,
2013-01-04,-0.675446,1.683793,1.028951,5,3.0,


In [28]:
# Drop every row that contains at least one missing value
# (returns a new frame; dataframe1 is not reassigned here).
dataframe1.dropna(axis=0, how="any")

Unnamed: 0,First,Second,Third,Fourth,Fifth,E
2013-01-02,-1.196463,-0.617337,0.182617,5,1.0,1.0


In [29]:
# Replace missing values with 5 (returns a new frame; dataframe1 is not reassigned here).
dataframe1.fillna(5)

Unnamed: 0,First,Second,Third,Fourth,Fifth,E
2013-01-01,0.0,0.0,0.149726,5,5.0,1.0
2013-01-02,-1.196463,-0.617337,0.182617,5,1.0,1.0
2013-01-03,2.260746,-1.718749,2.137634,5,2.0,5.0
2013-01-04,-0.675446,1.683793,1.028951,5,3.0,5.0


In [30]:
# Boolean mask: True wherever a value in the frame is NaN.
dataframe.isna()

Unnamed: 0,First,Second,Third,Fourth,Fifth
2013-01-01,False,False,False,False,True
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,False
2013-01-04,False,False,False,False,False
2013-01-05,False,False,False,False,False
2013-01-06,False,False,False,False,False


## Operations

In [32]:
# Descriptive statistic: the mean of each column (reduction down the rows).
dataframe.mean(axis=0)

First     0.267713
Second   -0.142369
Third     0.221933
Fourth    5.000000
Fifth     3.000000
dtype: float64

In [34]:
# The same statistic along the other axis: one mean per row.
dataframe.mean(axis=1)

2013-01-01    1.287431
2013-01-02    0.873763
2013-01-03    1.935926
2013-01-04    2.007460
2013-01-05    1.757261
2013-01-06    1.812376
Freq: D, dtype: float64

In [36]:
# Per-column summary: count, mean, std, min, quartiles and max.
dataframe.describe()

Unnamed: 0,First,Second,Third,Fourth,Fifth
count,6.0,6.0,6.0,6.0,5.0
mean,0.267713,-0.142369,0.221933,5.0,3.0
std,1.224426,1.10898,1.254534,0.0,1.581139
min,-1.196463,-1.718749,-1.340719,5.0,1.0
25%,-0.506584,-0.538257,-0.582528,5.0,2.0
50%,0.151752,-0.150508,0.166171,5.0,3.0
75%,0.761328,0.074321,0.817367,5.0,4.0
max,2.260746,1.683793,2.137634,5.0,5.0


In [37]:
# Apply
# Run a function over the data: here a running (cumulative) sum,
# computed column by column.
dataframe.apply(np.cumsum, axis=0)

Unnamed: 0,First,Second,Third,Fourth,Fifth
2013-01-01,0.0,0.0,0.149726,5,
2013-01-02,-1.196463,-0.617337,0.332342,10,1.0
2013-01-03,1.064283,-2.336086,2.469976,15,3.0
2013-01-04,0.388837,-0.652292,3.498927,20,6.0
2013-01-05,1.302773,-0.953309,2.672314,25,10.0
2013-01-06,1.606277,-0.854215,1.331595,30,15.0


In [40]:
# Range of each column: largest value minus smallest value.
dataframe.apply(lambda column: column.max() - column.min())

First     3.457209
Second    3.402542
Third     3.478353
Fourth    0.000000
Fifth     4.000000
dtype: float64

In [42]:
# Histogramming
# Ten random integers drawn from 0..6 (the upper bound 7 is exclusive).
s = pd.Series(np.random.randint(low=0, high=7, size=10))
s

0    3
1    1
2    6
3    2
4    3
5    4
6    2
7    0
8    5
9    5
dtype: int32

In [43]:
# Count how many times each distinct value occurs in s.
s.value_counts()

5    2
3    2
2    2
6    1
4    1
1    1
0    1
dtype: int64

In [44]:
# String Methods

In [46]:
# Mixed-case sample strings with one missing entry, to exercise the .str accessor.
words = ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"]
s = pd.Series(words)
# Lower-case every string; the missing entry stays NaN.
s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

In [47]:
# Upper-case every string in s; the missing entry stays NaN.
s.str.upper()

0       A
1       B
2       C
3    AABA
4    BACA
5     NaN
6    CABA
7     DOG
8     CAT
dtype: object