Creating a **Series** by passing a list of values

In [6]:
import pandas as pd
import numpy as np

# A Series built from a plain list; the missing value (np.nan) makes the dtype float64.
s = pd.Series([1, 3, 5, np.nan, 6, 8])

In [7]:
# Display the Series with its default integer index (0-5).
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

Creating a **DataFrame** from a NumPy array, with a datetime index and labeled columns

In [8]:
# Six consecutive calendar days starting 2021-01-06; used below as the DataFrame's row index.
dates = pd.date_range(start="20210106", periods=6, freq="D")

In [9]:
# Display the DatetimeIndex: six daily timestamps (freq='D').
dates

DatetimeIndex(['2021-01-06', '2021-01-07', '2021-01-08', '2021-01-09',
               '2021-01-10', '2021-01-11'],
              dtype='datetime64[ns]', freq='D')

In [10]:
# 6x4 frame of standard-normal draws: one row per date, columns A-D.
# No seed is set in this cell, so the values change on every run.
df = pd.DataFrame(
    np.random.randn(6, 4),
    index=dates,
    columns=["A", "B", "C", "D"],
)

In [11]:
# Display the full frame: six dated rows by four columns of random values.
df

Unnamed: 0,A,B,C,D
2021-01-06,0.398769,-1.028387,0.821235,-0.068152
2021-01-07,-1.858927,1.407557,0.646231,0.678341
2021-01-08,-0.797984,-0.788728,2.290362,-0.250332
2021-01-09,0.114675,-2.72941,1.152098,0.992756
2021-01-10,-0.329168,-0.058051,-0.346103,-0.272864
2021-01-11,0.475726,1.232715,0.240063,0.20656


Creating a **DataFrame** by passing a `dict` whose keys become the column names

In [12]:
# Build a DataFrame from a dict: each key becomes a column.
# Scalars ("A", "B", "F") are repeated on every row; the array-likes supply four rows.
df2 = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20210305"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3, 3, 3, 3], dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)

In [13]:
# Display the dict-built frame; the scalar entries appear on every row.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2021-03-05,1.0,3,test,foo
1,1.0,2021-03-05,1.0,3,train,foo
2,1.0,2021-03-05,1.0,3,test,foo
3,1.0,2021-03-05,1.0,3,train,foo


In [14]:
# Per-column dtypes: each column keeps the dtype of the object it was built from.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

### Viewing Data

In [15]:
# First rows of the frame (head() shows five when called without an argument).
df.head()

Unnamed: 0,A,B,C,D
2021-01-06,0.398769,-1.028387,0.821235,-0.068152
2021-01-07,-1.858927,1.407557,0.646231,0.678341
2021-01-08,-0.797984,-0.788728,2.290362,-0.250332
2021-01-09,0.114675,-2.72941,1.152098,0.992756
2021-01-10,-0.329168,-0.058051,-0.346103,-0.272864


In [16]:
# Last three rows of the frame.
df.tail(3)

Unnamed: 0,A,B,C,D
2021-01-09,0.114675,-2.72941,1.152098,0.992756
2021-01-10,-0.329168,-0.058051,-0.346103,-0.272864
2021-01-11,0.475726,1.232715,0.240063,0.20656


In [17]:
# Row labels: the DatetimeIndex that was passed as index=dates.
df.index

DatetimeIndex(['2021-01-06', '2021-01-07', '2021-01-08', '2021-01-09',
               '2021-01-10', '2021-01-11'],
              dtype='datetime64[ns]', freq='D')

In [18]:
# Column labels.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [19]:
# NumPy representation of the DataFrame's values; index and column labels are not included.
df.to_numpy()

array([[ 0.39876932, -1.02838707,  0.821235  , -0.06815218],
       [-1.85892746,  1.40755714,  0.64623121,  0.67834054],
       [-0.79798422, -0.78872767,  2.29036163, -0.25033172],
       [ 0.11467459, -2.72941033,  1.15209758,  0.99275649],
       [-0.32916831, -0.05805118, -0.34610258, -0.27286414],
       [ 0.47572573,  1.23271534,  0.24006322,  0.20656004]])

In [20]:
# Quick statistical summary per column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.332818,-0.327384,0.800648,0.214385
std,0.887087,1.548576,0.894016,0.520457
min,-1.858927,-2.72941,-0.346103,-0.272864
25%,-0.68078,-0.968472,0.341605,-0.204787
50%,-0.107247,-0.423389,0.733733,0.069204
75%,0.327746,0.910024,1.069382,0.560395
max,0.475726,1.407557,2.290362,0.992756


In [21]:
# Transpose the frame: the dates become columns and A-D become the rows.
df.T

Unnamed: 0,2021-01-06,2021-01-07,2021-01-08,2021-01-09,2021-01-10,2021-01-11
A,0.398769,-1.858927,-0.797984,0.114675,-0.329168,0.475726
B,-1.028387,1.407557,-0.788728,-2.72941,-0.058051,1.232715
C,0.821235,0.646231,2.290362,1.152098,-0.346103,0.240063
D,-0.068152,0.678341,-0.250332,0.992756,-0.272864,0.20656


In [22]:
# Sort by column labels (axis 1) in descending order, giving D, C, B, A.
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,D,C,B,A
2021-01-06,-0.068152,0.821235,-1.028387,0.398769
2021-01-07,0.678341,0.646231,1.407557,-1.858927
2021-01-08,-0.250332,2.290362,-0.788728,-0.797984
2021-01-09,0.992756,1.152098,-2.72941,0.114675
2021-01-10,-0.272864,-0.346103,-0.058051,-0.329168
2021-01-11,0.20656,0.240063,1.232715,0.475726


In [23]:
# Sort the rows by the values in column C (ascending by default).
df.sort_values(by="C")

Unnamed: 0,A,B,C,D
2021-01-10,-0.329168,-0.058051,-0.346103,-0.272864
2021-01-11,0.475726,1.232715,0.240063,0.20656
2021-01-07,-1.858927,1.407557,0.646231,0.678341
2021-01-06,0.398769,-1.028387,0.821235,-0.068152
2021-01-09,0.114675,-2.72941,1.152098,0.992756
2021-01-08,-0.797984,-0.788728,2.290362,-0.250332


#### Getting Data

In [24]:
# Select a single column by name; the result is a Series named "A".
df["A"]

2021-01-06    0.398769
2021-01-07   -1.858927
2021-01-08   -0.797984
2021-01-09    0.114675
2021-01-10   -0.329168
2021-01-11    0.475726
Freq: D, Name: A, dtype: float64

In [25]:
# An integer slice inside [] selects rows by position: here the first three rows.
df[0:3]

Unnamed: 0,A,B,C,D
2021-01-06,0.398769,-1.028387,0.821235,-0.068152
2021-01-07,-1.858927,1.407557,0.646231,0.678341
2021-01-08,-0.797984,-0.788728,2.290362,-0.250332


In [26]:
# A string slice on the DatetimeIndex selects rows by label: 2021-01-08 onward.
df["20210108":]

Unnamed: 0,A,B,C,D
2021-01-08,-0.797984,-0.788728,2.290362,-0.250332
2021-01-09,0.114675,-2.72941,1.152098,0.992756
2021-01-10,-0.329168,-0.058051,-0.346103,-0.272864
2021-01-11,0.475726,1.232715,0.240063,0.20656


#### Selection by label

In [27]:
# Row whose label is dates[2] (2021-01-08), returned as a Series indexed by column name.
df.loc[dates[2]]

A   -0.797984
B   -0.788728
C    2.290362
D   -0.250332
Name: 2021-01-08 00:00:00, dtype: float64

In [28]:
# A row label together with a column label returns a single scalar value.
df.loc[dates[1],"B"]

1.4075571402761193

#### Selection by Position

* Selecting rows and columns by integer position

In [29]:
# Third row by integer position; this is the same row as df.loc[dates[2]].
df.iloc[2]

A   -0.797984
B   -0.788728
C    2.290362
D   -0.250332
Name: 2021-01-08 00:00:00, dtype: float64

In [30]:
# All rows, columns at positions 1 and 2 (B and C); the slice end is exclusive.
df.iloc[:,1:3]

Unnamed: 0,B,C
2021-01-06,-1.028387,0.821235
2021-01-07,1.407557,0.646231
2021-01-08,-0.788728,2.290362
2021-01-09,-2.72941,1.152098
2021-01-10,-0.058051,-0.346103
2021-01-11,1.232715,0.240063


#### Boolean Indexing

In [31]:
# Keep only the rows where column A is positive.
df[df["A"] > 0]

Unnamed: 0,A,B,C,D
2021-01-06,0.398769,-1.028387,0.821235,-0.068152
2021-01-09,0.114675,-2.72941,1.152098,0.992756
2021-01-11,0.475726,1.232715,0.240063,0.20656


In [32]:
# Element-wise mask: the shape is kept, and values that are not > 0 come back as NaN.
df[df > 0]

Unnamed: 0,A,B,C,D
2021-01-06,0.398769,,0.821235,
2021-01-07,,1.407557,0.646231,0.678341
2021-01-08,,,2.290362,
2021-01-09,0.114675,,1.152098,0.992756
2021-01-10,,,,
2021-01-11,0.475726,1.232715,0.240063,0.20656


In [33]:
# Work on a copy so that df itself is not modified (this rebinds the name df2).
df2 = df.copy()

In [34]:
# Add a string column E with one label per row.
df2["E"] = ["one","two","three","four","five","six"]

In [35]:
# Display the copy with its new column E.
df2

Unnamed: 0,A,B,C,D,E
2021-01-06,0.398769,-1.028387,0.821235,-0.068152,one
2021-01-07,-1.858927,1.407557,0.646231,0.678341,two
2021-01-08,-0.797984,-0.788728,2.290362,-0.250332,three
2021-01-09,0.114675,-2.72941,1.152098,0.992756,four
2021-01-10,-0.329168,-0.058051,-0.346103,-0.272864,five
2021-01-11,0.475726,1.232715,0.240063,0.20656,six


In [36]:
# isin() builds a boolean mask; keep the rows whose E is "three" or "five".
df2[df2["E"].isin(["three","five"])]

Unnamed: 0,A,B,C,D,E
2021-01-08,-0.797984,-0.788728,2.290362,-0.250332,three
2021-01-10,-0.329168,-0.058051,-0.346103,-0.272864,five


#### Setting

In [37]:
# Integers 1..6 labelled with the same six dates that index df.
s1 = pd.Series(
    list(range(1, 7)),
    index=pd.date_range("20210106", periods=6),
)

In [38]:
# Display the date-indexed Series.
s1

2021-01-06    1
2021-01-07    2
2021-01-08    3
2021-01-09    4
2021-01-10    5
2021-01-11    6
Freq: D, dtype: int64

In [39]:
# Add s1 as a new column F; the values are aligned to df's rows by index label.
df["F"] = s1

In [40]:
# Set a single value by label: row dates[0], column A.
df.at[dates[0],"A"] = 2

In [41]:
# Set a single value by position: row 0, column 1 (column B).
df.iat[0,1] = 0

In [42]:
# Replace column D with a NumPy array of 5s (one value per row).
# Plain column assignment swaps in the new integer array on every pandas version.
# `df.loc[:, "D"] = ...` writes into the existing float64 column on pandas >= 2.0,
# which would leave D as 5.0 rather than the integer 5.
df["D"] = np.array([5] * len(df))

In [43]:
# Display df after the assignments: A[0] is 2, B[0] is 0, D is all 5s and F is new.
df

Unnamed: 0,A,B,C,D,F
2021-01-06,2.0,0.0,0.821235,5,1
2021-01-07,-1.858927,1.407557,0.646231,5,2
2021-01-08,-0.797984,-0.788728,2.290362,5,3
2021-01-09,0.114675,-2.72941,1.152098,5,4
2021-01-10,-0.329168,-0.058051,-0.346103,5,5
2021-01-11,0.475726,1.232715,0.240063,5,6


In [44]:
# Fresh copy of the updated df; this rebinds df2 and discards the earlier frame of that name.
df2 = df.copy()

In [45]:
# Wherever a value is positive, overwrite it with its negation; other values are left as they are.
df2[df2 > 0] = -df2

In [46]:
# Display the result: every entry is now zero or negative.
df2

Unnamed: 0,A,B,C,D,F
2021-01-06,-2.0,0.0,-0.821235,-5,-1
2021-01-07,-1.858927,-1.407557,-0.646231,-5,-2
2021-01-08,-0.797984,-0.788728,-2.290362,-5,-3
2021-01-09,-0.114675,-2.72941,-1.152098,-5,-4
2021-01-10,-0.329168,-0.058051,-0.346103,-5,-5
2021-01-11,-0.475726,-1.232715,-0.240063,-5,-6
