# Import pandas

In [1]:
import numpy as np
import pandas as pd

### Check version

In [2]:
print(pd.__version__)

2.0.3


# Pandas Series

- 1 dimensional data can be stored

- all elements of series have same data type

- one series is treated as a single column data

- series can have a name (just like column name)

- series can have index

## Creating Series

- with list : every element in list is one element in series


- with dictionary : key goes as index of series and value goes as element

### Create Series using list / tuple

In [None]:

# here default integer index is created
s1 = pd.Series([1, 3, 5, 6, 8])# check daa type of s1 : int
s2 =pd.Series([1, 3, 5, np.nan, 6, 8])# check daa type of s2 : float bcz of np.nan

In [None]:
s1

0    1
1    3
2    5
3    6
4    8
dtype: int64

In [None]:
s2

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [None]:
#all elements in a series can have
# ONLY single data type
s1 = pd.Series((1, 'IACSD', 5, np.nan, 6, 8))

s1

0        1
1    IACSD
2        5
3      NaN
4        6
5        8
dtype: object

###Create Series using dictionary

In [None]:
d= {1:'A',2:'B',3:'C',4:'D'}
s1 = pd.Series(d)
print(s1)
print("########################")
print("Another example")
d= {'A':500,'B':600,'C':700,'D':800}
s1 = pd.Series(d)
print(s1)

1    A
2    B
3    C
4    D
dtype: object
########################
Another example
A    500
B    600
C    700
D    800
dtype: int64


# Pandas Data Frame

- used for 2D data

- any tabular data can be handled using data frame

- can store data where every column has different data types

- efficient for column wise operations

- internally python stores dataframe like a dictionary

## create data frame



### Create data frame using list of list

In [None]:
data = [[1,2,3,4,5],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data)
print(df)
print("###########")
df = pd.DataFrame(data,columns=['c1','c2','c3','c4','c5'])
print(df)
print("###########")
df = pd.DataFrame(data,columns=list('ABCDE'))
print(df)
print("###########")
df = pd.DataFrame(data,index=[101,102,103,104], columns=list('ABCDE'))
print(df)
print("###########")
df = pd.DataFrame(data,index=range(12,16), columns=list('ABCDE'))
print(df)

    0   1   2   3    4
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
###########
   c1  c2  c3  c4   c5
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
###########
    A   B   C   D    E
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
###########
      A   B   C   D    E
101   1   2   3   4    5
102   4   5   6  10   11
103   7   8   9  23   34
104  10  11  12  99  100
###########
     A   B   C   D    E
12   1   2   3   4    5
13   4   5   6  10   11
14   7   8   9  23   34
15  10  11  12  99  100


In [None]:
# No broad casting ... special case
data = [[1,2,3,4,5,6],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data)
print(df)
print("###########")
data = [[1],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data)
print(df)


    0   1   2   3    4    5
0   1   2   3   4    5  6.0
1   4   5   6  10   11  NaN
2   7   8   9  23   34  NaN
3  10  11  12  99  100  NaN
###########
    0     1     2     3      4
0   1   NaN   NaN   NaN    NaN
1   4   5.0   6.0  10.0   11.0
2   7   8.0   9.0  23.0   34.0
3  10  11.0  12.0  99.0  100.0


### Create data frame using dictionary

In [None]:
# in this dictionary one key will represent one column, and value of that key will
# contain all elements of that column
d= {'c1':[1,2,3],
    'c2':[77,88,99]}
df=pd.DataFrame(d)
print(df)

   c1  c2
0   1  77
1   2  88
2   3  99


In [None]:
df2 = pd.DataFrame(
    {
        "A": 1.0, # value 1 will broadcast to all rows of this column
        "B": pd.Timestamp("20220102"), # value here will broadcast to all rows of this column
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo", # value "foo" will broadcast to all rows of this column
    }
)

print(df2)
print("#####################")
#The columns of the resulting DataFrame have different dtypes:
print(df2.dtypes)

     A          B    C  D      E    F
0  1.0 2022-01-02  1.0  3   test  foo
1  1.0 2022-01-02  1.0  3  train  foo
2  1.0 2022-01-02  1.0  3   test  foo
3  1.0 2022-01-02  1.0  3  train  foo
#####################
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


In [None]:
#Creating a DataFrame by passing a NumPy array,
#with a datetime index using date_range()
#and labeled columns:

dates = pd.date_range("20230315",
                      periods=6)

dates

DatetimeIndex(['2023-03-15', '2023-03-16', '2023-03-17', '2023-03-18',
               '2023-03-19', '2023-03-20'],
              dtype='datetime64[ns]', freq='D')

In [None]:
from numpy.random import default_rng
rng = default_rng()

In [None]:
rng.standard_normal((6, 4) )

array([[-0.47465083,  0.9672864 , -0.49939498, -0.29047011],
       [-1.62556686, -0.66790958,  0.9651814 , -1.2238059 ],
       [ 0.71703217,  0.60115885, -0.60868183, -0.60137896],
       [ 1.37780602, -0.80423413,  0.50727323, -1.90724682],
       [ 0.83458741, -1.54983807, -0.95194474,  0.27347014],
       [ 1.40025077,  1.81989089, -0.94057819, -0.42775403]])

In [None]:
from numpy.random import default_rng
rng = default_rng()

df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

df

Unnamed: 0,A,B,C,D
2023-03-15,-0.094508,-1.175846,0.015926,-0.955066
2023-03-16,1.37242,-0.029891,0.347578,-0.081913
2023-03-17,-0.852988,0.126199,1.236528,-1.087426
2023-03-18,-0.230927,-1.068208,-0.204153,-0.812109
2023-03-19,-1.257268,-0.524396,0.392779,1.786843
2023-03-20,-0.394269,-0.771949,1.556502,-1.363741


## Head and Tail working

- df.head() : returns first 5 rows, if n is passed then first n rows

- df.tail() : returns last 5 rows, if n is passed then first n rows

In [None]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

# head
# get first 5 rows from dataframe
#
print(df.head())
print("#####################")
#tail
# get last 5 rows
# if n is passed then last n rows
df.tail(3)
print("#####################")
#index
print("Index values are ....")
print(df.index)
print("#####################")
#columns
print("columns values are ....")
print(df.columns)

                   A         B         C         D
2023-03-15 -0.909380  0.017906  0.078936 -0.399664
2023-03-16  0.780988 -0.595466 -0.892296  0.217425
2023-03-17 -1.644519  0.438459 -1.071459  0.295585
2023-03-18 -0.392569 -0.090001  0.329544  0.787227
2023-03-19  1.248155  0.871905  0.837542 -0.522706
#####################
#####################
Index values are ....
DatetimeIndex(['2023-03-15', '2023-03-16', '2023-03-17', '2023-03-18',
               '2023-03-19', '2023-03-20'],
              dtype='datetime64[ns]', freq='D')
#####################
columns values are ....
Index(['A', 'B', 'C', 'D'], dtype='object')


## Data frame to numpy array

DataFrame.to_numpy() gives a NumPy representation
of the underlying data.

DataFrame.to_numpy() does not include
the index or column labels in the output.

Note that this can be an expensive operation
when your DataFrame has columns with different data types,
which comes down to a fundamental difference between
pandas and NumPy

NumPy arrays have one dtype for the entire array,
while pandas DataFrames have one dtype per column.
When you call DataFrame.to_numpy(), pandas will find
the NumPy dtype that can hold all of the dtypes
in the DataFrame. This may end up being object,
which requires casting every value to a Python object.


For DataFrame of all floating-point values,
DataFrame.to_numpy() is fast  
Also it doesn’t require copying data

In [None]:
data = [[1,2,3,4,5],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data)
print(df)
print("#####################")
n_arr = df.to_numpy()
print(type(n_arr))
print(n_arr)

    0   1   2   3    4
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
#####################
<class 'numpy.ndarray'>
[[  1   2   3   4   5]
 [  4   5   6  10  11]
 [  7   8   9  23  34]
 [ 10  11  12  99 100]]


# Descriptive Statistics *describe()* function

## Series Descriptive Statistics

In [None]:
s1= pd.Series([10,20,30,20,10])
s1.describe()

count     5.0000
mean     18.0000
std       8.3666
min      10.0000
25%      10.0000
50%      20.0000
75%      20.0000
max      30.0000
dtype: float64

## Dataframe Descriptive Statistics

In [None]:
data = [[1,2,3,4,5],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data)
print(df)
print("#####################")
# statistic summary of data
print(df.describe())
print("#####################")
print("Transpose of DataFrame")
# Here index and column names are swapped
df.T

    0   1   2   3    4
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
#####################
               0          1          2          3           4
count   4.000000   4.000000   4.000000   4.000000    4.000000
mean    5.500000   6.500000   7.500000  34.000000   37.500000
std     3.872983   3.872983   3.872983  44.052998   43.500958
min     1.000000   2.000000   3.000000   4.000000    5.000000
25%     3.250000   4.250000   5.250000   8.500000    9.500000
50%     5.500000   6.500000   7.500000  16.500000   22.500000
75%     7.750000   8.750000   9.750000  42.000000   50.500000
max    10.000000  11.000000  12.000000  99.000000  100.000000
#####################
Transpose of DataFrame


Unnamed: 0,0,1,2,3
0,1,4,7,10
1,2,5,8,11
2,3,6,9,12
3,4,10,23,99
4,5,11,34,100


# Sorting



## Sort Dataframe values using given column or columns

- sort_value()

In [None]:
data = [[1,2,3,4,15],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,2]] #4 by 5
df = pd.DataFrame(data, index=[101,105,110,102], columns=['A','E','D','C','B'])
print(df)
print("##############")
#or list of columns
df = df.sort_values(by="B")
print(df)
print("##############")
#or list of columns
df = df.sort_values(by=["B",'E'])
print(df)

      A   E   D   C   B
101   1   2   3   4  15
105   4   5   6  10  11
110   7   8   9  23  34
102  10  11  12  99   2
##############
      A   E   D   C   B
102  10  11  12  99   2
105   4   5   6  10  11
101   1   2   3   4  15
110   7   8   9  23  34
##############
      A   E   D   C   B
102  10  11  12  99   2
105   4   5   6  10  11
101   1   2   3   4  15
110   7   8   9  23  34


## Sort indexes by an axis

- sort_index()

- axis 0 is row direction

- axis 1 is column direction

In [None]:

data = [[1,2,3,4,5],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data)
print(df)
print("##############")
print("Sort index on axis=1")
df.sort_index(axis=1, ascending=False)

    0   1   2   3    4
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
##############
Sort index on axis=1


Unnamed: 0,4,3,2,1,0
0,5,4,3,2,1
1,11,10,6,5,4
2,34,23,9,8,7
3,100,99,12,11,10


## Sort column names of a dataframe

- sort_index()

In [None]:
data = [[1,2,3,4,5],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data, columns=['A','E','D','C','B'])
print(df)
print("##############")
print("Sort index on axis=1")
df.sort_index(axis=1, ascending=False)

    A   E   D   C    B
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
##############
Sort index on axis=1


Unnamed: 0,E,D,C,B,A
0,2,3,4,5,1
1,5,6,10,11,4
2,8,9,23,34,7
3,11,12,99,100,10


## Sort the row indexes of a data frame

- sort_index()

In [None]:
data = [[1,2,3,4,5],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data, index=[101,105,110,102], columns=['A','E','D','C','B'])
print(df)
print("##############")
print("Sort index on axis=0")
df = df.sort_index(axis=0)
print(df)
print("##############")
print("Sort index on axis=0")
df =df.sort_index(axis=0, ascending=False)
print(df)

      A   E   D   C    B
101   1   2   3   4    5
105   4   5   6  10   11
110   7   8   9  23   34
102  10  11  12  99  100
##############
Sort index on axis=0
      A   E   D   C    B
101   1   2   3   4    5
102  10  11  12  99  100
105   4   5   6  10   11
110   7   8   9  23   34
##############
Sort index on axis=0
      A   E   D   C    B
110   7   8   9  23   34
105   4   5   6  10   11
102  10  11  12  99  100
101   1   2   3   4    5


# Accessing and Selecting Data from Dataframe

## Select a column

In [None]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print("Direct way df['A']")
print(df['A']) # note the data type
print("################")
print("loc : select column by name df.loc[:,'A']")
print(df.loc[:,'A'])
print("################")
print("iloc : select column by index df.iloc[:,0]")
print(df.iloc[:,0])

Direct way df['A']
2023-03-15    0.687803
2023-03-16    0.501972
2023-03-17   -0.290997
2023-03-18    0.044778
2023-03-19    0.374342
2023-03-20   -1.772138
Freq: D, Name: A, dtype: float64
################
loc : select column by name
2023-03-15    0.687803
2023-03-16    0.501972
2023-03-17   -0.290997
2023-03-18    0.044778
2023-03-19    0.374342
2023-03-20   -1.772138
Freq: D, Name: A, dtype: float64
################
iloc : select column by index
2023-03-15    0.687803
2023-03-16    0.501972
2023-03-17   -0.290997
2023-03-18    0.044778
2023-03-19    0.374342
2023-03-20   -1.772138
Freq: D, Name: A, dtype: float64


## select a row by index

In [None]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print("loc: select a single row using label of row df.loc['2023-03-15']")
print(df.loc["2023-03-15"])
print("################")
print("iloc: select a single row using index of row df.iloc[0]")
print(df.iloc[0])

loc: select a single row using label of row df.loc['2023-03-15']
A   -0.577807
B   -1.803232
C    1.110031
D    0.215522
Name: 2023-03-15 00:00:00, dtype: float64
################
iloc: select a single row using index of row df.iloc[0]
A   -0.577807
B   -1.803232
C    1.110031
D    0.215522
Name: 2023-03-15 00:00:00, dtype: float64


# Slicing OR Finding subset of data frame OR Selecting multiple rows / columns

## Select Multiple columns

- by name (label)

- by index



In [None]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print("Select columns B and C by name")
print(df[['B','C']])
print("##############")
print("loc : Select columns B and C by name")
print(df.loc[:,['B','C']])
print("##############")
print("iloc : Select columns B and C by index (end is excluded)")
print(df.iloc[:,1:3])

Select columns B and C by name
                   B         C
2023-03-15  0.570617  0.923425
2023-03-16  3.414189 -1.340646
2023-03-17 -0.010970 -0.587874
2023-03-18 -2.350145  1.433557
2023-03-19 -0.364013 -0.418431
2023-03-20 -2.313660 -0.241069
##############
loc : Select columns B and C by name
                   B         C
2023-03-15  0.570617  0.923425
2023-03-16  3.414189 -1.340646
2023-03-17 -0.010970 -0.587874
2023-03-18 -2.350145  1.433557
2023-03-19 -0.364013 -0.418431
2023-03-20 -2.313660 -0.241069
##############
iloc : Select columns B and C by index (end is excluded)
                   B         C
2023-03-15  0.570617  0.923425
2023-03-16  3.414189 -1.340646
2023-03-17 -0.010970 -0.587874
2023-03-18 -2.350145  1.433557
2023-03-19 -0.364013 -0.418431
2023-03-20 -2.313660 -0.241069


## Select rows

- by index

- by label

In [None]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print("Selecting by label df['20230315':'20230318'] end is included")
print(df["20230315":"20230318"])
print("##############")
print("loc: Selecting by label df.loc['20230315':'20230318', ] end is included")
print(df.loc["20230315":"20230318",])
print("##############")
print("Selecting by index df[0:3] end is excluded")
print(df[0:3])
print("##############")
print("iloc: Selecting by index df.iloc[0:3,] end is excluded")
print(df.iloc[0:3,])

Selecting by label df['20230315':'20230318'] end is included
                   A         B         C         D
2023-03-15  1.096804 -0.576849  1.113276 -2.121102
2023-03-16  1.908324  0.087038  0.668116 -1.065025
2023-03-17 -2.401700  1.653333  0.497608 -2.193042
2023-03-18 -0.238873 -1.351537 -0.501572  0.097186
##############
loc: Selecting by label df.loc['20230315':'20230318', ] end is included
                   A         B         C         D
2023-03-15  1.096804 -0.576849  1.113276 -2.121102
2023-03-16  1.908324  0.087038  0.668116 -1.065025
2023-03-17 -2.401700  1.653333  0.497608 -2.193042
2023-03-18 -0.238873 -1.351537 -0.501572  0.097186
##############
Selecting by index df[0:3] end is excluded
                   A         B         C         D
2023-03-15  1.096804 -0.576849  1.113276 -2.121102
2023-03-16  1.908324  0.087038  0.668116 -1.065025
2023-03-17 -2.401700  1.653333  0.497608 -2.193042
##############
iloc: Selecting by index df.iloc[0:3,] end is excluded
          