# Pandas

Pandas is a python library used for Data Cleaning, Data Wrangling, Data Manipilation and Organizing Data.

Pandas is built on top of Numpy Library.

In [3]:
import pandas as pd

In [4]:
import numpy as np

# Series Data

Series data means single data column.

In [5]:
S = pd.Series([23,45,67,89,97])
S

0    23
1    45
2    67
3    89
4    97
dtype: int64

In [6]:
L = [10,100,1000,10000]
S1 = pd.Series(L)
S1

0       10
1      100
2     1000
3    10000
dtype: int64

In [7]:
type(S)

pandas.core.series.Series

In [8]:
S.index = ['a','b','c','d','e']   # changing index value

In [9]:
S

a    23
b    45
c    67
d    89
e    97
dtype: int64

In [10]:
S = pd.Series([98,87,76,65,23],index=['Naveen','Kumar','Sonu','Raj','Ram'])

In [11]:
S

Naveen    98
Kumar     87
Sonu      76
Raj       65
Ram       23
dtype: int64

In [12]:
# create a series with 10 random numbers with indexes 1 to 10

In [48]:
N = np.random.random(10)
I = np.arange(1,11)
s3 = pd.Series(np.random.random(10),np.arange(1,11))
s3

1     0.994690
2     0.575020
3     0.587040
4     0.984277
5     0.546109
6     0.255132
7     0.133775
8     0.125436
9     0.261919
10    0.641814
dtype: float64

In [14]:
s3.shape

(10,)

In [15]:
s3.ndim

1

In [16]:
s3.size

10

# Accessing series items using Index

In [17]:
S

Naveen    98
Kumar     87
Sonu      76
Raj       65
Ram       23
dtype: int64

In [18]:
S[:]

Naveen    98
Kumar     87
Sonu      76
Raj       65
Ram       23
dtype: int64

In [19]:
S['Naveen']

np.int64(98)

In [20]:
S[0:1]

Naveen    98
dtype: int64

In [21]:
S[0:3]

Naveen    98
Kumar     87
Sonu      76
dtype: int64

In [22]:
S[::-1]

Ram       23
Raj       65
Sonu      76
Kumar     87
Naveen    98
dtype: int64

# Operation on Series Data

In [23]:
S = pd.Series([23,45,67,89,97])

In [24]:
S.mean()

np.float64(64.2)

In [25]:
np.median(S)

np.float64(67.0)

In [26]:
S.sum()

np.int64(321)

In [27]:
S.min()

np.int64(23)

In [28]:
S.max()

np.int64(97)

In [29]:
S.argmax()   # index of max value in a series

np.int64(4)

In [30]:
S.argmin()

np.int64(0)

In [31]:
# Addition of two series

S1 = pd.Series([12,23,34,45,67])
S2 = pd.Series([1,2,3,4,5])

In [32]:
S1.add(S2)   # Addition

0    13
1    25
2    37
3    49
4    72
dtype: int64

In [33]:
S1.sub(S2)   # Subtraction

0    11
1    21
2    31
3    41
4    62
dtype: int64

In [34]:
S1.mul(S2)   # Multiplication

0     12
1     46
2    102
3    180
4    335
dtype: int64

In [35]:
S1.divide(S2)  # Division

0    12.000000
1    11.500000
2    11.333333
3    11.250000
4    13.400000
dtype: float64

In [36]:
S1.sort_values()  # sort values in ascending order

0    12
1    23
2    34
3    45
4    67
dtype: int64

In [37]:
S3 = pd.Series([1,1,2,2,2,2,2,2,5,5,5,6,6,6,6,6,6,6,7,7,7,8,8,9,9,9,9])

In [38]:
S3.value_counts()   # return count no. of occurances of each items in a series

6    7
2    6
9    4
7    3
5    3
1    2
8    2
Name: count, dtype: int64

# Data Frame

In [39]:
# Creating a dataframe

In [40]:
df = pd.DataFrame(['Nav','Kumar','Raj','Sonu'])
df

Unnamed: 0,0
0,Nav
1,Kumar
2,Raj
3,Sonu


In [41]:
type(df)

pandas.core.frame.DataFrame

In [42]:
df[1] = [23,45,89,67]
df

Unnamed: 0,0,1
0,Nav,23
1,Kumar,45
2,Raj,89
3,Sonu,67


In [52]:
df.columns

RangeIndex(start=0, stop=2, step=1)

In [58]:
df.columns={'Name',"marks"}  # renaming column name

In [59]:
df.columns

Index(['marks', 'Name'], dtype='object')

In [60]:
df['Name']   # accessing columns with their names

0    23
1    45
2    89
3    67
Name: Name, dtype: int64

In [61]:
df['marks']

0      Nav
1    Kumar
2      Raj
3     Sonu
Name: marks, dtype: object

In [None]:
df.rename(columns={"Name":"marks","marks":"Name"})

TypeError: 'list' object is not callable

In [None]:
# Adding columns to dataframe

df['%'] = ['67%','56%','89%','90']
df

Unnamed: 0,Name,Marks,%
0,Nav,23,67%
1,Kumar,45,56%
2,Raj,89,89%
3,Sonu,67,90


In [70]:
D = {'Name':['Pavan','Kumar','Rushi','Suresh','Raju'],
    'YOP':[2022,2023,2024,2020,2021],
    'total_marks':[1000,1000,1000,1000,1000],
    'scored_marks':[999,888,777,666,678],
    'percentage' : ['88.9','77.8','66.7','90.6','89.6']}

In [71]:
df1 = pd.DataFrame(D)
df1

Unnamed: 0,Name,YOP,total_marks,scored_marks,percentage
0,Pavan,2022,1000,999,88.9
1,Kumar,2023,1000,888,77.8
2,Rushi,2024,1000,777,66.7
3,Suresh,2020,1000,666,90.6
4,Raju,2021,1000,678,89.6


In [72]:
df1['scored_marks']

0    999
1    888
2    777
3    666
4    678
Name: scored_marks, dtype: int64

In [73]:
df1[['total_marks','scored_marks']]

Unnamed: 0,total_marks,scored_marks
0,1000,999
1,1000,888
2,1000,777
3,1000,666
4,1000,678


In [74]:
df1.iloc[0]   # accessing by index location....

Name            Pavan
YOP              2022
total_marks      1000
scored_marks      999
percentage       88.9
Name: 0, dtype: object

In [77]:
df1.loc[0]

Name            Pavan
YOP              2022
total_marks      1000
scored_marks      999
percentage       88.9
Name: 0, dtype: object

In [None]:
df1

Unnamed: 0,Name,YOP,total_marks,scored_marks,percentage
0,Pavan,2022,1000,999,88.9
1,Kumar,2023,1000,888,77.8
2,Rushi,2024,1000,777,66.7
3,Suresh,2020,1000,666,90.6
4,Raju,2021,1000,678,89.6


In [None]:
df1.index = ['a','b','c','d','e']  # changing index
df1

Unnamed: 0,Name,YOP,total_marks,scored_marks,percentage
a,Pavan,2022,1000,999,88.9
b,Kumar,2023,1000,888,77.8
c,Rushi,2024,1000,777,66.7
d,Suresh,2020,1000,666,90.6
e,Raju,2021,1000,678,89.6


In [None]:
df1.iloc[0]

Name            Pavan
YOP              2022
total_marks      1000
scored_marks      999
percentage       88.9
Name: a, dtype: object

In [None]:
df1.loc['b']

Name            Kumar
YOP              2023
total_marks      1000
scored_marks      888
percentage       77.8
Name: b, dtype: object

In [None]:
df1['a':'c']

Unnamed: 0,Name,YOP,total_marks,scored_marks,percentage
a,Pavan,2022,1000,999,88.9
b,Kumar,2023,1000,888,77.8
c,Rushi,2024,1000,777,66.7


# Loading Dataset

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv('C:/Users/pavan/OneDrive/Desktop/iris.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/pavan/OneDrive/Desktop/iris.csv'

In [None]:
df

NameError: name 'df' is not defined

In [None]:
df.columns

In [None]:
df.info()

In [None]:
df.head()     # top 5 rows

In [None]:
df.head(30)    # first 30 rows 

In [None]:
df.tail()   # bottom 5 rows

In [None]:
df.tail(10)   # last 10 rows

In [None]:
df.dtypes  # data types of each column

In [None]:
df.shape    # dimensions of dataset

In [None]:
df.index    # index of dataframe

In [None]:
df.isna().sum()   # sum of null value in each column

In [None]:
df.isnull().any()   # check for null values in each column

In [None]:
# accessing column data by column name

In [None]:
df['sepal.length']

In [None]:
df[['sepal.length','sepal.width']]   # accessing multiple columns

In [None]:
df['sepal.length'].head(10)

In [None]:
df[['sepal.length','sepal.width']].head(10)

In [None]:
# return sepal.length values which is greater than 5

df[df['sepal.length']>5]

In [None]:
df['sepal.length'][df['sepal.length']>5]

In [None]:
a = df['sepal.length']
a[a>5]

In [None]:
for i in df['sepal.length']:
    if i>5:
        print(i,end='')
    

In [None]:
# find mean and median of sepal.width column of iris dataset

In [None]:
df['sepal.width'].mean()

In [None]:
import numpy as np

In [None]:
np.median(df['sepal.width'])

In [None]:
# variance

df['sepal.width'].var()

In [None]:
# standard deviation

df['sepal.width'].std()

In [None]:
# sum of column values

df['sepal.width'].sum()

In [None]:
df.columns

In [None]:
# occurance of each category

df['variety'].value_counts()

In [None]:
df.describe()   # statistical summary

In [None]:
df['sepal.length'].sort_values()  # sort column values in ascending order

In [None]:
df['sepal.length'].sort_values(ascending=False)   # sort in descending

In [None]:
# sort petal.width and petal.length in descending order

In [None]:
df[['petal.width','petal.length']].sort_values(by = 'petal.width',ascending=False)

In [None]:
df.sort_values(by='sepal.width')   # sorting a data based on particular column

In [None]:
df.sort_values(by='sepal.width').reset_index()

In [None]:
# accessing dataframe rows (slicing)

In [None]:
df.iloc[0:10]

In [None]:
df.iloc[0:10,0:3]

NameError: name 'df' is not defined

In [None]:
# accessing columns (slicing)

In [None]:
df.iloc[:,0:2]  # all rows, 2 columns....  (:) means all

In [None]:
# all rows, first 4 columns

df.iloc[:,0:4]

In [None]:
import datetime

In [None]:
x = datetime.datetime.now()
print(x)

2024-09-13 10:49:44.408608


In [None]:
dates = pd.date_range(start='2024-09-13',end='2024-09-19')
dates

DatetimeIndex(['2024-09-13', '2024-09-14', '2024-09-15', '2024-09-16',
               '2024-09-17', '2024-09-18', '2024-09-19'],
              dtype='datetime64[ns]', freq='D')

In [None]:
dates = pd.date_range('today',periods=7)
dates

DatetimeIndex(['2024-09-13 10:49:45.589577', '2024-09-14 10:49:45.589577',
               '2024-09-15 10:49:45.589577', '2024-09-16 10:49:45.589577',
               '2024-09-17 10:49:45.589577', '2024-09-18 10:49:45.589577',
               '2024-09-19 10:49:45.589577'],
              dtype='datetime64[ns]', freq='D')

In [None]:
dates = pd.date_range(start='2024-09-13',periods=7)
dates

DatetimeIndex(['2024-09-13', '2024-09-14', '2024-09-15', '2024-09-16',
               '2024-09-17', '2024-09-18', '2024-09-19'],
              dtype='datetime64[ns]', freq='D')

In [None]:
# create a dataframe of 7 rows x 7 random numbers and use dates as index

In [None]:
N = np.random.random([7,7])
Id = pd.date_range(start='2024-09-13',end='2024-09-19')
df = pd.DataFrame(N,index=Id)
df

Unnamed: 0,0,1,2,3,4,5,6
2024-09-13,0.581748,0.369102,0.12908,0.506137,0.908257,0.703553,0.214969
2024-09-14,0.629581,0.057657,0.41918,0.153842,0.395561,0.04832,0.11596
2024-09-15,0.204096,0.722278,0.62078,0.211765,0.024966,0.484202,0.258139
2024-09-16,0.88474,0.473502,0.468473,0.401003,0.12524,0.537986,0.156252
2024-09-17,0.847656,0.63789,0.472013,0.091788,0.694942,0.815701,0.327778
2024-09-18,0.286106,0.618493,0.57704,0.298102,0.852292,0.118423,0.763785
2024-09-19,0.00604,0.175489,0.86743,0.591493,0.354563,0.888056,0.424215


In [None]:
#change column names as C1,C2,C3...C7

In [None]:
df.columns=['C1','C2','C3','C4','C5','C6','C7']
df

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7
2024-09-13,0.581748,0.369102,0.12908,0.506137,0.908257,0.703553,0.214969
2024-09-14,0.629581,0.057657,0.41918,0.153842,0.395561,0.04832,0.11596
2024-09-15,0.204096,0.722278,0.62078,0.211765,0.024966,0.484202,0.258139
2024-09-16,0.88474,0.473502,0.468473,0.401003,0.12524,0.537986,0.156252
2024-09-17,0.847656,0.63789,0.472013,0.091788,0.694942,0.815701,0.327778
2024-09-18,0.286106,0.618493,0.57704,0.298102,0.852292,0.118423,0.763785
2024-09-19,0.00604,0.175489,0.86743,0.591493,0.354563,0.888056,0.424215


In [None]:
# access data of dates from 13 to 16 by using row lables-loc

In [None]:
df.loc['2024-09-13':'2024-09-16']

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7
2024-09-13,0.581748,0.369102,0.12908,0.506137,0.908257,0.703553,0.214969
2024-09-14,0.629581,0.057657,0.41918,0.153842,0.395561,0.04832,0.11596
2024-09-15,0.204096,0.722278,0.62078,0.211765,0.024966,0.484202,0.258139
2024-09-16,0.88474,0.473502,0.468473,0.401003,0.12524,0.537986,0.156252


In [None]:
# access first 3  rows using iloc

df.iloc[0:3]

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7
2024-09-13,0.581748,0.369102,0.12908,0.506137,0.908257,0.703553,0.214969
2024-09-14,0.629581,0.057657,0.41918,0.153842,0.395561,0.04832,0.11596
2024-09-15,0.204096,0.722278,0.62078,0.211765,0.024966,0.484202,0.258139


In [None]:
# accessing data of dates 14th to 19th in C4 column

In [None]:
df.loc['2024-09-14':'2024-09-19','C4']

2024-09-14    0.153842
2024-09-15    0.211765
2024-09-16    0.401003
2024-09-17    0.091788
2024-09-18    0.298102
2024-09-19    0.591493
Freq: D, Name: C4, dtype: float64

In [None]:
df.loc['2024-09-14':'2024-09-19','C4':'C5']

Unnamed: 0,C4,C5
2024-09-14,0.153842,0.395561
2024-09-15,0.211765,0.024966
2024-09-16,0.401003,0.12524
2024-09-17,0.091788,0.694942
2024-09-18,0.298102,0.852292
2024-09-19,0.591493,0.354563
