### PANDAS : tabular representation of data, it has properties like size mutability, statistical analysis, efficient manipulation and extraction, etc

In [1]:
import pandas as pd      # pd is an aias for pandas

In [2]:
print("Pandas version :", pd.__version__)

Pandas version : 2.3.0


### Primary Data Structures :
##### 1. Series : 1D data structure, represents a single column from the entire dataset, homogenous array
##### 2. DataFrames : 2D data  structure, represents the entire dataset, heterogeneously typed columns.  

# SERIES :

In [3]:
s = pd.Series([10, 20, 30, 40, 50]) 
s

0    10
1    20
2    30
3    40
4    50
dtype: int64

##### Some basic operations :

In [4]:
s.dtype

dtype('int64')

In [5]:
s.values

array([10, 20, 30, 40, 50])

In [6]:
s.index

RangeIndex(start=0, stop=5, step=1)

In [7]:
print(s.name)       # prints None , because we have not given any name to a column

None


In [8]:
# assigning a name to a column 

s.name = "numbers"
s

0    10
1    20
2    30
3    40
4    50
Name: numbers, dtype: int64

##### INDEXING : by using square brackets 

In [9]:
s[0]

np.int64(10)

In [10]:
s[0:2]                        # index 2 is exclusive ,will only print index 0 and 1

0    10
1    20
Name: numbers, dtype: int64

In [11]:
# iloc : used for location based indexing 

s.iloc[4]

np.int64(50)

In [12]:
s.iloc[[1,2,3]]

1    20
2    30
3    40
Name: numbers, dtype: int64

In [13]:
#we can change our indexes   

index = ["apple", "banana", "grapes", "orange", "litchi"]   

In [14]:
s.index = index          # since , we have changed the indexes, iloc will not work here 
s

apple     10
banana    20
grapes    30
orange    40
litchi    50
Name: numbers, dtype: int64

In [15]:
s['grapes']

np.int64(30)

In [16]:
# loc : labelled based indexing , in this case start and end both the values are included 

s.loc['grapes']

np.int64(30)

In [17]:
s.loc[['apple', 'grapes']]

apple     10
grapes    30
Name: numbers, dtype: int64

In [18]:
s.loc['apple':'orange']

apple     10
banana    20
grapes    30
orange    40
Name: numbers, dtype: int64

##### Creating a series using dictionary :

In [19]:
fruit_calories = {
    "apple" : 20.6,
    "banana" : 25.7,
    "mango" : 10.0,
    "orange" : 5.77,
    "pineapple" : 17.8,
    "kiwi" : 12.2,
    "pomegranate" : 19.89,
    "grapes" : 29.9
    
    }


In [20]:
s2 = pd.Series(fruit_calories, name = "calories")

In [21]:
s2

apple          20.60
banana         25.70
mango          10.00
orange          5.77
pineapple      17.80
kiwi           12.20
pomegranate    19.89
grapes         29.90
Name: calories, dtype: float64

In [22]:
# Conditional Selection :

s2 > 10                        

apple           True
banana          True
mango          False
orange         False
pineapple       True
kiwi            True
pomegranate     True
grapes          True
Name: calories, dtype: bool

In [23]:
s2[s2>10]              # will print fruits which has calories greater than 10

apple          20.60
banana         25.70
pineapple      17.80
kiwi           12.20
pomegranate    19.89
grapes         29.90
Name: calories, dtype: float64

In [24]:
# using logical operators : and, or, not 

[[s2>10] and [s2>20]]



[[apple           True
  banana          True
  mango          False
  orange         False
  pineapple      False
  kiwi           False
  pomegranate    False
  grapes          True
  Name: calories, dtype: bool]]

In [25]:
# Modifying the series :
s2["mango"] = 10.6

In [26]:
s2

apple          20.60
banana         25.70
mango          10.60
orange          5.77
pineapple      17.80
kiwi           12.20
pomegranate    19.89
grapes         29.90
Name: calories, dtype: float64

# DataFrames :

In [39]:
data = {
    "Name" : ["deepak", "bob", "charlie", "yuvraj", "alice"],
    "Age" : [20, 24, 15, 10, 19],
    "Marks" : [66, 40, 39, 90, 14],
    "Department" : ["CSE", "Electrical", "Electronics", "Civil", "Mechanical"]
}

In [40]:
data                                      # not very neat and clear, so we will convert it into a dataframe

{'Name': ['deepak', 'bob', 'charlie', 'yuvraj', 'alice'],
 'Age': [20, 24, 15, 10, 19],
 'Marks': [66, 40, 39, 90, 14],
 'Department': ['CSE', 'Electrical', 'Electronics', 'Civil', 'Mechanical']}

In [41]:
df = pd.DataFrame(data)

df                                # will print the dataframe

Unnamed: 0,Name,Age,Marks,Department
0,deepak,20,66,CSE
1,bob,24,40,Electrical
2,charlie,15,39,Electronics
3,yuvraj,10,90,Civil
4,alice,19,14,Mechanical


In [42]:
df.head(2)                   # will print the starting two rows

Unnamed: 0,Name,Age,Marks,Department
0,deepak,20,66,CSE
1,bob,24,40,Electrical


In [43]:
df.tail(2)                # will print last two rows  

Unnamed: 0,Name,Age,Marks,Department
3,yuvraj,10,90,Civil
4,alice,19,14,Mechanical


In [44]:
df.iloc[1:3]

Unnamed: 0,Name,Age,Marks,Department
1,bob,24,40,Electrical
2,charlie,15,39,Electronics


In [45]:
df.loc[1:3]

Unnamed: 0,Name,Age,Marks,Department
1,bob,24,40,Electrical
2,charlie,15,39,Electronics
3,yuvraj,10,90,Civil


In [34]:
df.loc[1:3, ["Name", "Age"]]     # will print first 3 rows of name and age 

Unnamed: 0,Name,Age
1,bob,24
2,charlie,15
3,yuvraj,10


In [35]:
df.iloc[1:3, :2]               # [rows, column]

Unnamed: 0,Name,Age
1,bob,24
2,charlie,15


In [62]:
# To access one particular column , columns are also known as features 

df["Department"]

0            CSE
1     Electrical
2    Electronics
3          Civil
4     Mechanical
Name: Department, dtype: object

In [47]:
# For accessing multiple columns 

df[["Name", "Department"]]              #dont forget to put the inside square brackets 

Unnamed: 0,Name,Department
0,deepak,CSE
1,bob,Electrical
2,charlie,Electronics
3,yuvraj,Civil
4,alice,Mechanical


In [None]:
# to delete a particular column (axis = 1) 

df.drop("Age", axis=1)

Unnamed: 0,Name,Marks,Department
0,deepak,66,CSE
1,bob,40,Electrical
2,charlie,39,Electronics
3,yuvraj,90,Civil
4,alice,14,Mechanical


In [52]:
df      # age column has not been dropped from the original dataframe, to do so , use the parameter inplace = True .

Unnamed: 0,Name,Age,Marks,Department
0,deepak,20,66,CSE
1,bob,24,40,Electrical
2,charlie,15,39,Electronics
3,yuvraj,10,90,Civil
4,alice,19,14,Mechanical


In [54]:
df.shape    # will give you the number of rows and columns , and indexes are not counted as a part of column 

(5, 4)

In [57]:
df.dtypes       # tells you the data type 

Name          object
Age            int64
Marks          int64
Department    object
dtype: object

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        5 non-null      object
 1   Age         5 non-null      int64 
 2   Marks       5 non-null      int64 
 3   Department  5 non-null      object
dtypes: int64(2), object(2)
memory usage: 292.0+ bytes


In [61]:
df.describe()      # will give the statistical analysis of the dataframe 

Unnamed: 0,Age,Marks
count,5.0,5.0
mean,17.6,49.8
std,5.319774,29.037906
min,10.0,14.0
25%,15.0,39.0
50%,19.0,40.0
75%,20.0,66.0
max,24.0,90.0


##### Broadcasting in Pandas 

In [66]:
df["Marks"] = df["Marks"] + 100      # addition operation has been performed 

In [67]:
df["Marks"]

0    266
1    240
2    239
3    290
4    214
Name: Marks, dtype: int64

In [68]:
# Renaming the columns :

df.rename(columns = {"Marks": "mkrs"})

Unnamed: 0,Name,Age,mkrs,Department
0,deepak,20,266,CSE
1,bob,24,240,Electrical
2,charlie,15,239,Electronics
3,yuvraj,10,290,Civil
4,alice,19,214,Mechanical


In [69]:
df.rename(columns = {"Marks": "mkrs"}, inplace = True)   # this will change the original dataframe

In [70]:
df

Unnamed: 0,Name,Age,mkrs,Department
0,deepak,20,266,CSE
1,bob,24,240,Electrical
2,charlie,15,239,Electronics
3,yuvraj,10,290,Civil
4,alice,19,214,Mechanical


In [71]:
# to check unique values in a particular column : 

df["mkrs"].unique()

array([266, 240, 239, 290, 214])

In [74]:
# to check distributions of data 

df["Department"].value_counts()                

Department
CSE            1
Electrical     1
Electronics    1
Civil          1
Mechanical     1
Name: count, dtype: int64

In [76]:
# to create a new column in a dataset 

df["extra marks"] = df["mkrs"] * 2

In [77]:
df

Unnamed: 0,Name,Age,mkrs,Department,extra marks
0,deepak,20,266,CSE,532
1,bob,24,240,Electrical,480
2,charlie,15,239,Electronics,478
3,yuvraj,10,290,Civil,580
4,alice,19,214,Mechanical,428


In [78]:
# to replace a value in a column 
df["Name"].replace("bob", "Charlie")

0     deepak
1    Charlie
2    charlie
3     yuvraj
4      alice
Name: Name, dtype: object

In [79]:
df

Unnamed: 0,Name,Age,mkrs,Department,extra marks
0,deepak,20,266,CSE,532
1,bob,24,240,Electrical,480
2,charlie,15,239,Electronics,478
3,yuvraj,10,290,Civil,580
4,alice,19,214,Mechanical,428


In [80]:
df["Name"].replace("bob", "Charlie", inplace= True)

In [81]:
df

Unnamed: 0,Name,Age,mkrs,Department,extra marks
0,deepak,20,266,CSE,532
1,Charlie,24,240,Electrical,480
2,charlie,15,239,Electronics,478
3,yuvraj,10,290,Civil,580
4,alice,19,214,Mechanical,428


##### Apply and lamda functions : they let you change single entries of a row or a column based on some conditions 


In [84]:
df["mkrs"] = df["mkrs"].apply(lambda x: x/2)

df

Unnamed: 0,Name,Age,mkrs,Department,extra marks
0,deepak,20,66.5,CSE,532
1,Charlie,24,60.0,Electrical,480
2,charlie,15,59.75,Electronics,478
3,yuvraj,10,72.5,Civil,580
4,alice,19,53.5,Mechanical,428
