In [1]:
# The exclamation mark (!) in front of a command is often used in interactive environments 
# like Jupyter Notebooks or IPython to indicate that the following statement is a 
# shell command rather than a Python command. 

!pip install pandas



In [1]:
# importing pandas
import pandas as pd

In [2]:
# Toy dataset used throughout the notebook: one row per person,
# with an identifier, an age and a gender code.
df = pd.DataFrame(
    data={
        "Id": [1, 2, 3, 4, 5],
        "Age": [12, 13, 14, 15, 16],
        "Gender": ["M", "F", "M", "F", "M"],
    }
)

In [3]:
# A bare expression on the last line of a cell is rendered as a table;
# the frame has only 5 rows, so showing all of it is fine here.
df

Unnamed: 0,Id,Age,Gender
0,1,12,M
1,2,13,F
2,3,14,M
3,4,15,F
4,5,16,M


In [5]:
# Common functions of pandas

In [7]:
# df.head() with no argument returns the first 5 rows;
# passing a number n limits the preview to the first n rows.
df.head(3) # first 3 rows

Unnamed: 0,Id,Age,Gender
0,1,12,M
1,2,13,F
2,3,14,M


In [9]:
# df.tail() with no argument returns the last 5 rows;
# passing a number n limits the preview to the last n rows.
# The original row labels (here 3 and 4) are kept in the result.
df.tail(2) # last 2 rows

Unnamed: 0,Id,Age,Gender
3,4,15,F
4,5,16,M


In [10]:
# Structural summary of the frame: index range, per-column non-null counts
# and dtypes, plus approximate memory usage. Prints directly; returns nothing to display.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Id      5 non-null      int64 
 1   Age     5 non-null      int64 
 2   Gender  5 non-null      object
dtypes: int64(2), object(1)
memory usage: 252.0+ bytes


In [11]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# Note: the statistics reported for Id are not meaningful, since Id is just an identifier.
# The non-numeric Gender column is left out of the result.
df.describe()

Unnamed: 0,Id,Age
count,5.0,5.0
mean,3.0,14.0
std,1.581139,1.581139
min,1.0,12.0
25%,2.0,13.0
50%,3.0,14.0
75%,4.0,15.0
max,5.0,16.0


In [12]:
# Single brackets with one column label select that column as a Series.
df["Gender"] # series data

0    M
1    F
2    M
3    F
4    M
Name: Gender, dtype: object

In [13]:
# Double brackets pass a *list* of column labels, so the result stays a
# DataFrame even when the list holds a single column.
df[["Gender"]] # it is a data frame

Unnamed: 0,Gender
0,M
1,F
2,M
3,F
4,M


In [16]:
# Count how often each distinct value occurs in the column,
# listed from most to least frequent.
df["Gender"].value_counts()

Gender
M    3
F    2
Name: count, dtype: int64

In [17]:
# .loc is label-based: the first argument is a row *label*, the second a column label.
# With the default RangeIndex the row labels coincide with positions 0..4.
df.loc[3, "Gender"] # row label, column label

'F'

In [23]:
# .iloc is position-based: both arguments are 0-based integer positions,
# so this reads the second row of the third column (Gender).
df.iloc[1, 2] # row position, column position

'F'

In [19]:
# Display the frame again for reference.
df

Unnamed: 0,Id,Age,Gender
0,1,12,M
1,2,13,F
2,3,14,M
3,4,15,F
4,5,16,M


In [24]:
# Second table keyed by Id: an income is given only for Ids 1, 2 and 10.
# Id 10 has no counterpart in df, and df's Ids 3-5 are absent here, which
# makes the differences between the join types below visible.
df2 = pd.DataFrame(
    data={
        "Id": [1, 2, 10],
        "Income": [300, 400, 500],
    }
)

In [25]:
# Show the income table (3 rows).
df2

Unnamed: 0,Id,Income
0,1,300
1,2,400
2,10,500


In [27]:
# Left join on Id: keep every row of df and attach Income from df2 where the
# Id matches. Ids that are missing from df2 (3, 4, 5) get NaN for Income,
# which is why Income is shown as a float column.
merged_left = df.merge(df2, how="left", on="Id")
merged_left

Unnamed: 0,Id,Age,Gender,Income
0,1,12,M,300.0
1,2,13,F,400.0
2,3,14,M,
3,4,15,F,
4,5,16,M,


In [29]:
# Right join on Id: keep every row of df2 and attach Age/Gender from df where
# the Id matches. Id 10 does not exist in df, so its Age and Gender are NaN
# (and Age is shown as float as a result).
merged_right = df.merge(df2, how="right", on="Id")
merged_right

Unnamed: 0,Id,Age,Gender,Income
0,1,12.0,M,300
1,2,13.0,F,400
2,10,,,500


In [30]:
# Inner join on Id: keep only the Ids present in both frames (1 and 2),
# so no missing values are introduced and the integer dtypes are preserved.
merged_inner = df.merge(df2, how="inner", on="Id")
merged_inner

Unnamed: 0,Id,Age,Gender,Income
0,1,12,M,300
1,2,13,F,400


In [31]:
# Outer join on Id: keep the union of Ids from both frames. Cells with no
# matching row on one side are filled with NaN (Income for Ids 3-5,
# Age and Gender for Id 10).
merged_outer = df.merge(df2, how="outer", on="Id")
merged_outer

Unnamed: 0,Id,Age,Gender,Income
0,1,12.0,M,300.0
1,2,13.0,F,400.0
2,3,14.0,M,
3,4,15.0,F,
4,5,16.0,M,
5,10,,,500.0


In [32]:
# Boolean frame of the same shape as merged_outer: True wherever a value is missing.
merged_outer.isna()

Unnamed: 0,Id,Age,Gender,Income
0,False,False,False,False
1,False,False,False,False
2,False,False,False,True
3,False,False,False,True
4,False,False,False,True
5,False,True,True,False


In [34]:
# Look up a single cell by row label and column label, then inspect its Python type:
# values in the object-dtype Gender column are plain Python str objects.
type(merged_inner.loc[1, "Gender"])

str

In [35]:
# Frequency of each Gender value among the rows that survived the inner join.
merged_inner["Gender"].value_counts()

Gender
M    1
F    1
Name: count, dtype: int64

In [36]:
# Rich (table) display of the outer-join result.
merged_outer

Unnamed: 0,Id,Age,Gender,Income
0,1,12.0,M,300.0
1,2,13.0,F,400.0
2,3,14.0,M,
3,4,15.0,F,
4,5,16.0,M,
5,10,,,500.0


In [37]:
# print() renders the frame as plain, column-aligned text instead of the
# notebook's table view; missing values appear explicitly as NaN.
print(merged_outer)

   Id   Age Gender  Income
0   1  12.0      M   300.0
1   2  13.0      F   400.0
2   3  14.0      M     NaN
3   4  15.0      F     NaN
4   5  16.0      M     NaN
5  10   NaN    NaN   500.0


In [40]:
# Structural summary of df2: both columns are fully populated int64 columns.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Id      3 non-null      int64
 1   Income  3 non-null      int64
dtypes: int64(2)
memory usage: 180.0 bytes


In [39]:
# Structural summary of the outer-join result. Age and Income are float64 here
# (they were int64 in the source frames) because they now contain NaN;
# the non-null counts show how many values each column is missing.
merged_outer.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Id      6 non-null      int64  
 1   Age     5 non-null      float64
 2   Gender  5 non-null      object 
 3   Income  3 non-null      float64
dtypes: float64(2), int64(1), object(1)
memory usage: 324.0+ bytes


In [41]:
# Keep only the object-dtype columns of the outer-join result
# (here just Gender); the result is still a DataFrame.
obj = merged_outer.select_dtypes(include=["object"])
obj

Unnamed: 0,Gender
0,M
1,F
2,M
3,F
4,M
5,


In [43]:
# Filtering: build a boolean mask and use it to select rows.
# Rows whose Gender is missing compare as False and are therefore dropped.
merged_outer.loc[merged_outer["Gender"].eq("F")]

Unnamed: 0,Id,Age,Gender,Income
1,2,13.0,F,400.0
3,4,15.0,F,


In [45]:
# Group by: average Age per Gender.
# Rows with a missing Gender do not form a group, so only F and M appear.
group = (
    merged_outer
    .groupby("Gender")["Age"]
    .mean()
)
group

Gender
F    14.0
M    14.0
Name: Age, dtype: float64

In [49]:
# Sort the outer-join result by Age, oldest first. Rows whose Age is missing
# are placed at the end (pandas' default na_position="last"), regardless of
# the sort direction.
# The result is bound to `sorted_by_age` rather than `sorted`: assigning to
# `sorted` would shadow Python's built-in sorted() for the rest of the kernel
# session, so any later call such as sorted(some_list) would fail.
sorted_by_age = merged_outer.sort_values(by="Age", ascending=False)
sorted_by_age

Unnamed: 0,Id,Age,Gender,Income
4,5,16.0,M,
3,4,15.0,F,
2,3,14.0,M,
1,2,13.0,F,400.0
0,1,12.0,M,300.0
5,10,,,500.0
