In [2]:
import numpy as np
import pandas as pd
# Three sample records, written row-wise so each one can be read at a glance.
columns = ["total_bill", "tip", "sex", "smoker", "day", "time", "size"]
records = [
    (16.99, 1.01, "Female", "No", "Sun", "Dinner", 2),
    (10.34, 1.66, "Male", "No", "Sun", "Dinner", 3),
    (21.01, 3.5, "Male", "No", "Sun", "Dinner", 3),
]

# Pivot the records into the column -> list-of-values mapping that
# pd.DataFrame consumes.
data = {name: [record[pos] for record in records] for pos, name in enumerate(columns)}
df = pd.DataFrame(data)
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3


In [3]:
# A list of labels selects several columns at once and yields a DataFrame
# holding only 'total_bill' and 'tip'.
bill_columns = ['total_bill', 'tip']
df[bill_columns]

Unnamed: 0,total_bill,tip
0,16.99,1.01
1,10.34,1.66
2,21.01,3.5


In [4]:
# Positional selection: a single integer yields the row as a Series,
# while a slice yields a DataFrame.
first_row = df.iloc[0]
first_two_rows = df.iloc[:2]
print(first_row)
print(first_two_rows)

total_bill     16.99
tip             1.01
sex           Female
smoker            No
day              Sun
time          Dinner
size               2
Name: 0, dtype: object
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3


In [7]:
# Boolean filtering. A single mask works on its own
# (df[is_large_bill] keeps rows whose 'total_bill' is greater than 15);
# several masks are combined element-wise with `&`.
is_large_bill = df['total_bill'] > 15
is_male = df['sex'] == 'Male'
df[is_large_bill & is_male]

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
2,21.01,3.5,Male,No,Sun,Dinner,3


In [8]:
# Group on two keys, then aggregate each column with its own function:
# mean of 'total_bill' and sum of 'tip' per (day, smoker) pair.
# (For a single statistic: df.groupby('day')['total_bill'].mean().)
by_day_and_smoker = df.groupby(['day', 'smoker'])
by_day_and_smoker.agg({'total_bill': 'mean', 'tip': 'sum'})


Unnamed: 0_level_0,Unnamed: 1_level_0,total_bill,tip
day,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1
Sun,No,16.113333,6.17


In [9]:
# drop() returns a new frame without the rows labelled 0 and 1;
# df itself is left untouched because the result is not assigned.
row_labels = [0, 1]
df.drop(index=row_labels)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
2,21.01,3.5,Male,No,Sun,Dinner,3


In [11]:
# Increase every tip by 10%.
# The raise is computed from the original values held in `data` rather than
# from df['tip'] itself, so re-running this cell does not compound the
# increase (running `df['tip'] = df['tip'] * 1.1` twice yields a 21% raise,
# not 10%).
original_tip = pd.Series(data['tip'], index=df.index)
df['tip'] = original_tip * 1.1
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.2221,Female,No,Sun,Dinner,2
1,10.34,2.0086,Male,No,Sun,Dinner,3
2,21.01,4.235,Male,No,Sun,Dinner,3


In [12]:
# Derive a new column as the element-wise sum of two existing ones.
bill = df['total_bill']
df['total_amount'] = bill.add(df['tip'])
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,total_amount
0,16.99,1.2221,Female,No,Sun,Dinner,2,18.2121
1,10.34,2.0086,Male,No,Sun,Dinner,3,12.3486
2,21.01,4.235,Male,No,Sun,Dinner,3,25.245


In [13]:
# Two small frames sharing a 'key' column; only keys 'A' and 'B' occur in both.
df1 = pd.DataFrame([('A', 1), ('B', 2), ('C', 3)], columns=['key', 'value1'])
df2 = pd.DataFrame([('A', 4), ('B', 5), ('D', 6)], columns=['key', 'value2'])

# An inner join on 'key' keeps just the rows whose key is present in both frames.
df1.merge(df2, on='key', how='inner')


Unnamed: 0,key,value1,value2
0,A,1,4
1,B,2,5


In [14]:
# join() aligns on the index, so both frames are re-indexed by 'key' first;
# how='inner' keeps only the keys found in both.
left_by_key = df1.set_index('key')
right_by_key = df2.set_index('key')
left_by_key.join(right_by_key, how='inner')

Unnamed: 0_level_0,value1,value2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,1,4
B,2,5


In [16]:
# Drops every row that contains at least one NaN. The result is neither
# assigned nor the last expression of the cell, so it is not displayed.
df.dropna(how='any')

# Only NaNs in 'total_bill' or 'tip' cause a row to be dropped here.
required_columns = ['total_bill', 'tip']
df.dropna(subset=required_columns)


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,total_amount
0,16.99,1.2221,Female,No,Sun,Dinner,2,18.2121
1,10.34,2.0086,Male,No,Sun,Dinner,3,12.3486
2,21.01,4.235,Male,No,Sun,Dinner,3,25.245


In [17]:
# Encode 'sex' as binary values: Male -> 1, Female -> 0.
# Series.map turns any value missing from the mapping into NaN, so mapping the
# already-encoded column a second time would wipe it out. Encoding from the
# original labels held in `data` keeps this cell safe to re-run.
sex_codes = {'Male': 1, 'Female': 0}
original_sex = pd.Series(data['sex'], index=df.index)
df['sex'] = original_sex.map(sex_codes)
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,total_amount
0,16.99,1.2221,0,No,Sun,Dinner,2,18.2121
1,10.34,2.0086,1,No,Sun,Dinner,3,12.3486
2,21.01,4.235,1,No,Sun,Dinner,3,25.245


In [19]:
# Relabel the smoker flags with descriptive text.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df['smoker']: that form is chained assignment,
# which recent pandas versions warn about and which no longer updates df
# under Copy-on-Write.
smoker_labels = {'No': 'Non-smoker', 'Yes': 'Smoker'}
df['smoker'] = df['smoker'].replace(smoker_labels)
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,total_amount
0,16.99,1.2221,0,Non-smoker,Sun,Dinner,2,18.2121
1,10.34,2.0086,1,Non-smoker,Sun,Dinner,3,12.3486
2,21.01,4.235,1,Non-smoker,Sun,Dinner,3,25.245


In [21]:
# Flag each row 'Yes' when its tip is greater than 3, otherwise 'No'.
tip_above_three = df['tip'] > 3
df['big_tip'] = np.where(tip_above_three, 'Yes', 'No')
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,total_amount,big_tip
0,16.99,1.2221,0,Non-smoker,Sun,Dinner,2,18.2121,No
1,10.34,2.0086,1,Non-smoker,Sun,Dinner,3,12.3486,No
2,21.01,4.235,1,Non-smoker,Sun,Dinner,3,25.245,Yes


In [22]:
# Per-day summary: mean and sum of 'total_bill', plus the largest 'tip'.
# A list of functions for one column produces a two-level column header.
per_day_stats = {'total_bill': ['mean', 'sum'], 'tip': 'max'}
df.groupby('day').agg(per_day_stats)


Unnamed: 0_level_0,total_bill,total_bill,tip
Unnamed: 0_level_1,mean,sum,max
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Sun,16.113333,48.34,4.235


In [24]:
# Bucket tips into labelled ranges. pd.cut builds right-closed intervals by
# default (right=True, spelled out below), so each bin excludes its lower
# edge and includes its upper edge:
#   Low:    0 < tip <= 2
#   Medium: 2 < tip <= 4
#   High:   4 < tip <= 6
# Tips outside (0, 6] are assigned NaN.
tip_bins = [0, 2, 4, 6]
tip_labels = ['Low', 'Medium', 'High']
df['tip_category'] = pd.cut(df['tip'], bins=tip_bins, labels=tip_labels, right=True)
df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,total_amount,big_tip,tip_category
0,16.99,1.2221,0,Non-smoker,Sun,Dinner,2,18.2121,No,Low
1,10.34,2.0086,1,Non-smoker,Sun,Dinner,3,12.3486,No,Medium
2,21.01,4.235,1,Non-smoker,Sun,Dinner,3,25.245,Yes,High
