In [1]:
import pandas as pd

In [2]:
df = pd.DataFrame({
    "A":[420, 380, 390],
    "B": [50, 40, 45]
})
df['C'] = df["A"] * df["B"]
df

Unnamed: 0,A,B,C
0,420,50,21000
1,380,40,15200
2,390,45,17550


In [3]:
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'subscribed': [True, False, True]
})
df

Unnamed: 0,name,subscribed
0,Alice,True
1,Bob,False
2,Charlie,True


In [4]:
df['subscribed'] = df['subscribed'].astype(int)
df

Unnamed: 0,name,subscribed
0,Alice,1
1,Bob,0
2,Charlie,1


In [5]:
df.columns
for col in df.columns:
    print(col)

name
subscribed


In [6]:
list(df.columns)

['name', 'subscribed']

In [7]:
list(df.columns.values)

['name', 'subscribed']

In [8]:
sorted(df)

['name', 'subscribed']

In [9]:

df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'age': [25, 30, 35]
})
print(df.iloc[0])
df.iloc[0:2]  # First two rows
df.iloc[:, 1]  # All rows, second column

name    Alice
age        25
Name: 0, dtype: object


0    25
1    30
2    35
Name: age, dtype: int64

In [10]:
print(df.loc[0])
print(df['name'])
print(df.loc[:, 'age'])

name    Alice
age        25
Name: 0, dtype: object
0      Alice
1        Bob
2    Charlie
Name: name, dtype: object
0    25
1    30
2    35
Name: age, dtype: int64


In [11]:
import numpy as np
df = pd.DataFrame({
    'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
    'col2': [2, 1, 9, 8, 7, 4],
    'col3': [0, 1, 9, 4, 2, 3],
    'col4': ['a', 'B', 'c', 'D', 'e', 'F']
})

df.sort_values(by=['col1'])
df

Unnamed: 0,col1,col2,col3,col4
0,A,2,0,a
1,A,1,1,B
2,B,9,9,c
3,,8,4,D
4,D,7,2,e
5,C,4,3,F


In [12]:
df.sort_values(by=['col1', 'col2'])
df

Unnamed: 0,col1,col2,col3,col4
0,A,2,0,a
1,A,1,1,B
2,B,9,9,c
3,,8,4,D
4,D,7,2,e
5,C,4,3,F


In [13]:
df.sort_values(by='col1', ascending=False)

Unnamed: 0,col1,col2,col3,col4
4,D,7,2,e
5,C,4,3,F
2,B,9,9,c
0,A,2,0,a
1,A,1,1,B
3,,8,4,D


In [14]:
df = pd.DataFrame(np.random.rand(5,3), columns=['A', 'B', 'C'])
df

Unnamed: 0,A,B,C
0,0.163558,0.033916,0.145597
1,0.374932,0.310432,0.044898
2,0.993964,0.708378,0.128347
3,0.192911,0.515509,0.424899
4,0.55694,0.756924,0.770471


In [15]:
df['A'].idxmax()

2

In [16]:
df['A'].idxmin()

0

In [17]:
row_max_col = df.idxmax(axis=1)
print(row_max_col)
# Row-wise min
row_min_col = df.idxmin(axis=1)
print(row_min_col)

0    A
1    A
2    A
3    B
4    C
dtype: object
0    B
1    C
2    C
3    A
4    A
dtype: object


In [18]:
df = pd.DataFrame({
    'NAME': list('abcdef'),
    'On_Time': [True, False] * 3,       # Boolean column
    'On_Budget': [False, True] * 3      # Another boolean column
})
df

Unnamed: 0,NAME,On_Time,On_Budget
0,a,True,False
1,b,False,True
2,c,True,False
3,d,False,True
4,e,True,False
5,f,False,True


In [19]:
df.select_dtypes(include=['bool'])

Unnamed: 0,On_Time,On_Budget
0,True,False
1,False,True
2,True,False
3,False,True
4,True,False
5,False,True


In [20]:
mylist = list(df.select_dtypes(include=['bool']).columns)
mylist

['On_Time', 'On_Budget']

In [21]:
df = pd.DataFrame({
    'key': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'B', 'C'],
    'data': [0, 5, 10, 5, 10, 15, 10, 15, 20]
})
result = df.groupby('key')['data'].sum()
print(result)

key
A    15
B    30
C    45
Name: data, dtype: int64


In [22]:
df = pd.DataFrame({
    'row': [
        '10001 New York, NY',
        '20001 Washington, DC',
        '94105 San Francisco, CA'
    ]
})
print(df)

                       row
0       10001 New York, NY
1     20001 Washington, DC
2  94105 San Francisco, CA


In [23]:
df[['code', 'location']] = df['row'].str.split(n=1, expand=True)
df

Unnamed: 0,row,code,location
0,"10001 New York, NY",10001,"New York, NY"
1,"20001 Washington, DC",20001,"Washington, DC"
2,"94105 San Francisco, CA",94105,"San Francisco, CA"


In [24]:
if df.empty:
    print("DataFrame is empty")

In [25]:
df = pd.DataFrame({
    'c1': [10, 11, 12],
    'c2': [100, 110, 120]
})
for index, row in df.iterrows():
    print(index, row['c1'], row['c2'])

0 10 100
1 11 110
2 12 120


In [26]:
import matplotlib.pyplot as plt
df = pd.DataFrame({
    'salary': [30000, 35000, 40000, 45000, 50000, 1000000]
})

print(df.describe())

              salary
count        6.00000
mean    200000.00000
std     391982.14245
min      30000.00000
25%      36250.00000
50%      42500.00000
75%      48750.00000
max    1000000.00000


In [27]:
# Original DataFrame
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'score': [90, 80, 70]
})

print("Original DataFrame:")
print(df)

# Without .copy(): Risk of modifying the original DataFrame
subset1 = df[df['score'] > 75]  # No copy used
subset1.loc[0, 'score'] = 999

print("\nWithout .copy() — modified subset1:")
print(subset1)

print("\nOriginal DataFrame after modifying subset1 (may or may not change):")
print(df)

# Reset original DataFrame for clean testing
df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'score': [90, 80, 70]
})

# With .copy(): Safe to modify without affecting original
subset2 = df[df['score'] > 75].copy()
subset2.loc[0, 'score'] = 888

print("\nWith .copy() — modified subset2:")
print(subset2)

print("\nOriginal DataFrame after modifying subset2 (unchanged):")
print(df)


Original DataFrame:
      name  score
0    Alice     90
1      Bob     80
2  Charlie     70

Without .copy() — modified subset1:
    name  score
0  Alice    999
1    Bob     80

Original DataFrame after modifying subset1 (may or may not change):
      name  score
0    Alice     90
1      Bob     80
2  Charlie     70

With .copy() — modified subset2:
    name  score
0  Alice    888
1    Bob     80

Original DataFrame after modifying subset2 (unchanged):
      name  score
0    Alice     90
1      Bob     80
2  Charlie     70


In [28]:
s = pd.Series(['cat', 'dog', np.nan, 'rabbit'])
mapped_s = s.map({'cat': 'kitten', 'dog': 'puppy'})
print(mapped_s)

0    kitten
1     puppy
2       NaN
3       NaN
dtype: object


In [29]:
keep_v = s.map({'cat': 'kitten', 'dog': 'puppy'}).fillna(s)
keep_v

0    kitten
1     puppy
2       NaN
3    rabbit
dtype: object

In [30]:
df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
df.applymap(lambda x:x**2)

  df.applymap(lambda x:x**2)


Unnamed: 0,0,1
0,1.0,4.4944
1,11.262736,20.857489


In [31]:
df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
df

Unnamed: 0,A,B
0,4,9
1,4,9
2,4,9


In [32]:
df.apply(np.sqrt)

Unnamed: 0,A,B
0,2.0,3.0
1,2.0,3.0
2,2.0,3.0


In [33]:
df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
df1

Unnamed: 0,a,b
0,foo,1
1,bar,2


In [34]:
df2

Unnamed: 0,a,c
0,foo,3
1,baz,4


In [35]:
df1.merge(df2, how='inner', on='a')

Unnamed: 0,a,b,c
0,foo,1,3


In [36]:
df1.merge(df2, how="left", on='a')

Unnamed: 0,a,b,c
0,foo,1,3.0
1,bar,2,


In [37]:
df1.merge(df2, how='right', on='a')

Unnamed: 0,a,b,c
0,foo,1.0,3
1,baz,,4


In [38]:
df1.merge(df2, how='outer', on='a')

Unnamed: 0,a,b,c
0,bar,2.0,
1,baz,,4.0
2,foo,1.0,3.0


In [39]:
df = pd.DataFrame({
    'categories': {
        0: ['A', 'B'],
        1: ['B', 'C', 'D'],
        2: ['B', 'D']
    }
})

In [40]:
import collections
counts = df['categories'].apply(collections.Counter)
print(counts)
encoded = pd.DataFrame.from_records(counts).fillna(0).astype(int)
print(encoded)

0            {'A': 1, 'B': 1}
1    {'B': 1, 'C': 1, 'D': 1}
2            {'B': 1, 'D': 1}
Name: categories, dtype: object
   A  B  C  D
0  1  1  0  0
1  0  1  1  1
2  0  1  0  1


In [41]:
df = pd.DataFrame({
    'a': ['A', 'A', 'B', 'B', 'B', 'C'],
    'b': [1, 2, 5, 5, 4, 6]
})
df1 = df.groupby('a')['b'].apply(list).reset_index(name='new')
df1

Unnamed: 0,a,new
0,A,"[1, 2]"
1,B,"[5, 5, 4]"
2,C,[6]


In [42]:
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva'],
    'Country': ['US', 'UK', 'China', 'Germany', 'India']
})
df

Unnamed: 0,Name,Country
0,Alice,US
1,Bob,UK
2,Charlie,China
3,David,Germany
4,Eva,India


In [43]:
selected_countires = ['UK','China']
df1 = df[df['Country'].isin(selected_countires)]
df1

Unnamed: 0,Name,Country
1,Bob,UK
2,Charlie,China


In [44]:
df2 = df[~df['Country'].isin(selected_countires)]
df2

Unnamed: 0,Name,Country
0,Alice,US
3,David,Germany
4,Eva,India


In [45]:
df3 = df.query("Country in @selected_countires")
df3

Unnamed: 0,Name,Country
1,Bob,UK
2,Charlie,China


In [46]:
df4 = df.query("Country not in @selected_countires")
df4

Unnamed: 0,Name,Country
0,Alice,US
3,David,Germany
4,Eva,India


In [47]:
df = pd.DataFrame({
    'A': 'foo bar foo bar foo bar foo foo'.split(),
    'B': 'one one two three two two one three'.split(),
    'C': np.arange(8),
    'D': np.arange(8) * 2
})
df

Unnamed: 0,A,B,C,D
0,foo,one,0,0
1,bar,one,1,2
2,foo,two,2,4
3,bar,three,3,6
4,foo,two,4,8
5,bar,two,5,10
6,foo,one,6,12
7,foo,three,7,14


In [48]:
m = df['A'] != 'foo'

In [49]:
a, b = df[m], df[~m]
print(a, b)

     A      B  C   D
1  bar    one  1   2
3  bar  three  3   6
5  bar    two  5  10      A      B  C   D
0  foo    one  0   0
2  foo    two  2   4
4  foo    two  4   8
6  foo    one  6  12
7  foo  three  7  14


In [50]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
df = pd.DataFrame({
    'city': ['Paris', 'London', 'Berlin'],
    'age': [30, 25, 40]
})
ct = ColumnTransformer([
    ('encode_city', OneHotEncoder(), ['city'])
], remainder='passthrough')
encoded = ct.fit_transform(df)
encoded_df = pd.DataFrame(encoded, columns=ct.get_feature_names_out())

print(encoded_df)

   encode_city__city_Berlin  encode_city__city_London  \
0                       0.0                       0.0   
1                       0.0                       1.0   
2                       1.0                       0.0   

   encode_city__city_Paris  remainder__age  
0                      1.0            30.0  
1                      0.0            25.0  
2                      0.0            40.0  


In [51]:
from sklearn.preprocessing import LabelEncoder
df = pd.DataFrame({
    'city': ['Paris', 'London', 'Berlin', 'Paris', 'London']
})

le = LabelEncoder()

df['city_encoded'] = le.fit_transform(df['city'])

# Show result
print(df)

     city  city_encoded
0   Paris             2
1  London             1
2  Berlin             0
3   Paris             2
4  London             1


In [52]:
ages = [5, 15, 25, 35, 45]
bins = pd.cut(ages, bins=3)
print(bins)
scores = [55, 65, 75, 85, 95]
quantiles = pd.qcut(scores, q=3)
print(quantiles)

[(4.96, 18.333], (4.96, 18.333], (18.333, 31.667], (31.667, 45.0], (31.667, 45.0]]
Categories (3, interval[float64, right]): [(4.96, 18.333] < (18.333, 31.667] < (31.667, 45.0]]
[(54.999, 68.333], (54.999, 68.333], (68.333, 81.667], (81.667, 95.0], (81.667, 95.0]]
Categories (3, interval[float64, right]): [(54.999, 68.333] < (68.333, 81.667] < (81.667, 95.0]]


In [53]:
df = pd.DataFrame({
    'price': [10, 20, 30],
    'quantity': [1, 2, 3]
})

In [54]:
for index, row in df.iterrows():
    total = row['price'] * row['quantity']
    print(f"Row {index} total: {total}")

Row 0 total: 10
Row 1 total: 40
Row 2 total: 90


In [55]:
for row in df.itertuples(index=True):
    total = row.price * row.quantity
    print(f"Row {row.Index} total: {total}")

Row 0 total: 10
Row 1 total: 40
Row 2 total: 90


In [56]:
df['total'] = df.apply(lambda row:row['price'] * row['quantity'], axis=1)
print(df)

   price  quantity  total
0     10         1     10
1     20         2     40
2     30         3     90


In [57]:
df['total'] = df['price'] * df['quantity']
print(df)

   price  quantity  total
0     10         1     10
1     20         2     40
2     30         3     90


In [58]:
data = pd.Series(['1', '2', '3', 'invalid'])
pd.to_numeric(data, errors='coerce')

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [59]:
df = pd.DataFrame({'col1': ['1', '2', '3']})
df['col1'] = df['col1'].astype(int)

In [60]:
df = pd.DataFrame({
    "Col X": ['class 1', 'class 2', 'class 3', 'class 2'],
    "Col Y": ['cat 1', 'cat 1', 'cat 2', 'cat 3']
})
df

Unnamed: 0,Col X,Col Y
0,class 1,cat 1
1,class 2,cat 1
2,class 3,cat 2
3,class 2,cat 3


In [61]:
df.pivot_table(index='Col X', columns='Col Y', aggfunc=len, fill_value=0)

Col Y,cat 1,cat 2,cat 3
Col X,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
class 1,1,0,0
class 2,1,0,1
class 3,0,1,0


In [62]:
df.groupby(['Col X', 'Col Y']).size().unstack('Col Y', fill_value=0)

Col Y,cat 1,cat 2,cat 3
Col X,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
class 1,1,0,0
class 2,1,0,1
class 3,0,1,0


In [63]:
import pandas as pd

df = pd.DataFrame({
    'Subject': ['Math', 'Science'],
    'Alice': [90, 95],
    'Bob': [85, 80]
})
df.set_index('Subject', inplace=True)
print(df)

         Alice  Bob
Subject            
Math        90   85
Science     95   80


In [64]:
df.stack()

Subject       
Math     Alice    90
         Bob      85
Science  Alice    95
         Bob      80
dtype: int64

In [65]:
df1 = pd.DataFrame({
    'id': [1, 2, 3],
    'name': ['Alice', 'Bob', 'Charlie']
}).set_index('id')

df2 = pd.DataFrame({
    'id': [1, 2],
    'age': [24, 30]
}).set_index('id')

In [66]:
result = df1.join(df2)
result

Unnamed: 0_level_0,name,age
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Alice,24.0
2,Bob,30.0
3,Charlie,


In [67]:
df1_reset = df1.reset_index()
df2_reset = df2.reset_index()
reseult = pd.merge(df1_reset, df2_reset, on="id")
reseult

Unnamed: 0,id,name,age
0,1,Alice,24
1,2,Bob,30


In [68]:
df1 = pd.DataFrame({'name': ['Alice', 'Bob']})
df2 = pd.DataFrame({'name': ['Charlie', 'David']})

result = pd.concat([df1, df2], ignore_index=True)
print(result)

      name
0    Alice
1      Bob
2  Charlie
3    David


In [69]:
df1 = pd.DataFrame({'name': ['Alice', 'Bob']})
df2 = pd.DataFrame({'age': [24, 30]})

result = pd.concat([df1, df2], axis=1)
print(result)

    name  age
0  Alice   24
1    Bob   30


In [70]:
df = pd.DataFrame(
    [[0, 2, 3],
     [0, 4, 1],
     [10, 20, 30]],
    columns=['A', 'B', 'C'],
    index=['first', 'second', 'third']
)

print(df)


         A   B   C
first    0   2   3
second   0   4   1
third   10  20  30


In [72]:
value_at = df.at['second', 'B']
print(value_at)

4


In [73]:
value_iat = df.iat[1,1]
print(value_at)

4


In [74]:
df = pd.DataFrame({'A': [1, np.nan, 3, np.nan, 5]})
print(df.fillna(0))

     A
0  1.0
1  0.0
2  3.0
3  0.0
4  5.0


In [75]:
df = pd.DataFrame({'A': [1, np.nan, 3, np.nan, 5]})
print(df.interpolate())

     A
0  1.0
1  2.0
2  3.0
3  4.0
4  5.0


In [76]:
df = pd.DataFrame({
    'A': [1, np.nan, 3],
    'B': [4, 5, np.nan]
})

In [77]:
mask = df.isna()
mask

Unnamed: 0,A,B
0,False,False
1,True,False
2,False,True


In [81]:
df_filled = df.mask(mask, 0)
df_filled

Unnamed: 0,A,B
0,1.0,4.0
1,0.0,5.0
2,3.0,0.0
