In [1]:
import pandas as pd
import numpy as np

##  7.1 Handling Missing Data

In [2]:
# for float64 dtype pandas uses NaN for missing values
data = pd.Series([1, 2.4, np.nan, 4.2])
data

0    1.0
1    2.4
2    NaN
3    4.2
dtype: float64

In [3]:
# The isna method gives us a Boolean Series with True where values are null:
data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
# The built-in Python None value is also treated as NA:
s = pd.Series(["Mango", np.nan, "Banana", None, "Avocado"])

In [5]:
s

0      Mango
1        NaN
2     Banana
3       None
4    Avocado
dtype: object

In [6]:
s.isna()

0    False
1     True
2    False
3     True
4    False
dtype: bool

In [7]:
s = pd.Series([1, 2, 3, None], dtype = "float64")

In [8]:
s

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [9]:
s.isna()

0    False
1    False
2    False
3     True
dtype: bool

#### 1. Detecting Missing Values: isna() and notna()

In [10]:
data = {
    'A': [1, np.nan, 3, 4, None],
    'B': [np.nan, 2, 3, None, 5],
    'C': ['a', 'b', None, 'd', 'e']
}
df = pd.DataFrame(data)

In [11]:
df

Unnamed: 0,A,B,C
0,1.0,,a
1,,2.0,b
2,3.0,3.0,
3,4.0,,d
4,,5.0,e


In [12]:
# using isna() method to detect missing values
df.isna()

Unnamed: 0,A,B,C
0,False,True,False
1,True,False,False
2,False,False,True
3,False,True,False
4,True,False,False


In [13]:
# using notna() to detect non-missing values
df.notna()

Unnamed: 0,A,B,C
0,True,False,True
1,False,True,True
2,True,True,False
3,True,False,True
4,False,True,True


#### 2. Dropping Missing Data: dropna()

In [14]:
s =  pd.Series([1, np.nan, 3.5, np.nan, 7])

In [15]:
s

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [16]:
#  On a Series, it returns the Series with only the nonnull data and index values
s.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [17]:
# same thing as
s[s.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [18]:
data = {
    "A": [1, 2, np.nan, np.nan, 5],
    "B": [np.nan, 2, 3, np.nan, 5],
    "C": [1, np.nan, np.nan, np.nan, 5],
    "D": [1, 2, 3, 4, 5]
}

df = pd.DataFrame(data)

In [19]:
df

Unnamed: 0,A,B,C,D
0,1.0,,1.0,1
1,2.0,2.0,,2
2,,3.0,,3
3,,,,4
4,5.0,5.0,5.0,5


In [20]:
# Drop all rows containing atleast one missing values
df.dropna()

Unnamed: 0,A,B,C,D
4,5.0,5.0,5.0,5


In [21]:
# Drop all columns containing atleast one missing values
df.dropna(axis = 1)

Unnamed: 0,D
0,1
1,2
2,3
3,4
4,5


In [22]:
# drop all rows if all values are missing
df.dropna(how = 'all')   # since there is no row with all missing values no row will be dropped

Unnamed: 0,A,B,C,D
0,1.0,,1.0,1
1,2.0,2.0,,2
2,,3.0,,3
3,,,,4
4,5.0,5.0,5.0,5


In [23]:
# drop all columns if all values are missing
df.dropna(axis = 1, how = 'all') 

Unnamed: 0,A,B,C,D
0,1.0,,1.0,1
1,2.0,2.0,,2
2,,3.0,,3
3,,,,4
4,5.0,5.0,5.0,5


In [24]:
# Drop rows that don't have at least 3 non-missing values
df.dropna(thresh = 3)   # In this case "row 3" and "row 2" is dropped since it has only one non-missing value

Unnamed: 0,A,B,C,D
0,1.0,,1.0,1
1,2.0,2.0,,2
4,5.0,5.0,5.0,5


In [25]:
# Drop columns that don't have at least 3 non-missing values
df.dropna(axis = 1, thresh = 3)  # In this case "column C" is dropped

Unnamed: 0,A,B,D
0,1.0,,1
1,2.0,2.0,2
2,,3.0,3
3,,,4
4,5.0,5.0,5


#### 3. Filling Missing Data: fillna()

In [26]:
# fillna() method on series
s = pd.Series([4.5, np.nan, 3.8, 5.0, np.nan, 4.2, np.nan, 3.9], 
              index=["Alice", "Bob", "Charlie", "David", "Emma", "Frank", "Grace", "Hannah"])

In [27]:
s

Alice      4.5
Bob        NaN
Charlie    3.8
David      5.0
Emma       NaN
Frank      4.2
Grace      NaN
Hannah     3.9
dtype: float64

In [28]:
# fill missing values with number 4
s.fillna(4)

Alice      4.5
Bob        4.0
Charlie    3.8
David      5.0
Emma       4.0
Frank      4.2
Grace      4.0
Hannah     3.9
dtype: float64

In [29]:
# fill missing values with mean of other values
s.fillna(s.mean())

Alice      4.50
Bob        4.28
Charlie    3.80
David      5.00
Emma       4.28
Frank      4.20
Grace      4.28
Hannah     3.90
dtype: float64

In [30]:
# forward fill
s.ffill()

Alice      4.5
Bob        4.5
Charlie    3.8
David      5.0
Emma       5.0
Frank      4.2
Grace      4.2
Hannah     3.9
dtype: float64

In [31]:
# backward fill
s.bfill()

Alice      4.5
Bob        3.8
Charlie    3.8
David      5.0
Emma       4.2
Frank      4.2
Grace      3.9
Hannah     3.9
dtype: float64

In [32]:
data = {
    "Product": ["Laptop", "Phone", "Tablet", "Monitor", "Keyboard", "Mouse", "Headphones"],
    "Price": [1000, 700, np.nan, 300, 50, np.nan, 150],  # Some missing prices
    "Units Sold": [10, np.nan, 5, np.nan, 20, 15, np.nan],  # Some missing sales data
    "Category": ["Electronics", "Electronics", "Electronics", "Electronics", "Accessories", "Accessories", "Accessories"]
}

df = pd.DataFrame(data)

In [33]:
df

Unnamed: 0,Product,Price,Units Sold,Category
0,Laptop,1000.0,10.0,Electronics
1,Phone,700.0,,Electronics
2,Tablet,,5.0,Electronics
3,Monitor,300.0,,Electronics
4,Keyboard,50.0,20.0,Accessories
5,Mouse,,15.0,Accessories
6,Headphones,150.0,,Accessories


In [34]:
# fill missing values with specific values
df.fillna(99)

Unnamed: 0,Product,Price,Units Sold,Category
0,Laptop,1000.0,10.0,Electronics
1,Phone,700.0,99.0,Electronics
2,Tablet,99.0,5.0,Electronics
3,Monitor,300.0,99.0,Electronics
4,Keyboard,50.0,20.0,Accessories
5,Mouse,99.0,15.0,Accessories
6,Headphones,150.0,99.0,Accessories


In [35]:
# fiil missing values with specific value but for each column different value
df.fillna({"Price": 350, "Units Sold": 16})

Unnamed: 0,Product,Price,Units Sold,Category
0,Laptop,1000.0,10.0,Electronics
1,Phone,700.0,16.0,Electronics
2,Tablet,350.0,5.0,Electronics
3,Monitor,300.0,16.0,Electronics
4,Keyboard,50.0,20.0,Accessories
5,Mouse,350.0,15.0,Accessories
6,Headphones,150.0,16.0,Accessories


In [36]:
# fill missing values with their mean
df.fillna(df.mean(numeric_only = True))

Unnamed: 0,Product,Price,Units Sold,Category
0,Laptop,1000.0,10.0,Electronics
1,Phone,700.0,12.5,Electronics
2,Tablet,440.0,5.0,Electronics
3,Monitor,300.0,12.5,Electronics
4,Keyboard,50.0,20.0,Accessories
5,Mouse,440.0,15.0,Accessories
6,Headphones,150.0,12.5,Accessories


In [37]:
# fill a specific columns with their mean value
df['Price'] = df['Price'].fillna(df['Price'].mean())
df

Unnamed: 0,Product,Price,Units Sold,Category
0,Laptop,1000.0,10.0,Electronics
1,Phone,700.0,,Electronics
2,Tablet,440.0,5.0,Electronics
3,Monitor,300.0,,Electronics
4,Keyboard,50.0,20.0,Accessories
5,Mouse,440.0,15.0,Accessories
6,Headphones,150.0,,Accessories


In [38]:
# forward fill
df.ffill()

Unnamed: 0,Product,Price,Units Sold,Category
0,Laptop,1000.0,10.0,Electronics
1,Phone,700.0,10.0,Electronics
2,Tablet,440.0,5.0,Electronics
3,Monitor,300.0,5.0,Electronics
4,Keyboard,50.0,20.0,Accessories
5,Mouse,440.0,15.0,Accessories
6,Headphones,150.0,15.0,Accessories


In [39]:
# Backward fill
df.bfill()

Unnamed: 0,Product,Price,Units Sold,Category
0,Laptop,1000.0,10.0,Electronics
1,Phone,700.0,5.0,Electronics
2,Tablet,440.0,5.0,Electronics
3,Monitor,300.0,20.0,Electronics
4,Keyboard,50.0,20.0,Accessories
5,Mouse,440.0,15.0,Accessories
6,Headphones,150.0,,Accessories


### Examples

In [40]:
data = {
    'order_id': [101, 102, 103, 104, 105, 106],
    'customer_id': [1001, None, 1003, 1004, None, 1006],  # Some missing customer_ids
    'order_amount': [250.50, 300.00, None, 150.00, 400.75, None],  # Some missing order_amounts
    'order_date': [
        '2025-01-01',
        '2025-01-02',
        '2025-01-03',
        '2025-01-04',
        '2025-01-05',
        '2025-01-06'
    ]
}
df = pd.DataFrame(data).set_index('order_id')

In [41]:
df

Unnamed: 0_level_0,customer_id,order_amount,order_date
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
101,1001.0,250.5,2025-01-01
102,,300.0,2025-01-02
103,1003.0,,2025-01-03
104,1004.0,150.0,2025-01-04
105,,400.75,2025-01-05
106,1006.0,,2025-01-06


In [42]:
# number of missing values in each columns
df.isna().sum()

customer_id     2
order_amount    2
order_date      0
dtype: int64

In [43]:
df['customer_id'].isna() | df['order_amount'].isna()

order_id
101    False
102     True
103     True
104    False
105     True
106     True
dtype: bool

In [44]:
df = df.fillna({'order_amount':df['order_amount'].mean()})
df

Unnamed: 0_level_0,customer_id,order_amount,order_date
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
101,1001.0,250.5,2025-01-01
102,,300.0,2025-01-02
103,1003.0,275.3125,2025-01-03
104,1004.0,150.0,2025-01-04
105,,400.75,2025-01-05
106,1006.0,275.3125,2025-01-06


In [45]:
df = df.dropna(subset = ['customer_id'])

In [46]:
df

Unnamed: 0_level_0,customer_id,order_amount,order_date
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
101,1001.0,250.5,2025-01-01
103,1003.0,275.3125,2025-01-03
104,1004.0,150.0,2025-01-04
106,1006.0,275.3125,2025-01-06


In [47]:
timestamps = pd.date_range(start='2025-01-01 00:00:00', periods=10, freq='min')

data = {
    'sensor_1': [1.1, 1.2, np.nan, 1.4, 1.5, np.nan, 1.7, 1.8, 1.9, 2.0],
    'sensor_2': [2.1, np.nan, 2.3, 2.4, np.nan, 2.6, 2.7, np.nan, 2.9, 3.0],
    'sensor_3': [np.nan, 3.2, 3.3, np.nan, 3.5, 3.6, np.nan, 3.8, 3.9, 4.0]
}
df = pd.DataFrame(data, index = timestamps)

In [48]:
df

Unnamed: 0,sensor_1,sensor_2,sensor_3
2025-01-01 00:00:00,1.1,2.1,
2025-01-01 00:01:00,1.2,,3.2
2025-01-01 00:02:00,,2.3,3.3
2025-01-01 00:03:00,1.4,2.4,
2025-01-01 00:04:00,1.5,,3.5
2025-01-01 00:05:00,,2.6,3.6
2025-01-01 00:06:00,1.7,2.7,
2025-01-01 00:07:00,1.8,,3.8
2025-01-01 00:08:00,1.9,2.9,3.9
2025-01-01 00:09:00,2.0,3.0,4.0


In [49]:
df.isna().sum()

sensor_1    2
sensor_2    3
sensor_3    3
dtype: int64

In [50]:
df = df.ffill()

In [51]:
df

Unnamed: 0,sensor_1,sensor_2,sensor_3
2025-01-01 00:00:00,1.1,2.1,
2025-01-01 00:01:00,1.2,2.1,3.2
2025-01-01 00:02:00,1.2,2.3,3.3
2025-01-01 00:03:00,1.4,2.4,3.3
2025-01-01 00:04:00,1.5,2.4,3.5
2025-01-01 00:05:00,1.5,2.6,3.6
2025-01-01 00:06:00,1.7,2.7,3.6
2025-01-01 00:07:00,1.8,2.7,3.8
2025-01-01 00:08:00,1.9,2.9,3.9
2025-01-01 00:09:00,2.0,3.0,4.0


In [52]:
df.isna().sum()

sensor_1    0
sensor_2    0
sensor_3    1
dtype: int64

In [53]:
survey_data = {
    'Q1': [5, np.nan, 3, 4, np.nan],
    'Q2': [np.nan, 4, 3, np.nan, np.nan],
    'Q3': [2, 4, np.nan, 5, np.nan],
    'Q4': [np.nan, 3, np.nan, 4, np.nan]
}
survey = pd.DataFrame(survey_data)

In [54]:
survey

Unnamed: 0,Q1,Q2,Q3,Q4
0,5.0,,2.0,
1,,4.0,4.0,3.0
2,3.0,3.0,,
3,4.0,,5.0,4.0
4,,,,


In [55]:
survey.dropna(thresh = len(survey.columns)/2)

Unnamed: 0,Q1,Q2,Q3,Q4
0,5.0,,2.0,
1,,4.0,4.0,3.0
2,3.0,3.0,,
3,4.0,,5.0,4.0


## 7.2 Data Transformation

#### Detecting Duplicates

In [56]:
data = {
    'customer_id': [1, 2, 3, 2, 1, 4],
    'name': ['Alice', 'Bob', 'Charlie', 'Bob', 'Alice', 'David'],
    'email': [
        'alice@example.com', 
        'bob@example.com', 
        'charlie@example.com', 
        'bob@example.com', 
        'alice@example.com', 
        'david@example.com'
    ]
}
df = pd.DataFrame(data)

In [57]:
df

Unnamed: 0,customer_id,name,email
0,1,Alice,alice@example.com
1,2,Bob,bob@example.com
2,3,Charlie,charlie@example.com
3,2,Bob,bob@example.com
4,1,Alice,alice@example.com
5,4,David,david@example.com


In [58]:
# Identify duplicate rows based on all columns (default)
df.duplicated().sum()

np.int64(2)

In [59]:
# Identify duplicates based on 'customer_id' and 'email'
df.duplicated(subset = ['customer_id', 'email'], keep = False)

0     True
1     True
2    False
3     True
4     True
5    False
dtype: bool

####  Removing Duplicates with drop_duplicates()

In [60]:
df

Unnamed: 0,customer_id,name,email
0,1,Alice,alice@example.com
1,2,Bob,bob@example.com
2,3,Charlie,charlie@example.com
3,2,Bob,bob@example.com
4,1,Alice,alice@example.com
5,4,David,david@example.com


In [61]:
df.duplicated().sum()

np.int64(2)

In [62]:
df_unique = df.drop_duplicates()

In [63]:
df_unique

Unnamed: 0,customer_id,name,email
0,1,Alice,alice@example.com
1,2,Bob,bob@example.com
2,3,Charlie,charlie@example.com
5,4,David,david@example.com


In [64]:
orders_data = {
    'order_id': [101, 102, 103, 102, 104, 105, 101],
    'customer_id': [1001, 1002, 1003, 1002, 1004, 1005, 1001],
    'order_amount': [250.50, 300.00, 150.00, 300.00, 400.75, 120.00, 250.50],
    'order_date': [
        '2025-01-01', '2025-01-02', '2025-01-03',
        '2025-01-02', '2025-01-04', '2025-01-05', '2025-01-01'
    ]
}
orders = pd.DataFrame(orders_data)
# Convert order_date to datetime
orders['order_date'] = pd.to_datetime(orders['order_date'])

In [65]:
orders

Unnamed: 0,order_id,customer_id,order_amount,order_date
0,101,1001,250.5,2025-01-01
1,102,1002,300.0,2025-01-02
2,103,1003,150.0,2025-01-03
3,102,1002,300.0,2025-01-02
4,104,1004,400.75,2025-01-04
5,105,1005,120.0,2025-01-05
6,101,1001,250.5,2025-01-01


In [66]:
cleaned_orders = orders.drop_duplicates(subset = ['order_id', 'customer_id'])

In [67]:
cleaned_orders

Unnamed: 0,order_id,customer_id,order_amount,order_date
0,101,1001,250.5,2025-01-01
1,102,1002,300.0,2025-01-02
2,103,1003,150.0,2025-01-03
4,104,1004,400.75,2025-01-04
5,105,1005,120.0,2025-01-05


In [68]:
# removing all duplicated orders
cleaned_orders = orders.drop_duplicates(subset = ['order_id', 'customer_id'], keep = False)

In [69]:
cleaned_orders

Unnamed: 0,order_id,customer_id,order_amount,order_date
2,103,1003,150.0,2025-01-03
4,104,1004,400.75,2025-01-04
5,105,1005,120.0,2025-01-05


####  Transforming Data Using a Function or Mapping

In [70]:
df = pd.DataFrame({'Satisfaction_Score': [1, 3, 5, 2, 4]})

# Define a mapping for satisfaction levels
satisfaction_map = {
    1: 'Very Unsatisfied',
    2: 'Unsatisfied',
    3: 'Neutral',
    4: 'Satisfied',
    5: 'Very Satisfied'
}

In [71]:
df

Unnamed: 0,Satisfaction_Score
0,1
1,3
2,5
3,2
4,4


In [72]:
df['Satisfaction_Level'] = df['Satisfaction_Score'].map(satisfaction_map)

In [73]:
df

Unnamed: 0,Satisfaction_Score,Satisfaction_Level
0,1,Very Unsatisfied
1,3,Neutral
2,5,Very Satisfied
3,2,Unsatisfied
4,4,Satisfied


In [74]:
df = pd.DataFrame({'Sales': [100, 200, 300, 400, 500]})

In [75]:
df

Unnamed: 0,Sales
0,100
1,200
2,300
3,400
4,500


In [76]:
# let's add the column for logarthimic value for sales
df['Log_Sales'] = df['Sales'].apply(np.log)

In [77]:
df

Unnamed: 0,Sales,Log_Sales
0,100,4.60517
1,200,5.298317
2,300,5.703782
3,400,5.991465
4,500,6.214608


In [78]:
df = pd.DataFrame({'Name': ['  ALICE ', 'bob', '  Charlie  ']})

In [79]:
df

Unnamed: 0,Name
0,ALICE
1,bob
2,Charlie


In [80]:
df['Name'] = df['Name'].apply(lambda x: x.strip().title())

In [81]:
df

Unnamed: 0,Name
0,Alice
1,Bob
2,Charlie


####  Replacing Values

In [82]:
# Replacing a Single Value
df = pd.DataFrame({
    'Question1': [5, 'N/A', 3, 4],
    'Question2': [2, 3, 'N/A', 5]
  })

In [83]:
df

Unnamed: 0,Question1,Question2
0,5.0,2.0
1,,3.0
2,3.0,
3,4.0,5.0


In [84]:
df.replace("N/A", np.nan)

  df.replace("N/A", np.nan)


Unnamed: 0,Question1,Question2
0,5.0,2.0
1,,3.0
2,3.0,
3,4.0,5.0


In [85]:
# replacing single vallues
s = pd.Series([1.2, 1.4, 3.2, 100])

In [86]:
s

0      1.2
1      1.4
2      3.2
3    100.0
dtype: float64

In [87]:
s.replace(100, np.nan)

0    1.2
1    1.4
2    3.2
3    NaN
dtype: float64

In [88]:
df = pd.DataFrame({
    'Temperature': [23, -1, 25, 999, 20],
    'Humidity': [45, 999, 55, -1, 50]
})

In [89]:
df

Unnamed: 0,Temperature,Humidity
0,23,45
1,-1,999
2,25,55
3,999,-1
4,20,50


In [90]:
# let's replace -1 and 999 with a single value NaN
df.replace([-1, 999], np.nan)

Unnamed: 0,Temperature,Humidity
0,23.0,45.0
1,,
2,25.0,55.0
3,,
4,20.0,50.0


In [91]:
df

Unnamed: 0,Temperature,Humidity
0,23,45
1,-1,999
2,25,55
3,999,-1
4,20,50


In [92]:
# let's replace -1 with 31 and 999 with 26
df.replace([-1, 999], [31, 26])

Unnamed: 0,Temperature,Humidity
0,23,45
1,31,26
2,25,55
3,26,31
4,20,50


In [93]:
df = pd.DataFrame({
    'Status': [1, 2, 3, 2, 1],
    'Comments': ['ok', 'error', 'ok', 'unknown', 'error']
})

In [94]:
df

Unnamed: 0,Status,Comments
0,1,ok
1,2,error
2,3,ok
3,2,unknown
4,1,error


In [95]:
# let's replace the status with friendly message
stat = {1: 'New', 2: 'In progress', 3: 'Completed'}
df['Status'] = df['Status'].replace(stat)

In [96]:
df

Unnamed: 0,Status,Comments
0,New,ok
1,In progress,error
2,Completed,ok
3,In progress,unknown
4,New,error


In [97]:
# let's replace 'error' and 'unknown' on comments column with different message
comm = {'error': 'issue', 'unknown': 'pending'}
df = df.replace(comm)

In [98]:
df

Unnamed: 0,Status,Comments
0,New,ok
1,In progress,issue
2,Completed,ok
3,In progress,pending
4,New,issue


In [99]:
df = pd.DataFrame({
    'Description': ['Item123', 'Test456', 'Sample789', 'NoNumbers']
})

In [100]:
df

Unnamed: 0,Description
0,Item123
1,Test456
2,Sample789
3,NoNumbers


In [101]:
df = df.replace(r'\d+', '', regex = True)

In [102]:
df

Unnamed: 0,Description
0,Item
1,Test
2,Sample
3,NoNumbers


####  Renaming Axis Indexes

In [103]:
data = {
    "A": [10, 20, 30],
    "B": [40, 50, 60],
    "C": [70, 80, 90]
}

df = pd.DataFrame(data, index=["one", "two", "three"])

In [104]:
df

Unnamed: 0,A,B,C
one,10,40,70
two,20,50,80
three,30,60,90


In [105]:
def fun(x):
    return x.lower()

In [106]:
# lets apply the function to the columns lables
df.columns = df.columns.map(fun)

In [107]:
df

Unnamed: 0,a,b,c
one,10,40,70
two,20,50,80
three,30,60,90


In [108]:
df.rename(index = str.title, columns = str.upper)

Unnamed: 0,A,B,C
One,10,40,70
Two,20,50,80
Three,30,60,90


In [109]:
data = {
    'A': [10, 20, 30],
    'B': [40, 50, 60]
 }
df = pd.DataFrame(data, index=['r1', 'r2', 'r3'])

In [110]:
df

Unnamed: 0,A,B
r1,10,40
r2,20,50
r3,30,60


In [111]:
# renaming rows of the dataframe
df = df.rename(index = {'r1': 'row 1', 'r2': 'row 2', 'r3': 'row 3'})

In [112]:
df

Unnamed: 0,A,B
row 1,10,40
row 2,20,50
row 3,30,60


In [113]:
# renaming columns
df = df.rename(columns = {'A': 'Alpha', 'B': 'Beta'})

In [114]:
df

Unnamed: 0,Alpha,Beta
row 1,10,40
row 2,20,50
row 3,30,60


In [115]:
# applying functions
def rt(x):
    return x.title()

In [116]:
df = df.rename(index = rt)

In [117]:
df

Unnamed: 0,Alpha,Beta
Row 1,10,40
Row 2,20,50
Row 3,30,60


In [118]:
# using lambda function to transform columns
df = df.rename(columns = lambda c: 'col_' + c)

In [119]:
df

Unnamed: 0,col_Alpha,col_Beta
Row 1,10,40
Row 2,20,50
Row 3,30,60


In [120]:
df = pd.DataFrame({
    'X': [1, 2, 3],
    'Y': [4, 5, 6]
}, index=['a', 'b', 'c'])

In [121]:
df

Unnamed: 0,X,Y
a,1,4
b,2,5
c,3,6


In [122]:
df.index = ['alpha', 'beta', 'gamma']

In [123]:
df.columns = ['John', 'Alice']

In [124]:
df

Unnamed: 0,John,Alice
alpha,1,4
beta,2,5
gamma,3,6


In [125]:
# Using set_axis to change the columns
df = pd.DataFrame({
    'Col1': [100, 200],
    'Col2': [300, 400]
})

In [126]:
df

Unnamed: 0,Col1,Col2
0,100,300
1,200,400


In [127]:
# change name of index lables
df = df.set_axis(['Ron', 'Raya'])
df

Unnamed: 0,Col1,Col2
Ron,100,300
Raya,200,400


In [128]:
df = df.set_axis(['Week 1', 'Week 2'], axis = 1)
df

Unnamed: 0,Week 1,Week 2
Ron,100,300
Raya,200,400


In [129]:
df = pd.DataFrame({
    'Sales': [1000, 2000, 1500],
    'Profit': [100, 200, 150]
}, index=['Jan', 'Feb', 'Mar'])

# Name the axes
df.index.name = 'Month'
df.columns.name = 'Metrics'

In [130]:
df

Metrics,Sales,Profit
Month,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,1000,100
Feb,2000,200
Mar,1500,150


In [131]:
df = df.rename_axis(index = 'Period', columns = 'Fanancials')
df

Fanancials,Sales,Profit
Period,Unnamed: 1_level_1,Unnamed: 2_level_1
Jan,1000,100
Feb,2000,200
Mar,1500,150


#### Discretization and Binning

####  1. Discretization with pd.cut()

In [132]:
data = [1, 7, 5, 4, 6, 8, 10, 15, 20]

In [133]:
binned = pd.cut(data, bins = 3)

In [134]:
binned

[(0.981, 7.333], (0.981, 7.333], (0.981, 7.333], (0.981, 7.333], (0.981, 7.333], (7.333, 13.667], (7.333, 13.667], (13.667, 20.0], (13.667, 20.0]]
Categories (3, interval[float64, right]): [(0.981, 7.333] < (7.333, 13.667] < (13.667, 20.0]]

In [135]:
binned.categories

IntervalIndex([(0.981, 7.333], (7.333, 13.667], (13.667, 20.0]], dtype='interval[float64, right]')

In [136]:
bins = [0, 5, 10, 15, 20]  
labels = ['Low', 'Medium', 'High', 'Very High']

binned = pd.cut(data, bins=bins, labels=labels)

In [137]:
binned

['Low', 'Medium', 'Low', 'Low', 'Medium', 'Medium', 'Medium', 'High', 'Very High']
Categories (4, object): ['Low' < 'Medium' < 'High' < 'Very High']

In [138]:
ages = [15, 22, 27, 35, 42, 57, 63, 70, 18, 29, 31, 45, 50, 68, 80]

In [139]:
bins = [0, 18, 35, 60, np.inf]

In [140]:
age_categories = pd.cut(ages, bins = bins)

In [141]:
age_categories

[(0.0, 18.0], (18.0, 35.0], (18.0, 35.0], (18.0, 35.0], (35.0, 60.0], ..., (18.0, 35.0], (35.0, 60.0], (35.0, 60.0], (60.0, inf], (60.0, inf]]
Length: 15
Categories (4, interval[float64, right]): [(0.0, 18.0] < (18.0, 35.0] < (35.0, 60.0] < (60.0, inf]]

In [142]:
pd.Series.value_counts(age_categories)

(18.0, 35.0]    5
(35.0, 60.0]    4
(60.0, inf]     4
(0.0, 18.0]     2
Name: count, dtype: int64

In [143]:
# let's change the right inclusion
pd.cut(ages, bins, right=False)

[[0.0, 18.0), [18.0, 35.0), [18.0, 35.0), [35.0, 60.0), [35.0, 60.0), ..., [18.0, 35.0), [35.0, 60.0), [35.0, 60.0), [60.0, inf), [60.0, inf)]
Length: 15
Categories (4, interval[float64, left]): [[0.0, 18.0) < [18.0, 35.0) < [35.0, 60.0) < [60.0, inf)]

In [144]:
groups = ['Child', 'Young Adult', 'Adult', 'Senior']
age_catagory = pd.cut(ages, bins, labels = groups)

In [145]:
age_catagory

['Child', 'Young Adult', 'Young Adult', 'Young Adult', 'Adult', ..., 'Young Adult', 'Adult', 'Adult', 'Senior', 'Senior']
Length: 15
Categories (4, object): ['Child' < 'Young Adult' < 'Adult' < 'Senior']

In [146]:
pd.Series.value_counts(age_catagory)

Young Adult    5
Adult          4
Senior         4
Child          2
Name: count, dtype: int64

#### 2. Quantile-Based Discretization with pd.qcut()

In [147]:
data = [1, 7, 5, 4, 6, 8, 10, 15, 20]

In [148]:
pd.qcut(data, 4)

[(0.999, 5.0], (5.0, 7.0], (0.999, 5.0], (0.999, 5.0], (5.0, 7.0], (7.0, 10.0], (7.0, 10.0], (10.0, 20.0], (10.0, 20.0]]
Categories (4, interval[float64, right]): [(0.999, 5.0] < (5.0, 7.0] < (7.0, 10.0] < (10.0, 20.0]]

In [149]:
bins = [0, 5, 10, 15, 20]  
labels = ['Low', 'Medium', 'High', 'Very High']

In [150]:
pd.qcut(data, q=3, labels=['Low', 'Medium', 'High'])

['Low', 'Medium', 'Low', 'Low', 'Medium', 'Medium', 'High', 'High', 'High']
Categories (3, object): ['Low' < 'Medium' < 'High']

In [151]:
binned, bins = pd.qcut(data, q=4, retbins=True)

In [152]:
binned

[(0.999, 5.0], (5.0, 7.0], (0.999, 5.0], (0.999, 5.0], (5.0, 7.0], (7.0, 10.0], (7.0, 10.0], (10.0, 20.0], (10.0, 20.0]]
Categories (4, interval[float64, right]): [(0.999, 5.0] < (5.0, 7.0] < (7.0, 10.0] < (10.0, 20.0]]

In [153]:
bins

array([ 1.,  5.,  7., 10., 20.])

### Examples

In [154]:
np.random.seed(42)
df = pd.DataFrame({'Income': np.random.normal(50000, 15000, 1000)})

In [155]:
df.head()

Unnamed: 0,Income
0,57450.712295
1,47926.035482
2,59715.328072
3,72845.447846
4,46487.699379


In [156]:
# Define the bins and corresponding labels
bins = [0, 40000, 60000, 80000, np.inf]
labels = ['Low', 'Average', 'High', 'Very High']

In [157]:
# Categorize the Income column using pd.cut()
df['Income Level'] = pd.cut(df['Income'], bins = bins, labels = labels)

In [158]:
df.head()

Unnamed: 0,Income,Income Level
0,57450.712295,Average
1,47926.035482,Average
2,59715.328072,Average
3,72845.447846,High
4,46487.699379,Average


In [159]:
# Check the distribution of income levels
df['Income Level'].value_counts()

Income Level
Average      514
Low          243
High         219
Very High     24
Name: count, dtype: int64

In [160]:
df['Income Quantile'] = pd.qcut(df['Income'], 4, labels = ['Q1', 'Q2', 'Q3', 'Q4'])

In [161]:
df.head(10)

Unnamed: 0,Income,Income Level,Income Quantile
0,57450.712295,Average,Q3
1,47926.035482,Average,Q2
2,59715.328072,Average,Q3
3,72845.447846,High,Q4
4,46487.699379,Average,Q2
5,46487.945646,Average,Q2
6,73688.192233,High,Q4
7,61511.520937,High,Q4
8,42957.884211,Average,Q2
9,58138.400654,Average,Q3


In [162]:
df['Income Quantile'].value_counts()

Income Quantile
Q1    250
Q2    250
Q3    250
Q4    250
Name: count, dtype: int64

## Example: Age Grouping with pd.cut()

In [163]:
data = {'Age': np.random.randint(1, 100, size = 1000)}
df = pd.DataFrame(data)

In [164]:
df.head()

Unnamed: 0,Age
0,44
1,49
2,75
3,61
4,67


In [165]:
bins = [0, 12, 19, 64, 100]
labels = ['Child', 'Teenager', 'Adult', 'Senior']

In [166]:
df['Stage'] = pd.cut(df['Age'], bins = bins, labels = labels)

In [167]:
df.head(10)

Unnamed: 0,Age,Stage
0,44,Adult
1,49,Adult
2,75,Senior
3,61,Adult
4,67,Senior
5,6,Child
6,9,Child
7,6,Child
8,73,Senior
9,95,Senior


In [168]:
df['Stage'].value_counts()

Stage
Adult       453
Senior      333
Child       139
Teenager     75
Name: count, dtype: int64

## Example: Student Exam Scores with pd.qcut()

In [169]:
np.random.seed(112)
data = {'Score': np.round(np.random.uniform(50, 100, 50))}
df = pd.DataFrame(data)

In [170]:
df.head()

Unnamed: 0,Score
0,69.0
1,82.0
2,98.0
3,54.0
4,89.0


In [171]:
df['Score Quartile'] = pd.qcut(df['Score'], 4, ['Q1', 'Q2', 'Q3', 'Q4'])

In [172]:
df.head(10)

Unnamed: 0,Score,Score Quartile
0,69.0,Q2
1,82.0,Q3
2,98.0,Q4
3,54.0,Q1
4,89.0,Q4
5,92.0,Q4
6,53.0,Q1
7,91.0,Q4
8,94.0,Q4
9,86.0,Q3


In [173]:
df['Score Quartile'].value_counts()

Score Quartile
Q2    14
Q1    13
Q4    13
Q3    10
Name: count, dtype: int64

## Example: Housing Price Segmentation with Both pd.cut() and pd.qcut()

In [174]:
np.random.seed(53)
prices = np.random.uniform(3000000, 40000000, 200)

# Ensure no negative prices by setting a minimum value (e.g., $50,000)
prices = np.clip(prices, 50000, None)
df = pd.DataFrame({'Price': prices})

In [175]:
df.head()

Unnamed: 0,Price
0,34326510.0
1,23763120.0
2,19830390.0
3,16030470.0
4,24676500.0


In [176]:
price_bins = [0, 8000000, 15000000, np.inf]
price_labels = ['Budget', 'Mid-range', 'Luxury']

In [177]:
df['Price Category'] = pd.cut(df['Price'], bins = price_bins, labels = price_labels)

In [178]:
df.head(10)

Unnamed: 0,Price,Price Category
0,34326510.0,Luxury
1,23763120.0,Luxury
2,19830390.0,Luxury
3,16030470.0,Luxury
4,24676500.0,Luxury
5,22822740.0,Luxury
6,33615880.0,Luxury
7,12788060.0,Mid-range
8,20700030.0,Luxury
9,15868050.0,Luxury


In [179]:
df['Price Category'].value_counts()

Price Category
Luxury       147
Mid-range     30
Budget        23
Name: count, dtype: int64

In [180]:
df['Price Quartile'] = pd.qcut(df['Price'], 4, labels=['Q1', 'Q2', 'Q3', 'Q4'])

In [181]:
df.head(10)

Unnamed: 0,Price,Price Category,Price Quartile
0,34326510.0,Luxury,Q4
1,23763120.0,Luxury,Q3
2,19830390.0,Luxury,Q2
3,16030470.0,Luxury,Q2
4,24676500.0,Luxury,Q3
5,22822740.0,Luxury,Q2
6,33615880.0,Luxury,Q4
7,12788060.0,Mid-range,Q1
8,20700030.0,Luxury,Q2
9,15868050.0,Luxury,Q2


In [182]:
df['Price Quartile'].value_counts()

Price Quartile
Q1    50
Q2    50
Q3    50
Q4    50
Name: count, dtype: int64

####  Detecting and Filtering Outliers

In [183]:
df = pd.DataFrame(np.random.standard_normal((1000, 4)))

In [184]:
df.head()

Unnamed: 0,0,1,2,3
0,0.222817,0.190511,1.771046,0.563271
1,-0.066081,-0.01189,0.485773,0.197435
2,0.531431,0.327612,-0.743776,-1.716714
3,0.547507,-0.784494,0.174774,-0.373695
4,0.951607,-1.366329,-1.184671,-0.967506


In [185]:
df.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.006817,-0.048355,-0.042604,0.023903
std,1.000081,0.991533,0.969148,0.961629
min,-3.522098,-3.24587,-3.828995,-3.286579
25%,-0.686055,-0.70078,-0.697313,-0.632765
50%,0.009727,-0.028887,-0.015472,0.072305
75%,0.689827,0.617394,0.603155,0.667354
max,3.155768,3.763275,3.570555,3.379262


In [186]:
# let's check absolute value of values greater than 3 from column 1
df[1][df[1].abs() > 3]

80    -3.112635
135   -3.245870
144    3.353238
582    3.763275
639    3.418813
Name: 1, dtype: float64

In [187]:
# To select all rows having a value exceeding 3 or –3, we can use the any method on a Boolean DataFrame:
df[(df.abs() > 3).any(axis = 'columns')]

Unnamed: 0,0,1,2,3
40,3.155768,0.757217,1.369375,1.455905
78,0.286802,-1.368267,0.475424,3.379262
80,0.904455,-3.112635,-0.177241,-1.137504
135,0.096213,-3.24587,-0.433385,-0.637028
138,-0.567387,-0.983145,3.570555,0.996056
144,-0.665892,3.353238,-0.659654,-1.739872
207,-3.522098,-1.921622,-1.284414,1.200885
582,-0.822313,3.763275,-0.389612,1.247466
639,0.343575,3.418813,-0.271494,0.328045
753,-0.029423,0.085052,-3.828995,1.141235


In [188]:
# np.sign(data) produces 1 and –1 values based on whether the values in data are positive or negative:
# for example
np.sign(df.head())

Unnamed: 0,0,1,2,3
0,1.0,1.0,1.0,1.0
1,-1.0,-1.0,1.0,1.0
2,1.0,1.0,-1.0,-1.0
3,1.0,-1.0,1.0,-1.0
4,1.0,-1.0,-1.0,-1.0


In [189]:
# so let's filter the outliers by df.abs() maximum is 3
df[df.abs() > 3] = np.sign(df) * 3

In [190]:
# now let's cjheck if there is any outliers
df.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.00645,-0.049532,-0.042345,0.024043
std,0.997897,0.985236,0.964288,0.958669
min,-3.0,-3.0,-3.0,-3.0
25%,-0.686055,-0.70078,-0.697313,-0.632765
50%,0.009727,-0.028887,-0.015472,0.072305
75%,0.689827,0.617394,0.603155,0.667354
max,3.0,3.0,3.0,3.0


In [191]:
data = {
    'Sensor_Reading': [10, 12, 11, 13, 10, 500, 11, 12, 13, 9, 10, 10, 10, 600]
}
df = pd.DataFrame(data)

In [192]:
df

Unnamed: 0,Sensor_Reading
0,10
1,12
2,11
3,13
4,10
5,500
6,11
7,12
8,13
9,9


In [193]:
q1 = df['Sensor_Reading'].quantile(0.25)
q3 = df['Sensor_Reading'].quantile(0.75)
iqr = q3 - q1

In [194]:
lb = q1 - 1.5 * iqr
ub = q3 + 1.5 * iqr

In [195]:
lb

np.float64(5.875)

In [196]:
ub

np.float64(16.875)

In [197]:
df['Is Outlier'] = (df['Sensor_Reading'] < lb )| (df['Sensor_Reading'] > ub)

In [198]:
df

Unnamed: 0,Sensor_Reading,Is Outlier
0,10,False
1,12,False
2,11,False
3,13,False
4,10,False
5,500,True
6,11,False
7,12,False
8,13,False
9,9,False


In [199]:
# total number of outliers
df['Is Outlier'].sum()

np.int64(2)

#### Permutation and Random Sampling

In [200]:
df = pd.DataFrame({
    'ID': [1, 2, 3, 4, 5],
    'Value': ['A', 'B', 'C', 'D', 'E']
})

In [201]:
df

Unnamed: 0,ID,Value
0,1,A
1,2,B
2,3,C
3,4,D
4,5,E


In [202]:
# Setting frac=1 tells Pandas to return 100% of the rows, but in a randomized order.
# we can use he random_state argument for reproducibility 
df_shuffled = df.sample(frac = 1, random_state = 42)

In [203]:
df_shuffled

Unnamed: 0,ID,Value
1,2,B
4,5,E
2,3,C
0,1,A
3,4,D


In [204]:
df

Unnamed: 0,ID,Value
0,1,A
1,2,B
2,3,C
3,4,D
4,5,E


In [205]:
new_index = np.random.permutation(df.index)

In [206]:
new_index

array([2, 1, 4, 3, 0])

In [207]:
df_new = df.loc[new_index]

In [208]:
df_new

Unnamed: 0,ID,Value
2,3,C
1,2,B
4,5,E
3,4,D
0,1,A


In [209]:
# we can also use take() method
df.take(new_index)

Unnamed: 0,ID,Value
2,3,C
1,2,B
4,5,E
3,4,D
0,1,A


In [210]:
# we can shuffle columns
data = {
    'A': [5, 9, 2, 8, 7, 6],
    'B': [3, 6, 8, 1, 5, 2],
    'C': [4, 7, 3, 9, 6, 5],
    'D': [9, 2, 5, 7, 4, 3],
    'E': [1, 6, 7, 3, 8, 9],
    'F': [8, 3, 4, 2, 6, 7]
}

df = pd.DataFrame(data)

In [211]:
df

Unnamed: 0,A,B,C,D,E,F
0,5,3,4,9,1,8
1,9,6,7,2,6,3
2,2,8,3,5,7,4
3,8,1,9,7,3,2
4,7,5,6,4,8,6
5,6,2,5,3,9,7


In [212]:
# permutating rows
shuffled_row = np.random.permutation(df.index)

In [213]:
df_row_shuffled = df.loc[shuffled_row]

In [214]:
df_row_shuffled

Unnamed: 0,A,B,C,D,E,F
1,9,6,7,2,6,3
3,8,1,9,7,3,2
2,2,8,3,5,7,4
5,6,2,5,3,9,7
4,7,5,6,4,8,6
0,5,3,4,9,1,8


In [215]:
# shuffling columns labels
shuffled_columns = np.random.permutation(df.columns)

In [216]:
df_shuffled_columns = df[shuffled_columns]

In [217]:
df_shuffled_columns

Unnamed: 0,A,F,C,D,E,B
0,5,8,4,9,1,3
1,9,3,7,2,6,6
2,2,4,3,5,7,8
3,8,2,9,7,3,1
4,7,6,6,4,8,5
5,6,7,5,3,9,2


In [218]:
# shuffling both
df_shuffled = df[shuffled_columns].loc[shuffled_row]

In [219]:
df_shuffled

Unnamed: 0,A,F,C,D,E,B
1,9,3,7,2,6,6
3,8,2,9,7,3,1
2,2,4,3,5,7,8
5,6,7,5,3,9,2
4,7,6,6,4,8,5
0,5,8,4,9,1,3


In [220]:
# Sampling Without Replacement
df

Unnamed: 0,A,B,C,D,E,F
0,5,3,4,9,1,8
1,9,6,7,2,6,3
2,2,8,3,5,7,4
3,8,1,9,7,3,2
4,7,5,6,4,8,6
5,6,2,5,3,9,7


In [221]:
df.sample(n=3, random_state = 42)

Unnamed: 0,A,B,C,D,E,F
0,5,3,4,9,1,8
1,9,6,7,2,6,3
5,6,2,5,3,9,7


In [222]:
# Sampling with replacement
df.sample(n=10, replace = True, random_state = 42)

Unnamed: 0,A,B,C,D,E,F
3,8,1,9,7,3,2
4,7,5,6,4,8,6
2,2,8,3,5,7,4
4,7,5,6,4,8,6
4,7,5,6,4,8,6
1,9,6,7,2,6,3
2,2,8,3,5,7,4
2,2,8,3,5,7,4
2,2,8,3,5,7,4
4,7,5,6,4,8,6


# Example

In [223]:
data = {
    'EmployeeID': range(1, 21),
    'Name': [
        'Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Grace', 'Helen',
        'Ian', 'Jane', 'Kyle', 'Laura', 'Mike', 'Nina', 'Oscar', 'Pam',
        'Quincy', 'Rachel', 'Steve', 'Tina'
    ],
    'Age': [25, 30, 28, 35, 22, 40, 33, 27, 31, 29, 38, 26, 34, 32, 45, 37, 29, 31, 36, 28],
    'Department': [
        'Sales', 'IT', 'HR', 'IT', 'Marketing', 'Sales', 'HR', 'Marketing',
        'IT', 'Sales', 'HR', 'IT', 'Marketing', 'Sales', 'HR', 'IT', 'Marketing', 'Sales', 'HR', 'IT'
    ],
    'Salary': [50000, 60000, 55000, 62000, 48000, 65000, 57000, 50000, 61000, 53000,
               56000, 60000, 51000, 52000, 58000, 63000, 49000, 54000, 58000, 60000]
}

df = pd.DataFrame(data)

In [224]:
df

Unnamed: 0,EmployeeID,Name,Age,Department,Salary
0,1,Alice,25,Sales,50000
1,2,Bob,30,IT,60000
2,3,Charlie,28,HR,55000
3,4,David,35,IT,62000
4,5,Eva,22,Marketing,48000
5,6,Frank,40,Sales,65000
6,7,Grace,33,HR,57000
7,8,Helen,27,Marketing,50000
8,9,Ian,31,IT,61000
9,10,Jane,29,Sales,53000


In [225]:
df_shuffled = df.sample(frac = 1, random_state = 42)

In [226]:
df_shuffled

Unnamed: 0,EmployeeID,Name,Age,Department,Salary
0,1,Alice,25,Sales,50000
17,18,Rachel,31,Sales,54000
15,16,Pam,37,IT,63000
1,2,Bob,30,IT,60000
8,9,Ian,31,IT,61000
5,6,Frank,40,Sales,65000
11,12,Laura,26,IT,60000
3,4,David,35,IT,62000
18,19,Steve,36,HR,58000
16,17,Quincy,29,Marketing,49000


In [227]:
sample_df = df.sample(n = 5, random_state = 123)

In [228]:
sample_df

Unnamed: 0,EmployeeID,Name,Age,Department,Salary
14,15,Oscar,45,HR,58000
5,6,Frank,40,Sales,65000
4,5,Eva,22,Marketing,48000
17,18,Rachel,31,Sales,54000
8,9,Ian,31,IT,61000


In [229]:
train_df = df.sample(frac = 0.8, random_state = 75)

In [230]:
test_df = df.drop(train_df.index)

In [231]:
train_df

Unnamed: 0,EmployeeID,Name,Age,Department,Salary
18,19,Steve,36,HR,58000
11,12,Laura,26,IT,60000
13,14,Nina,32,Sales,52000
4,5,Eva,22,Marketing,48000
12,13,Mike,34,Marketing,51000
9,10,Jane,29,Sales,53000
5,6,Frank,40,Sales,65000
3,4,David,35,IT,62000
14,15,Oscar,45,HR,58000
15,16,Pam,37,IT,63000


In [232]:
test_df

Unnamed: 0,EmployeeID,Name,Age,Department,Salary
0,1,Alice,25,Sales,50000
8,9,Ian,31,IT,61000
16,17,Quincy,29,Marketing,49000
17,18,Rachel,31,Sales,54000


#### Computing Indicator/Dummy Variables

In [233]:
data = {
    'EmployeeID': [1, 2, 3, 4, 5],
    'Department': ['Sales', 'IT', 'HR', 'IT', 'Sales']
}
df = pd.DataFrame(data)

In [234]:
df

Unnamed: 0,EmployeeID,Department
0,1,Sales
1,2,IT
2,3,HR
3,4,IT
4,5,Sales


In [235]:
# To create sole dummy variable from department
pd.get_dummies(df['Department'])

Unnamed: 0,HR,IT,Sales
0,False,False,True
1,False,True,False
2,True,False,False
3,False,True,False
4,False,False,True


In [236]:
# create dummy variables for the Department column:
dummies = pd.get_dummies(df, columns = ['Department'])

In [237]:
dummies

Unnamed: 0,EmployeeID,Department_HR,Department_IT,Department_Sales
0,1,False,False,True
1,2,False,True,False
2,3,True,False,False
3,4,False,True,False
4,5,False,False,True


In [238]:
pd.get_dummies(df, columns=['Department'], drop_first=True)

Unnamed: 0,EmployeeID,Department_IT,Department_Sales
0,1,False,True
1,2,True,False
2,3,False,False
3,4,True,False
4,5,False,True


In [239]:
data_with_nan = {
    'EmployeeID': [1, 2, 3, 4, 5],
    'Department': ['Sales', 'IT', None, 'IT', 'Sales']
}
dfn = pd.DataFrame(data_with_nan)

In [240]:
dfn

Unnamed: 0,EmployeeID,Department
0,1,Sales
1,2,IT
2,3,
3,4,IT
4,5,Sales


In [241]:
dummies = pd.get_dummies(dfn, columns = ['Department'])

In [242]:
dummies

Unnamed: 0,EmployeeID,Department_IT,Department_Sales
0,1,False,True
1,2,True,False
2,3,False,False
3,4,True,False
4,5,False,True


In [243]:
# In the above case it did not create a dummy nariable for 'NaN' value so if we want to crate
# we can add "dummy_na = True" argument
dummies = pd.get_dummies(dfn, columns = ['Department'], dummy_na = True)

In [244]:
dummies

Unnamed: 0,EmployeeID,Department_IT,Department_Sales,Department_nan
0,1,False,True,False
1,2,True,False,False
2,3,False,False,True
3,4,True,False,False
4,5,False,True,False


In [245]:
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Edward', 'Fiona',
             'George', 'Hannah', 'Ian', 'Julia', 'Kevin', 'Laura'],
    'Age': [12, 15, 20, 24, 30, 35, 40, 28, 50, 18, 22, 45]
}

# Create the DataFrame
df = pd.DataFrame(data)

In [246]:
df

Unnamed: 0,Name,Age
0,Alice,12
1,Bob,15
2,Charlie,20
3,Diana,24
4,Edward,30
5,Fiona,35
6,George,40
7,Hannah,28
8,Ian,50
9,Julia,18


In [247]:
bins = [11, 25, 35, 50]

In [248]:
# combining dummies and cut
pd.get_dummies(pd.cut(df['Age'], bins))

Unnamed: 0,"(11, 25]","(25, 35]","(35, 50]"
0,True,False,False
1,True,False,False
2,True,False,False
3,True,False,False
4,False,True,False
5,False,True,False
6,False,False,True
7,False,True,False
8,False,False,True
9,True,False,False


# 7.3 Extension Data Types

In [249]:
s = pd.Series([1, 2, 3, None])

In [250]:
s

0    1.0
1    2.0
2    3.0
3    NaN
dtype: float64

In [251]:
# by introducing the extension data type we can preserve the data type of the values
s_new = pd.Series([1, 2, 3, None], dtype = "Int64")

In [252]:
s_new

0       1
1       2
2       3
3    <NA>
dtype: Int64

In [253]:
s.isna()

0    False
1    False
2    False
3     True
dtype: bool

In [254]:
s[3]

np.float64(nan)

In [255]:
s = pd.Series([True, False, True, None])

In [256]:
s

0     True
1    False
2     True
3     None
dtype: object

In [257]:
s_bool = pd.Series([True, False, True, None], dtype = 'boolean')

In [258]:
s_bool

0     True
1    False
2     True
3     <NA>
dtype: boolean

In [259]:
df_str = pd.DataFrame({'Fruit': ['Apple', None, 'Banana', 'Cherry']})

In [260]:
df_str['Fruit'].dtype

dtype('O')

In [261]:
df_new = pd.DataFrame({'Fruit': ['Apple', None, 'Banana', 'Cherry']}, dtype = "string")

In [262]:
df_new['Fruit'].dtype

string[python]

In [263]:
df = pd.DataFrame({'I': [1, 2, 3, None, 4],
                  'S': ['one', 'two', None, 'four', 'five'],
                  'B': [True, None, False, False, True]})

In [264]:
df

Unnamed: 0,I,S,B
0,1.0,one,True
1,2.0,two,
2,3.0,,False
3,,four,False
4,4.0,five,True


In [265]:
print('BEFORE\n')
print(df['I'].dtype)
print(df['S'].dtype)
print(df['B'].dtype)

BEFORE

float64
object
object


In [266]:
df['I'] = df['I'].astype('Int64')
df['S'] = df['S'].astype('string')
df['B'] = df['B'].astype('boolean')

In [267]:
df

Unnamed: 0,I,S,B
0,1.0,one,True
1,2.0,two,
2,3.0,,False
3,,four,False
4,4.0,five,True


In [268]:
print('AFTER\n')
print(df['I'].dtype)
print(df['S'].dtype)
print(df['B'].dtype)

AFTER

Int64
string
boolean


#  7.4 String Manipulation

#### Python Built-In String Object Methods

In [269]:
s = "Hello, World! THIS IS a test."

In [270]:
print(s)

Hello, World! THIS IS a test.


In [271]:
print(s.lower())

hello, world! this is a test.


In [272]:
print(s.upper())

HELLO, WORLD! THIS IS A TEST.


In [273]:
print(s.capitalize())

Hello, world! this is a test.


In [274]:
print(s.title())

Hello, World! This Is A Test.


In [275]:
line = '   Python is fun!     '

In [276]:
print(line)

   Python is fun!     


In [277]:
line.strip()

'Python is fun!'

In [278]:
line.rstrip()

'   Python is fun!'

In [279]:
line.lstrip()

'Python is fun!     '

In [280]:
line = "Hello, World! Hello, Python!"

In [281]:
line

'Hello, World! Hello, Python!'

In [282]:
line.find('World')

7

In [283]:
line.rfind('Hello')

14

In [284]:
line.index(',')

5

In [285]:
line.count(',')

2

In [286]:
line.count('Hello')

2

In [287]:
line.count('Python')

1

In [288]:
line.startswith('Hello')

True

In [289]:
line.endswith('Hello')

False

In [290]:
st = "I like apples. apples are my favorite fruit."

In [291]:
st.replace('apples', 'bananas')

'I like bananas. bananas are my favorite fruit.'

In [292]:
fact = "Bananas are berries, but strawberries aren’t!"

In [293]:
fact

'Bananas are berries, but strawberries aren’t!'

In [294]:
words = fact.split()

In [295]:
words

['Bananas', 'are', 'berries,', 'but', 'strawberries', 'aren’t!']

In [296]:
':'.join(words)

'Bananas:are:berries,:but:strawberries:aren’t!'

In [297]:
sentence = "Sloths are slow. Turtles can breathe through their butts. Wombats poop cubes."

In [298]:
sentence

'Sloths are slow. Turtles can breathe through their butts. Wombats poop cubes.'

In [299]:
facts = sentence.split('.')

In [300]:
facts

['Sloths are slow',
 ' Turtles can breathe through their butts',
 ' Wombats poop cubes',
 '']

In [301]:
' |'.join(facts)

'Sloths are slow | Turtles can breathe through their butts | Wombats poop cubes |'

In [302]:
funny_facts = """Turtles can breathe through their butts
Sloths can hold their breath longer than dolphins
Wombat poop is cube-shaped
Bananas are berries, but strawberries aren’t
Octopuses have three hearts"""


In [303]:
sentences = funny_facts.splitlines()

In [304]:
sentences

['Turtles can breathe through their butts',
 'Sloths can hold their breath longer than dolphins',
 'Wombat poop is cube-shaped',
 'Bananas are berries, but strawberries aren’t',
 'Octopuses have three hearts']

In [305]:
' '. join(sentences)

'Turtles can breathe through their butts Sloths can hold their breath longer than dolphins Wombat poop is cube-shaped Bananas are berries, but strawberries aren’t Octopuses have three hearts'

In [306]:
paragraph = "\n".join(sentences)

In [307]:
print(paragraph)

Turtles can breathe through their butts
Sloths can hold their breath longer than dolphins
Wombat poop is cube-shaped
Bananas are berries, but strawberries aren’t
Octopuses have three hearts


In [308]:
word = '12345'

In [309]:
word

'12345'

In [310]:
word.isalpha()

False

In [311]:
word.isdigit()

True

In [312]:
word.isalnum()

True

In [313]:
word.isspace()

False

In [314]:
s = 'Hello, World!' 

In [315]:
s

'Hello, World!'

In [316]:
s.partition(',')

('Hello', ',', ' World!')

In [317]:
s1 = "Straße"
s2 = "STRASSE"

print("Lowercase:", s1.lower())  # 'straße'
print("Casefold:", s1.casefold())  # 'strasse'

# Case-insensitive comparison
print("Are they equal with lower()?", s1.lower() == s2.lower())  # False
print("Are they equal with casefold()?", s1.casefold() == s2.casefold())  # True

Lowercase: straße
Casefold: strasse
Are they equal with lower()? False
Are they equal with casefold()? True


In [318]:
print('Name'.ljust(15), 'Age')
print('John'.ljust(15, '.'), '34')
print('Sabi'.ljust(15, '>'), '21')

Name            Age
John........... 34
Sabi>>>>>>>>>>> 21


In [319]:
header = "Item".ljust(15) + "Price".rjust(10)
row1 = "Laptop".ljust(15) + "$1000".rjust(10)
row2 = "Phone".ljust(15) + "$500".rjust(10)

print(header)
print(row1)
print(row2)

Item                Price
Laptop              $1000
Phone                $500


####  Regular Expressions

####  String Functions in pandas

In [320]:
data = {
    'Product': ['  Apple iPhone  ', 'Samsung Galaxy', '   Google Pixel  ', None]
}
df = pd.DataFrame(data)

In [321]:
df

Unnamed: 0,Product
0,Apple iPhone
1,Samsung Galaxy
2,Google Pixel
3,


In [322]:
df['Product'].str.lower()

0       apple iphone  
1       samsung galaxy
2       google pixel  
3                 None
Name: Product, dtype: object

In [323]:
df['Product'].str.strip()

0      Apple iPhone
1    Samsung Galaxy
2      Google Pixel
3              None
Name: Product, dtype: object

In [324]:
data = {
    'Text': [
        '   Hello World!   ',
        'pandas IS AWESOME',
        'Data Science 101',
        None  # To show how missing values are handled
    ]
}
df = pd.DataFrame(data)

In [325]:
print(df)

                 Text
0     Hello World!   
1   pandas IS AWESOME
2    Data Science 101
3                None


In [326]:
df['Lower'] = df['Text'].str.lower()

In [327]:
df['Upper'] = df['Text'].str.upper()

In [328]:
df['Title'] = df['Text'].str.title()

In [329]:
df['Strip'] = df['Text'].str.strip()

In [330]:
df['Length'] = df['Text'].str.len()

In [331]:
print(df)

                 Text               Lower               Upper  \
0     Hello World!        hello world!        HELLO WORLD!      
1   pandas IS AWESOME   pandas is awesome   PANDAS IS AWESOME   
2    Data Science 101    data science 101    DATA SCIENCE 101   
3                None                None                None   

                Title              Strip  Length  
0     Hello World!          Hello World!    18.0  
1   Pandas Is Awesome  pandas IS AWESOME    17.0  
2    Data Science 101   Data Science 101    16.0  
3                None               None     NaN  


In [332]:
data = {
    "Name": ["Alice Johnson", "Bob Smith", "Charlie Brown", "David Williams",
             "Emma Wilson", "Franklin Harris", "Grace Lee", "Hannah Clark"],
    "Email": ["alice@google.com", "bob@email.com", "charliegoogle.com", "david@email.com",
              "emma@email.com", "franklin@email.com", "grace@email.com", None],
    "Age": [25, 30, 35, 40, 28, 33, 22, 27]
}

df = pd.DataFrame(data)

In [333]:
df

Unnamed: 0,Name,Email,Age
0,Alice Johnson,alice@google.com,25
1,Bob Smith,bob@email.com,30
2,Charlie Brown,charliegoogle.com,35
3,David Williams,david@email.com,40
4,Emma Wilson,emma@email.com,28
5,Franklin Harris,franklin@email.com,33
6,Grace Lee,grace@email.com,22
7,Hannah Clark,,27


In [334]:
# case=False makes the search case-insensitive
# na=False ensures that if there is a missing value (NaN), it is treated as False rather than propagating a missing value.
df['Email'].str.contains('@', case = False, na = False)

0     True
1     True
2    False
3     True
4     True
5     True
6     True
7    False
Name: Email, dtype: bool

In [335]:
# extracting
df[df['Email'].str.contains('@', case = False, na = False)]

Unnamed: 0,Name,Email,Age
0,Alice Johnson,alice@google.com,25
1,Bob Smith,bob@email.com,30
3,David Williams,david@email.com,40
4,Emma Wilson,emma@email.com,28
5,Franklin Harris,franklin@email.com,33
6,Grace Lee,grace@email.com,22


In [336]:
df['Email'].str.contains('@email.', case = False, na = False)

0    False
1     True
2    False
3     True
4     True
5     True
6     True
7    False
Name: Email, dtype: bool

In [337]:
df[df['Email'].str.contains('email', case = False, na = False)]

Unnamed: 0,Name,Email,Age
1,Bob Smith,bob@email.com,30
3,David Williams,david@email.com,40
4,Emma Wilson,emma@email.com,28
5,Franklin Harris,franklin@email.com,33
6,Grace Lee,grace@email.com,22


In [338]:
df['starts with "A"'] = df['Name'].str.startswith('A')

In [339]:
df

Unnamed: 0,Name,Email,Age,"starts with ""A"""
0,Alice Johnson,alice@google.com,25,True
1,Bob Smith,bob@email.com,30,False
2,Charlie Brown,charliegoogle.com,35,False
3,David Williams,david@email.com,40,False
4,Emma Wilson,emma@email.com,28,False
5,Franklin Harris,franklin@email.com,33,False
6,Grace Lee,grace@email.com,22,False
7,Hannah Clark,,27,False


In [340]:
df['ends with "n"'] = df['Name'].str.endswith('n')

In [341]:
df

Unnamed: 0,Name,Email,Age,"starts with ""A""","ends with ""n"""
0,Alice Johnson,alice@google.com,25,True,True
1,Bob Smith,bob@email.com,30,False,False
2,Charlie Brown,charliegoogle.com,35,False,True
3,David Williams,david@email.com,40,False,False
4,Emma Wilson,emma@email.com,28,False,True
5,Franklin Harris,franklin@email.com,33,False,False
6,Grace Lee,grace@email.com,22,False,False
7,Hannah Clark,,27,False,False


In [342]:
# filtering those ends with "n"
df[df['Name'].str.endswith('n')]

Unnamed: 0,Name,Email,Age,"starts with ""A""","ends with ""n"""
0,Alice Johnson,alice@google.com,25,True,True
2,Charlie Brown,charliegoogle.com,35,False,True
4,Emma Wilson,emma@email.com,28,False,True


In [343]:
data = {
    'Product': [
        'Samsung Galaxy',
        'Samsung Galaxy Note',
        'Apple iPhone',
        'Google Pixel'
    ]
}
df = pd.DataFrame(data)

In [344]:
df

Unnamed: 0,Product
0,Samsung Galaxy
1,Samsung Galaxy Note
2,Apple iPhone
3,Google Pixel


In [345]:
df['Product'].str.replace('Galaxy', 'Galaxy S')

0         Samsung Galaxy S
1    Samsung Galaxy S Note
2             Apple iPhone
3             Google Pixel
Name: Product, dtype: object

In [346]:
# I don't know let's replace all capital leters with "Q"
df['Product'].str.replace(r"[A-Z]", 'Q', regex = True)

0         Qamsung Qalaxy
1    Qamsung Qalaxy Qote
2           Qpple iQhone
3           Qoogle Qixel
Name: Product, dtype: object

In [347]:
data = {
    'Email': [
        'alice@example.com',
        'bob@sample.org',
        'charlie@test.net',
        'david@example.com'
    ]
}
df = pd.DataFrame(data)

In [348]:
df

Unnamed: 0,Email
0,alice@example.com
1,bob@sample.org
2,charlie@test.net
3,david@example.com


In [349]:
df['Domains'] = df['Email'].str.extract(r"@([\w.-]+)")

In [350]:
df

Unnamed: 0,Email,Domains
0,alice@example.com,example.com
1,bob@sample.org,sample.org
2,charlie@test.net,test.net
3,david@example.com,example.com


In [351]:
data = {
    'Description': [
        "There are 12 apples and 5 oranges.",
        "No numbers here!",
        "The price is 100 dollars and 50 cents."
    ]
}
df = pd.DataFrame(data)

In [352]:
df

Unnamed: 0,Description
0,There are 12 apples and 5 oranges.
1,No numbers here!
2,The price is 100 dollars and 50 cents.


In [353]:
df['numbers'] = df['Description'].str.findall(r"\d+")

In [354]:
df

Unnamed: 0,Description,numbers
0,There are 12 apples and 5 oranges.,"[12, 5]"
1,No numbers here!,[]
2,The price is 100 dollars and 50 cents.,"[100, 50]"


# Example

In [355]:
data = {
    "Full Name": ["Alice Johnson", "Bob Smith", "Charlie Brown", "David Williams", 
                  "Emma Wilson", "Franklin Harris", "Grace Lee", "Hannah Clark"],
    "Email": ["alice_1995@email.com", "bob.smith@email.co.uk", "charlie123@email.net", "david.w@email.com",
              "emma_wilson@email.org", "frank.harris@email.com", "gracelee@email.net", "hannah_c@email.com"],
    "Phone": ["(123) 456-7890", "+1 987-654-3210", "555-888-9999", "+44 20 7946 0958", 
              "(222) 333-4444", "666.777.8888", "+91-98765-43210", "111-222-3333"],
    "Address": ["123 Baker St, NY, USA", "456 Oak Ave, London, UK", "789 Pine Rd, TX, USA", 
                "101 Maple Dr, Toronto, CA", "55 Elm St, Sydney, AU", "999 Cedar Blvd, NY, USA",
                "10 Kings Way, Mumbai, IN", "777 Queen St, Vancouver, CA"],
    "Salary": ["$50,000", "$75,500", "€60,000", "£80,000", "$90,000", "€72,300", "₹1,200,000", "$45,000"]
}

df = pd.DataFrame(data)

In [356]:
df

Unnamed: 0,Full Name,Email,Phone,Address,Salary
0,Alice Johnson,alice_1995@email.com,(123) 456-7890,"123 Baker St, NY, USA","$50,000"
1,Bob Smith,bob.smith@email.co.uk,+1 987-654-3210,"456 Oak Ave, London, UK","$75,500"
2,Charlie Brown,charlie123@email.net,555-888-9999,"789 Pine Rd, TX, USA","€60,000"
3,David Williams,david.w@email.com,+44 20 7946 0958,"101 Maple Dr, Toronto, CA","£80,000"
4,Emma Wilson,emma_wilson@email.org,(222) 333-4444,"55 Elm St, Sydney, AU","$90,000"
5,Franklin Harris,frank.harris@email.com,666.777.8888,"999 Cedar Blvd, NY, USA","€72,300"
6,Grace Lee,gracelee@email.net,+91-98765-43210,"10 Kings Way, Mumbai, IN","₹1,200,000"
7,Hannah Clark,hannah_c@email.com,111-222-3333,"777 Queen St, Vancouver, CA","$45,000"


In [357]:
df['Phone'] = df['Phone'].str.replace(r"[^\d]+", '', regex = True)

In [358]:
df

Unnamed: 0,Full Name,Email,Phone,Address,Salary
0,Alice Johnson,alice_1995@email.com,1234567890,"123 Baker St, NY, USA","$50,000"
1,Bob Smith,bob.smith@email.co.uk,19876543210,"456 Oak Ave, London, UK","$75,500"
2,Charlie Brown,charlie123@email.net,5558889999,"789 Pine Rd, TX, USA","€60,000"
3,David Williams,david.w@email.com,442079460958,"101 Maple Dr, Toronto, CA","£80,000"
4,Emma Wilson,emma_wilson@email.org,2223334444,"55 Elm St, Sydney, AU","$90,000"
5,Franklin Harris,frank.harris@email.com,6667778888,"999 Cedar Blvd, NY, USA","€72,300"
6,Grace Lee,gracelee@email.net,919876543210,"10 Kings Way, Mumbai, IN","₹1,200,000"
7,Hannah Clark,hannah_c@email.com,1112223333,"777 Queen St, Vancouver, CA","$45,000"


In [359]:
# Convert all phone numbers to a standard format (XXX-XXX-XXXX)
df['Phone'] = df['Phone'].str.replace(r"(\d{3})(\d{3})(\d{4})", r"\1-\2-\3", regex = True)

In [360]:
df

Unnamed: 0,Full Name,Email,Phone,Address,Salary
0,Alice Johnson,alice_1995@email.com,123-456-7890,"123 Baker St, NY, USA","$50,000"
1,Bob Smith,bob.smith@email.co.uk,198-765-43210,"456 Oak Ave, London, UK","$75,500"
2,Charlie Brown,charlie123@email.net,555-888-9999,"789 Pine Rd, TX, USA","€60,000"
3,David Williams,david.w@email.com,442-079-460958,"101 Maple Dr, Toronto, CA","£80,000"
4,Emma Wilson,emma_wilson@email.org,222-333-4444,"55 Elm St, Sydney, AU","$90,000"
5,Franklin Harris,frank.harris@email.com,666-777-8888,"999 Cedar Blvd, NY, USA","€72,300"
6,Grace Lee,gracelee@email.net,919-876-543210,"10 Kings Way, Mumbai, IN","₹1,200,000"
7,Hannah Clark,hannah_c@email.com,111-222-3333,"777 Queen St, Vancouver, CA","$45,000"


In [361]:
df['Domain'] = df['Email'].str.extract(r"@([\w.-]+)")

In [362]:
df

Unnamed: 0,Full Name,Email,Phone,Address,Salary,Domain
0,Alice Johnson,alice_1995@email.com,123-456-7890,"123 Baker St, NY, USA","$50,000",email.com
1,Bob Smith,bob.smith@email.co.uk,198-765-43210,"456 Oak Ave, London, UK","$75,500",email.co.uk
2,Charlie Brown,charlie123@email.net,555-888-9999,"789 Pine Rd, TX, USA","€60,000",email.net
3,David Williams,david.w@email.com,442-079-460958,"101 Maple Dr, Toronto, CA","£80,000",email.com
4,Emma Wilson,emma_wilson@email.org,222-333-4444,"55 Elm St, Sydney, AU","$90,000",email.org
5,Franklin Harris,frank.harris@email.com,666-777-8888,"999 Cedar Blvd, NY, USA","€72,300",email.com
6,Grace Lee,gracelee@email.net,919-876-543210,"10 Kings Way, Mumbai, IN","₹1,200,000",email.net
7,Hannah Clark,hannah_c@email.com,111-222-3333,"777 Queen St, Vancouver, CA","$45,000",email.com


In [363]:
df["Country Code"] = df["Phone"].str.extract(r"(\d{1,3})[-\s]?\d{3}[-\s]?\d{4}")

In [364]:
df

Unnamed: 0,Full Name,Email,Phone,Address,Salary,Domain,Country Code
0,Alice Johnson,alice_1995@email.com,123-456-7890,"123 Baker St, NY, USA","$50,000",email.com,123
1,Bob Smith,bob.smith@email.co.uk,198-765-43210,"456 Oak Ave, London, UK","$75,500",email.co.uk,198
2,Charlie Brown,charlie123@email.net,555-888-9999,"789 Pine Rd, TX, USA","€60,000",email.net,555
3,David Williams,david.w@email.com,442-079-460958,"101 Maple Dr, Toronto, CA","£80,000",email.com,442
4,Emma Wilson,emma_wilson@email.org,222-333-4444,"55 Elm St, Sydney, AU","$90,000",email.org,222
5,Franklin Harris,frank.harris@email.com,666-777-8888,"999 Cedar Blvd, NY, USA","€72,300",email.com,666
6,Grace Lee,gracelee@email.net,919-876-543210,"10 Kings Way, Mumbai, IN","₹1,200,000",email.net,919
7,Hannah Clark,hannah_c@email.com,111-222-3333,"777 Queen St, Vancouver, CA","$45,000",email.com,111


In [365]:
df['Currency'] = df['Salary'].str.extract(r"(.)[\d+,]")

In [366]:
df

Unnamed: 0,Full Name,Email,Phone,Address,Salary,Domain,Country Code,Currency
0,Alice Johnson,alice_1995@email.com,123-456-7890,"123 Baker St, NY, USA","$50,000",email.com,123,$
1,Bob Smith,bob.smith@email.co.uk,198-765-43210,"456 Oak Ave, London, UK","$75,500",email.co.uk,198,$
2,Charlie Brown,charlie123@email.net,555-888-9999,"789 Pine Rd, TX, USA","€60,000",email.net,555,€
3,David Williams,david.w@email.com,442-079-460958,"101 Maple Dr, Toronto, CA","£80,000",email.com,442,£
4,Emma Wilson,emma_wilson@email.org,222-333-4444,"55 Elm St, Sydney, AU","$90,000",email.org,222,$
5,Franklin Harris,frank.harris@email.com,666-777-8888,"999 Cedar Blvd, NY, USA","€72,300",email.com,666,€
6,Grace Lee,gracelee@email.net,919-876-543210,"10 Kings Way, Mumbai, IN","₹1,200,000",email.net,919,₹
7,Hannah Clark,hannah_c@email.com,111-222-3333,"777 Queen St, Vancouver, CA","$45,000",email.com,111,$


In [367]:
df["Salary"] = df["Salary"].str.replace(r"[\$€£₹,]", "", regex=True).astype(float)

In [368]:
df

Unnamed: 0,Full Name,Email,Phone,Address,Salary,Domain,Country Code,Currency
0,Alice Johnson,alice_1995@email.com,123-456-7890,"123 Baker St, NY, USA",50000.0,email.com,123,$
1,Bob Smith,bob.smith@email.co.uk,198-765-43210,"456 Oak Ave, London, UK",75500.0,email.co.uk,198,$
2,Charlie Brown,charlie123@email.net,555-888-9999,"789 Pine Rd, TX, USA",60000.0,email.net,555,€
3,David Williams,david.w@email.com,442-079-460958,"101 Maple Dr, Toronto, CA",80000.0,email.com,442,£
4,Emma Wilson,emma_wilson@email.org,222-333-4444,"55 Elm St, Sydney, AU",90000.0,email.org,222,$
5,Franklin Harris,frank.harris@email.com,666-777-8888,"999 Cedar Blvd, NY, USA",72300.0,email.com,666,€
6,Grace Lee,gracelee@email.net,919-876-543210,"10 Kings Way, Mumbai, IN",1200000.0,email.net,919,₹
7,Hannah Clark,hannah_c@email.com,111-222-3333,"777 Queen St, Vancouver, CA",45000.0,email.com,111,$


In [369]:
df["Address"] = df["Address"].replace({
    "NY, USA": "New York, USA",
    "TX, USA": "Texas, USA",
    "London, UK": "London, United Kingdom",
    "Toronto, CA": "Toronto, Canada",
    "Sydney, AU": "Sydney, Australia",
    "Mumbai, IN": "Mumbai, India",
    "Vancouver, CA": "Vancouver, Canada"
}, regex=True)


In [370]:
print(df)

         Full Name                   Email           Phone  \
0    Alice Johnson    alice_1995@email.com    123-456-7890   
1        Bob Smith   bob.smith@email.co.uk   198-765-43210   
2    Charlie Brown    charlie123@email.net    555-888-9999   
3   David Williams       david.w@email.com  442-079-460958   
4      Emma Wilson   emma_wilson@email.org    222-333-4444   
5  Franklin Harris  frank.harris@email.com    666-777-8888   
6        Grace Lee      gracelee@email.net  919-876-543210   
7     Hannah Clark      hannah_c@email.com    111-222-3333   

                               Address     Salary       Domain Country Code  \
0          123 Baker St, New York, USA    50000.0    email.com          123   
1  456 Oak Ave, London, United Kingdom    75500.0  email.co.uk          198   
2              789 Pine Rd, Texas, USA    60000.0    email.net          555   
3        101 Maple Dr, Toronto, Canada    80000.0    email.com          442   
4         55 Elm St, Sydney, Australia    9000

# 7.5 Categorical Data

In [371]:
# let's take repeated values of a smaller data set
values = pd.Series(['apple', 'apple', 'mango'] * 3)

In [372]:
values

0    apple
1    apple
2    mango
3    apple
4    apple
5    mango
6    apple
7    apple
8    mango
dtype: object

In [373]:
# unique values
values.unique()

array(['apple', 'mango'], dtype=object)

In [374]:
# counting each occurence
values.value_counts()

apple    6
mango    3
Name: count, dtype: int64

In [375]:
values = pd.Series([0, 0, 1] * 3)

In [376]:
print(values)

0    0
1    0
2    1
3    0
4    0
5    1
6    0
7    0
8    1
dtype: int64


In [377]:
dim = pd.Series(['apple', 'mango'])

In [378]:
print(dim)

0    apple
1    mango
dtype: object


In [379]:
# We can use the take method to restore the original Series of strings
dim.take(values)

0    apple
0    apple
1    mango
0    apple
0    apple
1    mango
0    apple
0    apple
1    mango
dtype: object

### Caregorical extension type

In [380]:
# for example
colors = pd.Series(["Red", "Blue", "Red", "Green", "Blue", "Red"])

In [381]:
colors

0      Red
1     Blue
2      Red
3    Green
4     Blue
5      Red
dtype: object

In [382]:
# let's change its data type to "category"
cat_colors = colors.astype("category")

In [383]:
cat_colors

0      Red
1     Blue
2      Red
3    Green
4     Blue
5      Red
dtype: category
Categories (3, object): ['Blue', 'Green', 'Red']

In [384]:
color = ["Red", "Blue", "Red", "Green", "Blue", "Red"] * 2

In [385]:
n = len(color)

In [386]:
rng = np.random.default_rng(seed = 321)

In [387]:
df = pd.DataFrame({'Color': color,
                 'bucket_id': np.arange(n), 
                  'count': rng.integers(5, 20, size = n),
                  'volume': rng.uniform(0, 4, size = n)},
                 columns = ['bucket_id', 'Color', 'count', 'volume'])

In [388]:
df

Unnamed: 0,bucket_id,Color,count,volume
0,0,Red,10,2.731922
1,1,Blue,14,0.751767
2,2,Red,6,1.469566
3,3,Green,18,3.84782
4,4,Blue,14,3.833262
5,5,Red,12,1.538547
6,6,Red,10,3.09955
7,7,Blue,15,1.62862
8,8,Red,12,3.568198
9,9,Green,13,3.604558


In [389]:
df['Color'].dtype

dtype('O')

In [390]:
color_cat = df['Color'].astype('category')

In [391]:
color_cat

0       Red
1      Blue
2       Red
3     Green
4      Blue
5       Red
6       Red
7      Blue
8       Red
9     Green
10     Blue
11      Red
Name: Color, dtype: category
Categories (3, object): ['Blue', 'Green', 'Red']

In [392]:
c = color_cat.array

In [393]:
type(c)

pandas.core.arrays.categorical.Categorical

In [394]:
c.categories

Index(['Blue', 'Green', 'Red'], dtype='object')

In [395]:
c.codes

array([2, 0, 2, 1, 0, 2, 2, 0, 2, 1, 0, 2], dtype=int8)

In [396]:
dict(enumerate(c.categories))

{0: 'Blue', 1: 'Green', 2: 'Red'}

In [397]:
df['Color'] = df['Color'].astype('category')

In [398]:
df

Unnamed: 0,bucket_id,Color,count,volume
0,0,Red,10,2.731922
1,1,Blue,14,0.751767
2,2,Red,6,1.469566
3,3,Green,18,3.84782
4,4,Blue,14,3.833262
5,5,Red,12,1.538547
6,6,Red,10,3.09955
7,7,Blue,15,1.62862
8,8,Red,12,3.568198
9,9,Green,13,3.604558


In [399]:
fruit = pd.Categorical(['apple', 'mango', 'apple', 'orange', 'orange'],
                       categories = ['mango', 'apple', 'orange'],
                       ordered = True)

In [400]:
fruit

['apple', 'mango', 'apple', 'orange', 'orange']
Categories (3, object): ['mango' < 'apple' < 'orange']

In [401]:
categories = ['foo', 'bar', 'baz']

In [402]:
codes = [0, 1, 2, 0, 0, 1]

In [403]:
my_cats = pd.Categorical.from_codes(codes, categories, ordered = True)

In [404]:
my_cats

['foo', 'bar', 'baz', 'foo', 'foo', 'bar']
Categories (3, object): ['foo' < 'bar' < 'baz']

####  Computations with Categoricals

In [405]:
# let's take example
data = np.random.standard_normal(1000)
df = pd.DataFrame({'draws': data})

In [406]:
df.head()

Unnamed: 0,draws
0,-0.710262
1,-1.051959
2,0.881355
3,0.333216
4,-0.001387


In [407]:
df['Quartile'] = pd.qcut(df['draws'], 4, labels = ['Q1', 'Q2', 'Q3', 'Q4'])

In [408]:
df.head()

Unnamed: 0,draws,Quartile
0,-0.710262,Q1
1,-1.051959,Q1
2,0.881355,Q4
3,0.333216,Q3
4,-0.001387,Q2


In [409]:
result = df.groupby('Quartile')['draws'].agg(['count', 'min', 'max', 'mean'])

  result = df.groupby('Quartile')['draws'].agg(['count', 'min', 'max', 'mean'])


In [410]:
result

Unnamed: 0_level_0,count,min,max,mean
Quartile,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Q1,250,-3.315164,-0.689451,-1.271944
Q2,250,-0.687701,0.026794,-0.296266
Q3,250,0.032041,0.706783,0.34033
Q4,250,0.709675,3.219936,1.250098


In [411]:
n = 10000000

In [412]:
labels = pd.Series(['red', 'blue', 'red', 'orange'] * (n//2))

In [413]:
labels.memory_usage(deep = True)

1220000132

In [414]:
c = labels.astype('category')

In [415]:
c.memory_usage(deep = True)

20000424

In [416]:
labels.memory_usage(deep = True)/c.memory_usage(deep = True)

60.99871342727534

In [417]:
s = pd.Series(['A', 'B', 'A', 'C'] * 2)

In [418]:
c = s.astype('category')

In [419]:
c

0    A
1    B
2    A
3    C
4    A
5    B
6    A
7    C
dtype: category
Categories (3, object): ['A', 'B', 'C']

In [420]:
c.cat.codes

0    0
1    1
2    0
3    2
4    0
5    1
6    0
7    2
dtype: int8

In [421]:
c.cat.categories

Index(['A', 'B', 'C'], dtype='object')

In [422]:
c.value_counts()

A    4
B    2
C    2
Name: count, dtype: int64

In [423]:
s = pd.Series(['a', 'b', 'c', 'd'] * 2, dtype='category')

In [424]:
s

0    a
1    b
2    c
3    d
4    a
5    b
6    c
7    d
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [425]:
pd.get_dummies(s)

Unnamed: 0,a,b,c,d
0,True,False,False,False
1,False,True,False,False
2,False,False,True,False
3,False,False,False,True
4,True,False,False,False
5,False,True,False,False
6,False,False,True,False
7,False,False,False,True


In [426]:
s = pd.Series(["a", "b", "a", "c", "b", "a", None])

In [427]:
s_cat = s.astype('category')

In [428]:
s_cat

0      a
1      b
2      a
3      c
4      b
5      a
6    NaN
dtype: category
Categories (3, object): ['a', 'b', 'c']

In [429]:
s_cat_reordered = s_cat.cat.reorder_categories(["a", "b", "c"], ordered=True)

In [430]:
s_cat_reordered

0      a
1      b
2      a
3      c
4      b
5      a
6    NaN
dtype: category
Categories (3, object): ['a' < 'b' < 'c']

In [431]:
s_cat_renamed = s_cat.cat.rename_categories(lambda x: x.upper())

In [432]:
s_cat_renamed

0      A
1      B
2      A
3      C
4      B
5      A
6    NaN
dtype: category
Categories (3, object): ['A', 'B', 'C']

In [433]:
s_cat_extended = s_cat.cat.add_categories(["d"])

In [434]:
s_cat_extended

0      a
1      b
2      a
3      c
4      b
5      a
6    NaN
dtype: category
Categories (4, object): ['a', 'b', 'c', 'd']

In [435]:
s_cat_reduced = s_cat_extended.cat.remove_categories(["c"])

In [436]:
s_cat_reduced

0      a
1      b
2      a
3    NaN
4      b
5      a
6    NaN
dtype: category
Categories (3, object): ['a', 'b', 'd']

In [437]:
s_cat_ordered = s_cat.cat.as_ordered()

In [438]:
s_cat_ordered.cat.ordered

True

# Examples

In [439]:
data = {
    'CustomerID': range(1, 11),
    'Region': ['North', 'South', 'East', 'West', 'North', 'East', 'South', 'West', 'North', 'East'],
    'Customer_Type': ['Regular', 'Premium', 'Regular', 'VIP', 'Regular', 'Premium', 'VIP', 'Regular', 'VIP', 'Premium'],
    'Purchase_Amount': [250, 400, 150, 600, 300, 350, 700, 200, 500, 450]
}
df = pd.DataFrame(data)

In [440]:
df

Unnamed: 0,CustomerID,Region,Customer_Type,Purchase_Amount
0,1,North,Regular,250
1,2,South,Premium,400
2,3,East,Regular,150
3,4,West,VIP,600
4,5,North,Regular,300
5,6,East,Premium,350
6,7,South,VIP,700
7,8,West,Regular,200
8,9,North,VIP,500
9,10,East,Premium,450


In [441]:
# converting "Region" and "Customer_Type" to categorical
df['Region'] = df['Region'].astype('category')
df['Customer_Type'] = df['Customer_Type'].astype('category')

In [442]:
# Group by Region and Customer_Type to compute total purchase amount
total = df.groupby(['Region', 'Customer_Type'], observed=False)['Purchase_Amount'].sum()

In [443]:
total

Region  Customer_Type
East    Premium          800
        Regular          150
        VIP                0
North   Premium            0
        Regular          550
        VIP              500
South   Premium          400
        Regular            0
        VIP              700
West    Premium            0
        Regular          200
        VIP              600
Name: Purchase_Amount, dtype: int64

In [444]:
data = {
    'Respondent': range(1239, 2239),
    'Satisfaction': np.random.choice(['Good', 'Poor', 'Excellent', 'Fair', 'Good', 'Fair', 'Excellent'], 1000)
}
df = pd.DataFrame(data)

In [445]:
df.head()

Unnamed: 0,Respondent,Satisfaction
0,1239,Fair
1,1240,Good
2,1241,Fair
3,1242,Good
4,1243,Good


In [446]:
# change the data type of "Satisfaction" to "category"
df['Satisfaction'] = df['Satisfaction'].astype('category')

In [447]:
df['Satisfaction'] = df['Satisfaction'].cat.reorder_categories(['Poor', 'Fair', 'Good', 'Excellent'], ordered = True)

In [448]:
df.sort_values(by = 'Satisfaction')

Unnamed: 0,Respondent,Satisfaction
896,2135,Poor
822,2061,Poor
113,1352,Poor
219,1458,Poor
442,1681,Poor
...,...,...
735,1974,Excellent
339,1578,Excellent
341,1580,Excellent
323,1562,Excellent


In [449]:
df['Satisfaction'].value_counts().sort_index()

Satisfaction
Poor         135
Fair         285
Good         306
Excellent    274
Name: count, dtype: int64