In [5]:
import pandas as pd
data = {
    'name': ['Ram ', 'Shyam', 'Hari', None, 'SITA'],
    'age': ['25', '30', None, '40', '-5'],
    'salary': [20000, 25000, None, 40000, 15000],
    'city': ['ktm', 'KTM', 'pokhara', 'Biratnagar', None],
    'join_date': ['2020/01/01', '2021/05/03', None, '2019/12/15', '2021/01/20']
}

df = pd.DataFrame(data)

print("Uncleaned data:")
print(df)

Uncleaned data:
    name   age   salary        city   join_date
0   Ram     25  20000.0         ktm  2020/01/01
1  Shyam    30  25000.0         KTM  2021/05/03
2   Hari  None      NaN     pokhara        None
3   None    40  40000.0  Biratnagar  2019/12/15
4   SITA    -5  15000.0        None  2021/01/20


In [6]:
# Step 1: Clean Text
df['name'] = df['name'].str.strip().str.title()

# Step 2: Fix Data Types
df['age'] = pd.to_numeric(df['age'], errors='coerce')
df['join_date'] = pd.to_datetime(df['join_date'], errors='coerce')

# Step 3: Handle Missing Values
df['age'] = df['age'].fillna(df['age'].median())
df['salary'] = df['salary'].fillna(df['salary'].median())
df['city'] = df['city'].fillna('Unknown')

# Step 4: Correct Incorrect Values**
df.loc[df['age'] < 0, 'age'] = df['age'].median()

# Step 5: Fix Categorical Inconsistency
df['city'] = df['city'].str.title()
df['city'].replace({'Ktm': 'Kathmandu', 'Ktm ': 'Kathmandu', \
    'KTM': 'Kathmandu'}, inplace=True)

# Step 6: Feature Engineering
df['year_joined'] = df['join_date'].dt.year
df['salary_lakh'] = df['salary'] / 100000

print("Cleaned data:")
print(df)

Cleaned data:
    name   age   salary        city  join_date  year_joined  salary_lakh
0    Ram  25.0  20000.0   Kathmandu 2020-01-01       2020.0        0.200
1  Shyam  30.0  25000.0   Kathmandu 2021-05-03       2021.0        0.250
2   Hari  27.5  22500.0     Pokhara        NaT          NaN        0.225
3   None  40.0  40000.0  Biratnagar 2019-12-15       2019.0        0.400
4   Sita  27.5  15000.0     Unknown 2021-01-20       2021.0        0.150


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['city'].replace({'Ktm': 'Kathmandu', 'Ktm ': 'Kathmandu', \
