In [14]:
import pandas as pd
import numpy as np

data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Edward', 'Alice', 'George', np.nan, 'Ivy', 'Jack'],
    'Age': [25, 30, 35, 40, np.nan, 25, 50, 60, 55, 'Twenty'],
    'Salary': ['50000$', '60000$', '70000$', '80000$', '90000$', '50000$', np.nan, '120000$', '110000$', '100000$'],
    'JoinDate': ['2020-01-15', '2019-05-20', '2018-03-10', np.nan, '2021-07-01', '2020-01-15', '2017-11-25', '2016-08-15', '2015-04-10', '2014-12-30']
}

df = pd.DataFrame(data)
df.to_csv('sample_data.csv', index=False)


In [15]:
df = pd.read_csv("sample_data.csv")
df

Unnamed: 0,Name,Age,Salary,JoinDate
0,Alice,25,50000$,2020-01-15
1,Bob,30,60000$,2019-05-20
2,Charlie,35,70000$,2018-03-10
3,David,40,80000$,
4,Edward,,90000$,2021-07-01
5,Alice,25,50000$,2020-01-15
6,George,50,,2017-11-25
7,,60,120000$,2016-08-15
8,Ivy,55,110000$,2015-04-10
9,Jack,Twenty,100000$,2014-12-30


In [16]:
# Handling Missing Values
print("Missing values are:")
print(df.isnull().sum())
df

Missing values are:
Name        1
Age         1
Salary      1
JoinDate    1
dtype: int64


Unnamed: 0,Name,Age,Salary,JoinDate
0,Alice,25,50000$,2020-01-15
1,Bob,30,60000$,2019-05-20
2,Charlie,35,70000$,2018-03-10
3,David,40,80000$,
4,Edward,,90000$,2021-07-01
5,Alice,25,50000$,2020-01-15
6,George,50,,2017-11-25
7,,60,120000$,2016-08-15
8,Ivy,55,110000$,2015-04-10
9,Jack,Twenty,100000$,2014-12-30


In [17]:
# Fill missing values in 'Age' with the mean age
df['Age']=pd.to_numeric(df['Age'],errors = 'coerce')
df['Age'].fillna(df['Age'].mean(),inplace = True)
df

Unnamed: 0,Name,Age,Salary,JoinDate
0,Alice,25.0,50000$,2020-01-15
1,Bob,30.0,60000$,2019-05-20
2,Charlie,35.0,70000$,2018-03-10
3,David,40.0,80000$,
4,Edward,40.0,90000$,2021-07-01
5,Alice,25.0,50000$,2020-01-15
6,George,50.0,,2017-11-25
7,,60.0,120000$,2016-08-15
8,Ivy,55.0,110000$,2015-04-10
9,Jack,40.0,100000$,2014-12-30


In [18]:
# Fill missing values in 'JoinDate' with a default date
df['JoinDate'].fillna('2022-01-01',inplace = True)
df

Unnamed: 0,Name,Age,Salary,JoinDate
0,Alice,25.0,50000$,2020-01-15
1,Bob,30.0,60000$,2019-05-20
2,Charlie,35.0,70000$,2018-03-10
3,David,40.0,80000$,2022-01-01
4,Edward,40.0,90000$,2021-07-01
5,Alice,25.0,50000$,2020-01-15
6,George,50.0,,2017-11-25
7,,60.0,120000$,2016-08-15
8,Ivy,55.0,110000$,2015-04-10
9,Jack,40.0,100000$,2014-12-30


In [19]:
# Dorp rows with missing "Name"
df.dropna(subset=['Name'],inplace=True)
df

Unnamed: 0,Name,Age,Salary,JoinDate
0,Alice,25.0,50000$,2020-01-15
1,Bob,30.0,60000$,2019-05-20
2,Charlie,35.0,70000$,2018-03-10
3,David,40.0,80000$,2022-01-01
4,Edward,40.0,90000$,2021-07-01
5,Alice,25.0,50000$,2020-01-15
6,George,50.0,,2017-11-25
8,Ivy,55.0,110000$,2015-04-10
9,Jack,40.0,100000$,2014-12-30


In [21]:
#  Removing Duplicates
df.drop_duplicates(inplace=True)
df

Unnamed: 0,Name,Age,Salary,JoinDate
0,Alice,25.0,50000$,2020-01-15
1,Bob,30.0,60000$,2019-05-20
2,Charlie,35.0,70000$,2018-03-10
3,David,40.0,80000$,2022-01-01
4,Edward,40.0,90000$,2021-07-01
6,George,50.0,,2017-11-25
8,Ivy,55.0,110000$,2015-04-10
9,Jack,40.0,100000$,2014-12-30


In [22]:
# Renaming Columns
df.rename(columns={'JoinDate':'Join_Date'},inplace=True)
df

Unnamed: 0,Name,Age,Salary,Join_Date
0,Alice,25.0,50000$,2020-01-15
1,Bob,30.0,60000$,2019-05-20
2,Charlie,35.0,70000$,2018-03-10
3,David,40.0,80000$,2022-01-01
4,Edward,40.0,90000$,2021-07-01
6,George,50.0,,2017-11-25
8,Ivy,55.0,110000$,2015-04-10
9,Jack,40.0,100000$,2014-12-30


In [23]:
# Changing Data Types
df['Join_Date']=pd.to_datetime(df['Join_Date'])
df['Salary']=df['Salary'].str.replace('$','').astype(float)
df

Unnamed: 0,Name,Age,Salary,Join_Date
0,Alice,25.0,50000.0,2020-01-15
1,Bob,30.0,60000.0,2019-05-20
2,Charlie,35.0,70000.0,2018-03-10
3,David,40.0,80000.0,2022-01-01
4,Edward,40.0,90000.0,2021-07-01
6,George,50.0,,2017-11-25
8,Ivy,55.0,110000.0,2015-04-10
9,Jack,40.0,100000.0,2014-12-30


In [24]:
# Filtering Data
# For demonstration, we'll keep rows where Age is greater than 30
df = df[df['Age']>30]
df

Unnamed: 0,Name,Age,Salary,Join_Date
2,Charlie,35.0,70000.0,2018-03-10
3,David,40.0,80000.0,2022-01-01
4,Edward,40.0,90000.0,2021-07-01
6,George,50.0,,2017-11-25
8,Ivy,55.0,110000.0,2015-04-10
9,Jack,40.0,100000.0,2014-12-30


In [27]:
df['Name'].replace('George','Greg',inplace = True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Name'].replace('George','Greg',inplace = True)


Unnamed: 0,Name,Age,Salary,Join_Date
2,Charlie,35.0,70000.0,2018-03-10
3,David,40.0,80000.0,2022-01-01
4,Edward,40.0,90000.0,2021-07-01
6,Greg,50.0,,2017-11-25
8,Ivy,55.0,110000.0,2015-04-10
9,Jack,40.0,100000.0,2014-12-30


In [29]:
#  Working with Dates
df['Year_Joined'] = df['Join_Date'].dt.year
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Year_Joined'] = df['Join_Date'].dt.year


Unnamed: 0,Name,Age,Salary,Join_Date,Year_Joined
2,Charlie,35.0,70000.0,2018-03-10,2018
3,David,40.0,80000.0,2022-01-01,2022
4,Edward,40.0,90000.0,2021-07-01,2021
6,Greg,50.0,,2017-11-25,2017
8,Ivy,55.0,110000.0,2015-04-10,2015
9,Jack,40.0,100000.0,2014-12-30,2014


In [30]:
# Aggregating Data
age_salary_mean=df.groupby('Year_Joined')['Salary'].mean()
df

Unnamed: 0,Name,Age,Salary,Join_Date,Year_Joined
2,Charlie,35.0,70000.0,2018-03-10,2018
3,David,40.0,80000.0,2022-01-01,2022
4,Edward,40.0,90000.0,2021-07-01,2021
6,Greg,50.0,,2017-11-25,2017
8,Ivy,55.0,110000.0,2015-04-10,2015
9,Jack,40.0,100000.0,2014-12-30,2014


In [31]:
# Handling Outliers
df['Salary']=df['Salary'].clip(lower=50000,upper=100000)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Salary']=df['Salary'].clip(lower=50000,upper=100000)


Unnamed: 0,Name,Age,Salary,Join_Date,Year_Joined
2,Charlie,35.0,70000.0,2018-03-10,2018
3,David,40.0,80000.0,2022-01-01,2022
4,Edward,40.0,90000.0,2021-07-01,2021
6,Greg,50.0,,2017-11-25,2017
8,Ivy,55.0,100000.0,2015-04-10,2015
9,Jack,40.0,100000.0,2014-12-30,2014


In [33]:
# Display the cleaned dataframe
print("\Cleaned DataFrame:")
print(df)

\Cleaned DataFrame:
      Name   Age    Salary  Join_Date  Year_Joined
2  Charlie  35.0   70000.0 2018-03-10         2018
3    David  40.0   80000.0 2022-01-01         2022
4   Edward  40.0   90000.0 2021-07-01         2021
6     Greg  50.0       NaN 2017-11-25         2017
8      Ivy  55.0  100000.0 2015-04-10         2015
9     Jack  40.0  100000.0 2014-12-30         2014


In [36]:
# Save the cleaned dataframe
df.to_csv('cleaned_datset',index=False)