In [1]:
data = { 'date': ['2021-12-01', '01-12-2022', 'dec-22-2022', '2021/12/12'],
        'country': ['USA', 'UK', 'United States of America', 'UK'],
         'name': ['nazra', 'nazra', 'Nazar', 'naz'],
         'age': [21, 22, 23, 24],
         'city': ['lahore', 'karach', 'lahore', 'lahore'],
         'sale': [100, None, 300, 400] }

data

{'date': ['2021-12-01', '01-12-2022', 'dec-22-2022', '2021/12/12'],
 'country': ['USA', 'UK', 'United States of America', 'UK'],
 'name': ['nazra', 'nazra', 'Nazar', 'naz'],
 'age': [21, 22, 23, 24],
 'city': ['lahore', 'karach', 'lahore', 'lahore'],
 'sale': [100, None, 300, 400]}

In [12]:
# convert data into pandas dataframe
import pandas as pd
df = pd.DataFrame(data)
df

Unnamed: 0,date,country,name,age,city,sale
0,2021-12-01,USA,nazra,21,lahore,100.0
1,01-12-2022,UK,nazra,22,karach,
2,dec-22-2022,United States of America,Nazar,23,lahore,300.0
3,2021/12/12,UK,naz,24,lahore,400.0


In [13]:
# 1. Standardize the date format
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df['date'] = df['date'].dt.strftime('%Y-%m-%d')
df

Unnamed: 0,date,country,name,age,city,sale
0,2021-12-01,USA,nazra,21,lahore,100.0
1,,UK,nazra,22,karach,
2,,United States of America,Nazar,23,lahore,300.0
3,,UK,naz,24,lahore,400.0


In [14]:
# 2. Harmonize the name of the country
country_mapping = { 'USA': 'United States of America',
                    'United States of America': 'United States of America',
                    'UK': 'United Kingdom' }
df['country'] = df['country'].replace(country_mapping)
df

Unnamed: 0,date,country,name,age,city,sale
0,2021-12-01,United States of America,nazra,21,lahore,100.0
1,,United Kingdom,nazra,22,karach,
2,,United States of America,Nazar,23,lahore,300.0
3,,United Kingdom,naz,24,lahore,400.0


In [15]:
# 3. Correct typographical mistake in name
df['name'] = df['name'].replace({'naz': 'nazra', 'Nazar': 'nazra'})
df

Unnamed: 0,date,country,name,age,city,sale
0,2021-12-01,United States of America,nazra,21,lahore,100.0
1,,United Kingdom,nazra,22,karach,
2,,United States of America,nazra,23,lahore,300.0
3,,United Kingdom,nazra,24,lahore,400.0


In [17]:
# 4. Drop Duplicates
df.drop_duplicates(inplace=True, subset=['name', 'age', 'city'])
df

Unnamed: 0,date,country,name,age,city,sale
0,2021-12-01,United States of America,nazra,21,lahore,100.0
1,,United Kingdom,nazra,22,karach,
2,,United States of America,nazra,23,lahore,300.0
3,,United Kingdom,nazra,24,lahore,400.0


In [18]:
# 5. Drop Contradictory data
df = df[df['sale'] >= 0]
df

Unnamed: 0,date,country,name,age,city,sale
0,2021-12-01,United States of America,nazra,21,lahore,100.0
2,,United States of America,nazra,23,lahore,300.0
3,,United Kingdom,nazra,24,lahore,400.0
