# Inconsistencies in Data

In [24]:
import pandas as pd

In [25]:
# Toy records that deliberately contain the inconsistencies cleaned up below.
data = {
    # Four date strings written in four different layouts
    'date': ['2021-12-01', '01-12-2021', '2021/12/01', '12-01-2021'],
    # Four different spellings used in the country column
    'country': ['USA', 'U.S.A', 'America', 'United States'],
    # 'Jonh Doe' is a misspelling of 'John Doe'; 'Jane Doe' appears twice
    'name': ['John Doe', 'Jonh Doe', 'Jane Doe', 'Jane Doe'],
    # None marks a missing sales figure
    'sales_2020': [100, 200, None, 200],
    'sales_2021': [None, 150, 300, 150],
}

In [26]:
# Turn the column dictionary into a DataFrame (one column per key) and preview it
df = pd.DataFrame.from_dict(data)
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,John Doe,100.0,
1,01-12-2021,U.S.A,Jonh Doe,200.0,150.0
2,2021/12/01,America,Jane Doe,,300.0
3,12-01-2021,United States,Jane Doe,200.0,150.0


In [27]:
# Standardizing the date format
# errors='coerce' turns every string that cannot be parsed into NaT instead of
# raising. In the recorded run only row 0 survives and rows 1-3 become NaT --
# presumably because the format is inferred from the first value (pandas >= 2.0
# behaviour; confirm against the installed version).
df['date'] = df['date'].pipe(pd.to_datetime, errors='coerce')
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,John Doe,100.0,
1,NaT,U.S.A,Jonh Doe,200.0,150.0
2,NaT,America,Jane Doe,,300.0
3,NaT,United States,Jane Doe,200.0,150.0


In [28]:
# Manually setting missing dates
# Rows 1, 2 and 3 are overwritten by hand with the same date as row 0.
df.loc[[1, 2, 3], 'date'] = '2021-12-01'

# Converting the new entries to datetime format
# (re-parsing the whole column leaves it as datetime values throughout)
df['date'] = df['date'].pipe(pd.to_datetime)

# Displaying the updated DataFrame
df.head()


Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,USA,John Doe,100.0,
1,2021-12-01,U.S.A,Jonh Doe,200.0,150.0
2,2021-12-01,America,Jane Doe,,300.0
3,2021-12-01,United States,Jane Doe,200.0,150.0


In [29]:
# Harmonize the name of country
# Every known alias is mapped to the single spelling 'United States';
# values that are not listed as keys are left untouched by replace().
country_mapping = {alias: 'United States' for alias in ('USA', 'U.S.A', 'America')}
df['country'] = df['country'].replace(to_replace=country_mapping)

In [30]:
# Preview the frame to check the country column after harmonization
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,United States,John Doe,100.0,
1,2021-12-01,United States,Jonh Doe,200.0,150.0
2,2021-12-01,United States,Jane Doe,,300.0
3,2021-12-01,United States,Jane Doe,200.0,150.0


In [31]:
# Correct the typographical mistakes in name
# Only exact matches of the misspelled value are rewritten.
df['name'] = df['name'].replace('Jonh Doe', 'John Doe')
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,United States,John Doe,100.0,
1,2021-12-01,United States,John Doe,200.0,150.0
2,2021-12-01,United States,Jane Doe,,300.0
3,2021-12-01,United States,Jane Doe,200.0,150.0


In [32]:
# Resolving contradictory data
# Rows whose 2021 sales are less than or equal to their 2020 sales are removed.
# A comparison involving NaN evaluates to False, so rows with a missing figure
# in either column are kept.
df = df[~(df['sales_2021'] <= df['sales_2020'])]
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,United States,John Doe,100.0,
2,2021-12-01,United States,Jane Doe,,300.0


In [14]:
# Remove duplicates
# Rows are compared on 'name' only; the first row seen for each name is kept.
df = df.drop_duplicates(subset=['name'], keep='first')
df.head()

Unnamed: 0,date,country,name,sales_2020,sales_2021
0,2021-12-01,United States,John Doe,100.0,
2,2021-12-01,United States,Jane Doe,,300.0
