In [16]:
import pandas as pd
import numpy as np
# Sample data with missing values: the last row is entirely missing and
# row 3 has no Location.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0 and
# raises AttributeError there.
df = pd.DataFrame.from_dict({
    'Name': ['Nik', 'Kate', 'Evan', 'Kyra', np.nan],
    'Age': [33, 32, 40, 57, np.nan],
    'Location': ['Toronto', 'London', 'New York', np.nan, np.nan]
})
print(df)

# Returns:
#    Name   Age  Location
# 0   Nik  33.0   Toronto
# 1  Kate  32.0    London
# 2  Evan  40.0  New York
# 3  Kyra  57.0       NaN
# 4   NaN   NaN       NaN

   Name   Age  Location
0   Nik  33.0   Toronto
1  Kate  32.0    London
2  Evan  40.0  New York
3  Kyra  57.0       NaN
4   NaN   NaN       NaN


In [2]:
# Boolean mask of the same shape as df: True wherever a value is missing.
missing_mask = df.isna()
print(missing_mask)

    Name    Age  Location
0  False  False     False
1  False  False     False
2  False  False     False
3  False  False      True
4   True   True      True


In [3]:
# Count the missing values in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

Name        1
Age         1
Location    2
dtype: int64


In [4]:
# Overview of the DataFrame.dropna() parameters, shown with their defaults.
# `thresh` is described in a comment instead of being passed: recent pandas
# versions raise a TypeError when `how` and `thresh` are both supplied.
#   thresh=<int>  -> keep only rows/columns that have at least this many
#                    non-missing values (use it instead of `how`)
df.dropna(
    axis=0,         # 0 drops rows, 1 drops columns
    how='any',      # Drop if 'any' value is missing, or only if 'all' values are missing
    subset=None,    # Labels along the other axis to check for missing values
)

Unnamed: 0,Name,Age,Location
0,Nik,33.0,Toronto
1,Kate,32.0,London
2,Evan,40.0,New York


In [5]:
# Drop every row that contains at least one missing value.
# The result gets its own name: overwriting `df` would discard the missing
# values that the following cells still need for their examples.
df_no_missing = df.dropna()
print(df_no_missing)
# Returns:
#    Name   Age  Location
# 0   Nik  33.0   Toronto
# 1  Kate  32.0    London
# 2  Evan  40.0  New York

   Name   Age  Location
0   Nik  33.0   Toronto
1  Kate  32.0    London
2  Evan  40.0  New York


In [8]:
# Drop only the rows in which every value is missing.
# The result gets its own name so that `df` keeps its missing values for
# the fillna examples in the following cells.
df_no_empty_rows = df.dropna(how='all')
print(df_no_empty_rows)

   Name   Age  Location
0   Nik  33.0   Toronto
1  Kate  32.0    London
2  Evan  40.0  New York
3  Kyra  57.0       NaN


In [11]:
# Replace every missing value, in every column, with 0.
# The result gets its own name so that `df` still contains missing values
# for the other fillna examples.
df_zero_filled = df.fillna(0)
print(df_zero_filled)

   Name   Age  Location
0   Nik  33.0   Toronto
1  Kate  32.0    London
2  Evan  40.0  New York
3  Kyra  57.0         0
4     0   0.0         0


In [14]:
# Fill each column with its own replacement value (column name -> fill value).
# The result gets its own name so that `df` still contains missing values
# for the next fillna example.
fill_values = {'Name': 'Someone', 'Age': 25, 'Location': 'USA'}
df_defaults_filled = df.fillna(fill_values)
print(df_defaults_filled)

      Name   Age  Location
0      Nik  33.0   Toronto
1     Kate  32.0    London
2     Evan  40.0  New York
3     Kyra  57.0       USA
4  Someone  25.0       USA


In [17]:
# Replace the missing ages with the mean of the ages that are present.
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(mean_age)
print(df)

   Name   Age  Location
0   Nik  33.0   Toronto
1  Kate  32.0    London
2  Evan  40.0  New York
3  Kyra  57.0       NaN
4   NaN  40.5       NaN


In [18]:
import pandas as pd
# Sample data with duplicates: row 4 repeats row 0 exactly, and rows 1 and 5
# share the same Name and Age but differ in Location and Date Modified.
# The dates are written with plain ASCII hyphens so that they are valid
# ISO 8601 strings (typographic hyphens cannot be parsed as dates).
df = pd.DataFrame.from_dict({
    'Name': ['Nik', 'Kate', 'Evan', 'Kyra', 'Nik', 'Kate'],
    'Age': [33, 32, 40, 57, 33, 32],
    'Location': ['Toronto', 'London', 'New York', 'Atlanta', 'Toronto', 'Paris'],
    'Date Modified': ['2022-01-01', '2022-02-24', '2022-08-12', '2022-09-12', '2022-01-01', '2022-12-09']
})
print(df)

   Name  Age  Location Date Modified
0   Nik   33   Toronto    2022‐01‐01
1  Kate   32    London    2022‐02‐24
2  Evan   40  New York    2022‐08‐12
3  Kyra   57   Atlanta    2022‐09‐12
4   Nik   33   Toronto    2022‐01‐01
5  Kate   32     Paris    2022‐12‐09


In [19]:
# Flag each row that exactly repeats an earlier row.
is_repeat = df.duplicated()
print(is_repeat)

0    False
1    False
2    False
3    False
4     True
5    False
dtype: bool


In [20]:
# Count how many rows exactly repeat an earlier row.
duplicate_count = df.duplicated().sum()
print(duplicate_count)

1


In [21]:
# Overview of the DataFrame.drop_duplicates() parameters, every one of them
# passed with the value used in this call.
df.drop_duplicates(
    ignore_index=False,  # Whether to relabel the index
    inplace=False,       # Whether to drop in place
    keep='first',        # Which duplicate record to keep
    subset=None,         # Which columns to consider
)

Unnamed: 0,Name,Age,Location,Date Modified
0,Nik,33,Toronto,2022‐01‐01
1,Kate,32,London,2022‐02‐24
2,Evan,40,New York,2022‐08‐12
3,Kyra,57,Atlanta,2022‐09‐12
5,Kate,32,Paris,2022‐12‐09


In [22]:
# Keep only the rows that do not exactly repeat an earlier row.
df = df[~df.duplicated()]
print(df)

   Name  Age  Location Date Modified
0   Nik   33   Toronto    2022‐01‐01
1  Kate   32    London    2022‐02‐24
2  Evan   40  New York    2022‐08‐12
3  Kyra   57   Atlanta    2022‐09‐12
5  Kate   32     Paris    2022‐12‐09


In [23]:
# Sort by 'Date Modified' in descending order, then keep the first row of
# every Name/Age pair, i.e. the one with the latest date string.
df = (
    df
    .sort_values(by='Date Modified', ascending=False)
    .drop_duplicates(subset=['Name', 'Age'], keep='first')
)
print(df)

   Name  Age  Location Date Modified
5  Kate   32     Paris    2022‐12‐09
3  Kyra   57   Atlanta    2022‐09‐12
2  Evan   40  New York    2022‐08‐12
0   Nik   33   Toronto    2022‐01‐01


In [26]:
import pandas as pd
# Sample data with messy text: "Last, First" names, a 'Region ' prefix,
# inconsistent capitalisation in Location and stray whitespace around the
# favorite colors.
df = pd.DataFrame(
    data={
        'Name': ['Tranter, Melvyn', 'Lana, Courtney', 'Abel, Shakti', 'Vasu, Imogene', 'Aravind, Shelly'],
        'Region': ['Region A', 'Region A', 'Region B', 'Region C', 'Region D'],
        'Location': ['TORONTO', 'LONDON', 'New york', 'ATLANTA', 'toronto'],
        'Favorite Color': ['   green  ', 'red', '  yellow', 'blue', 'purple  '],
    }
)
print(df)
# Returns:
#               Name    Region  Location Favorite Color
# 0  Tranter, Melvyn  Region A   TORONTO        green  
# 1   Lana, Courtney  Region A    LONDON            red
# 2     Abel, Shakti  Region B  New york         yellow
# 3    Vasu, Imogene  Region C   ATLANTA           blue
# 4  Aravind, Shelly  Region D   toronto       purple

              Name    Region  Location Favorite Color
0  Tranter, Melvyn  Region A   TORONTO        green  
1   Lana, Courtney  Region A    LONDON            red
2     Abel, Shakti  Region B  New york         yellow
3    Vasu, Imogene  Region C   ATLANTA           blue
4  Aravind, Shelly  Region D   toronto       purple  


In [27]:
# Remove the leading and trailing whitespace from every favorite color.
stripped_colors = df['Favorite Color'].str.strip()
df['Favorite Color'] = stripped_colors
print(df)

              Name    Region  Location Favorite Color
0  Tranter, Melvyn  Region A   TORONTO          green
1   Lana, Courtney  Region A    LONDON            red
2     Abel, Shakti  Region B  New york         yellow
3    Vasu, Imogene  Region C   ATLANTA           blue
4  Aravind, Shelly  Region D   toronto         purple


In [28]:
# Preview the split: each name becomes a list of the pieces around the comma.
name_pieces = df['Name'].str.split(',')
print(name_pieces)

0    [Tranter,  Melvyn]
1     [Lana,  Courtney]
2       [Abel,  Shakti]
3      [Vasu,  Imogene]
4    [Aravind,  Shelly]
Name: Name, dtype: object


In [29]:
# Split "Last, First" into two new columns.
# Splitting on ',' alone leaves the space that follows the comma at the start
# of every first name, so each part is stripped before it is stored.
name_parts = df['Name'].str.split(',', expand=True)
df[['Last Name', 'First Name']] = name_parts.apply(lambda part: part.str.strip())
print(df)

              Name    Region  Location Favorite Color Last Name First Name
0  Tranter, Melvyn  Region A   TORONTO          green   Tranter     Melvyn
1   Lana, Courtney  Region A    LONDON            red      Lana   Courtney
2     Abel, Shakti  Region B  New york         yellow      Abel     Shakti
3    Vasu, Imogene  Region C   ATLANTA           blue      Vasu    Imogene
4  Aravind, Shelly  Region D   toronto         purple   Aravind     Shelly


In [30]:
# Remove the 'Region ' prefix so that only the region letter is left.
region_letters = df['Region'].str.replace('Region ', '')
df['Region'] = region_letters
print(df)
# Returns:
#               Name Region  Location Favorite Color Last Name First Name
# 0  Tranter, Melvyn      A   TORONTO          green   Tranter     Melvyn
# 1   Lana, Courtney      A    LONDON            red      Lana   Courtney
# 2     Abel, Shakti      B  New york         yellow      Abel     Shakti
# 3    Vasu, Imogene      C   ATLANTA           blue      Vasu    Imogene
# 4  Aravind, Shelly      D   toronto         purple   Aravind     Shelly

              Name Region  Location Favorite Color Last Name First Name
0  Tranter, Melvyn      A   TORONTO          green   Tranter     Melvyn
1   Lana, Courtney      A    LONDON            red      Lana   Courtney
2     Abel, Shakti      B  New york         yellow      Abel     Shakti
3    Vasu, Imogene      C   ATLANTA           blue      Vasu    Imogene
4  Aravind, Shelly      D   toronto         purple   Aravind     Shelly


In [31]:
# Give every location consistent capitalisation (first letter of each word).
title_cased = df['Location'].str.title()
df['Location'] = title_cased
print(df)
# Returns:
#               Name Region  Location Favorite Color Last Name First Name
# 0  Tranter, Melvyn      A   Toronto          green   Tranter     Melvyn
# 1   Lana, Courtney      A    London            red      Lana   Courtney
# 2     Abel, Shakti      B  New York         yellow      Abel     Shakti
# 3    Vasu, Imogene      C   Atlanta           blue      Vasu    Imogene
# 4  Aravind, Shelly      D   Toronto         purple   Aravind     Shelly

              Name Region  Location Favorite Color Last Name First Name
0  Tranter, Melvyn      A   Toronto          green   Tranter     Melvyn
1   Lana, Courtney      A    London            red      Lana   Courtney
2     Abel, Shakti      B  New York         yellow      Abel     Shakti
3    Vasu, Imogene      C   Atlanta           blue      Vasu    Imogene
4  Aravind, Shelly      D   Toronto         purple   Aravind     Shelly


In [33]:
import pandas as pd
import numpy as np
# Exercise data: "Last; First" names, inconsistently capitalised locations
# and missing values in Location and Sales.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0 and
# raises AttributeError there.
df = pd.DataFrame.from_dict({
    'Name': ['Tranter; Melvyn', 'Lana; Courtney', 'Abel; Shakti', 'Vasu; Imogene', 'Johnson; Connor', 'Lee; Connor'],
    'Location': ['TORONTO', 'LONDON', 'New york', np.nan, 'toronto', 'Madrid'],
    'Sales': [123, 243, 654, np.nan, 345, np.nan]
})

In [34]:
# Show the frame as it currently stands.
print(df)

              Name  Location  Sales
0  Tranter; Melvyn   TORONTO  123.0
1   Lana; Courtney    LONDON  243.0
2     Abel; Shakti  New york  654.0
3    Vasu; Imogene       NaN    NaN
4  Johnson; Connor   toronto  345.0
5      Lee; Connor    Madrid    NaN


In [36]:
# Split "Last; First" into two new columns.
# Splitting on ';' alone leaves the space that follows the semicolon at the
# start of every first name, so each part is stripped before it is stored.
name_parts = df['Name'].str.split(';', expand=True)
df[['Last Name', 'First Name']] = name_parts.apply(lambda part: part.str.strip())
print(df)

              Name  Location  Sales Last Name First Name
0  Tranter; Melvyn   TORONTO  123.0   Tranter     Melvyn
1   Lana; Courtney    LONDON  243.0      Lana   Courtney
2     Abel; Shakti  New york  654.0      Abel     Shakti
3    Vasu; Imogene       NaN    NaN      Vasu    Imogene
4  Johnson; Connor   toronto  345.0   Johnson     Connor
5      Lee; Connor    Madrid    NaN       Lee     Connor


In [37]:
# For rows that share the same Name, keep only the last occurrence.
df = df.drop_duplicates(subset=['Name'], keep='last')

In [38]:
# Show the frame after dropping rows with a duplicated Name.
print(df)

              Name  Location  Sales Last Name First Name
0  Tranter; Melvyn   TORONTO  123.0   Tranter     Melvyn
1   Lana; Courtney    LONDON  243.0      Lana   Courtney
2     Abel; Shakti  New york  654.0      Abel     Shakti
3    Vasu; Imogene       NaN    NaN      Vasu    Imogene
4  Johnson; Connor   toronto  345.0   Johnson     Connor
5      Lee; Connor    Madrid    NaN       Lee     Connor


In [39]:
# Share of missing values per column: the mean of a boolean mask equals the
# number of True values divided by the number of rows.
missing_share = df.isna().mean()
print(missing_share)

Name          0.000000
Location      0.166667
Sales         0.333333
Last Name     0.000000
First Name    0.000000
dtype: float64
