## Cleaning Not-Null Values with Pandas

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Toy dataset with two deliberately invalid entries: '?' is not a valid
# value for the Sex column, and 290 is not a valid Age.
records = [
    ('M', 20),
    ('F', 30),
    ('F', 24),
    ('D', 290),
    ('?', 25),
]
df = pd.DataFrame(records, columns=['Sex', 'Age'])
df

Unnamed: 0,Sex,Age
0,M,20
1,F,30
2,F,24
3,D,290
4,?,25


### Finding Unique Values


In [9]:
# Show the frame again before inspecting the distinct values it holds
df

Unnamed: 0,Sex,Age
0,M,20
1,F,30
2,F,24
3,D,290
4,?,25


In [6]:
df['Sex'].unique()  # Each distinct value exactly once

array(['M', 'F', 'D', '?'], dtype=object)

In [8]:
df['Sex'].value_counts()  # How many times each distinct value occurs

F    2
?    1
D    1
M    1
Name: Sex, dtype: int64

In [11]:
# 'D' is a typo for 'F'. replace() returns a corrected Series; the result is
# not assigned back here, so df itself still contains the 'D'.
df['Sex'].replace(to_replace='D', value='F')

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [12]:
# A dict applies several substitutions at once; a key that does not occur
# in the column (here 'N') changes nothing.
df['Sex'].replace({'D': 'F', 'N': 'M'})

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [13]:
# Per-column corrections: the outer keys name the column, each inner dict
# maps a bad value to its replacement.
corrections = {
    'Sex': {'D': 'F', 'N': 'M'},
    'Age': {290: 29},
}
df.replace(corrections)

Unnamed: 0,Sex,Age
0,M,20
1,F,30
2,F,24
3,F,29
4,?,25


In [14]:
# Boolean mask: keep only the rows whose Age is above 100
df[df['Age'] > 100]

Unnamed: 0,Sex,Age
3,D,290


In [18]:
# Ages above 100 are treated as having one digit too many, so they are
# divided by ten. Note that true division produces float values.
too_old = df['Age'] > 100
df.loc[too_old, 'Age'] = df.loc[too_old, 'Age'] / 10
df

Unnamed: 0,Sex,Age
0,M,20.0
1,F,30.0
2,F,24.0
3,D,29.0
4,?,25.0


### Duplicates

In [19]:
# Ambassadors indexed by name; the values are the countries they represent.
# Several countries appear more than once, which makes this Series a handy
# example for duplicate detection.
ambassadors = pd.Series([
    'France',
    'United Kingdom',
    'United Kingdom',
    'Italy',
    'Germany',
    'Germany',
    'Germany',
], index=[
    'Gérard Araud',
    'Kim Darroch',
    'Peter Westmacott',
    'Armando Varricchio',
    'Peter Wittig',
    'Peter Ammon',
    'Klaus Scharioth',  # no trailing space, so the label can be looked up by name
])

In [20]:
# Display the Series: names form the index, countries are the values
ambassadors

Gérard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [21]:
ambassadors.duplicated()  # True for each repeat of a value already seen; its first occurrence stays False

Gérard Araud          False
Kim Darroch           False
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig          False
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [22]:
ambassadors.drop_duplicates()  # Keeps the first occurrence of each value and drops the later repeats

Gérard Araud                  France
Kim Darroch           United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
dtype: object

### Duplicates in DataFrames

In [23]:
# Player table with repeated names: rows 0 and 2 are identical, while
# row 4 repeats the name with a different position.
names = [
    'Kobe Bryant',
    'LeBron James',
    'Kobe Bryant',
    'Carmelo Anthony',
    'Kobe Bryant',
]
positions = ['SG', 'SF', 'SG', 'SF', 'SF']
players = pd.DataFrame({'Name': names, 'Pos': positions})

players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [25]:
players.duplicated()  # A row is flagged only when all of its columns match an earlier row

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [27]:
players.duplicated(subset=['Name'])  # subset limits the comparison to the listed columns (here only Name)

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [29]:
players.drop_duplicates()  # Drops rows that are complete copies of an earlier row

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [30]:
# Deduplicate on Name alone: only the first row for each player is kept
players.drop_duplicates(subset=['Name'])

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF


### Splitting Columns

In [31]:
# Each entry packs four underscore-separated fields into a single string;
# some entries also contain stray '?' marks and extra spaces.
raw_entries = [
    '1987_M_US _1',
    '1990?_M_UK_1',
    '1992_F_US_2',
    '1970?_M_   IT_1',
    '1985_F_I  T_2',
]
df = pd.DataFrame({'Data': raw_entries})

In [32]:
# Display the raw single-column frame
df

Unnamed: 0,Data
0,1987_M_US _1
1,1990?_M_UK_1
2,1992_F_US_2
3,1970?_M_ IT_1
4,1985_F_I T_2


In [33]:
# Split every string in the specified column on '_'; each row becomes a list of pieces
df['Data'].str.split('_')

0       [1987, M, US , 1]
1       [1990?, M, UK, 1]
2        [1992, F, US, 2]
3    [1970?, M,    IT, 1]
4      [1985, F, I  T, 2]
Name: Data, dtype: object

In [34]:
# expand=True spreads the pieces into separate columns instead of one list per row
df['Data'].str.split('_', expand=True)

Unnamed: 0,0,1,2,3
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [35]:
# Replace df with the expanded frame. The 'Data' column is gone afterwards,
# so re-running this cell requires re-creating df first.
df = df['Data'].str.split('_', expand=True)

In [38]:
# Rename the columns. The list must contain exactly as many names as the
# frame has columns, otherwise the assignment fails.
df.columns = ['Year', 'Sex', 'Country', 'No Children']

In [39]:
# Inspect the frame with its new column names
df

Unnamed: 0,Year,Sex,Country,No Children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2
