<a href="https://colab.research.google.com/github/davidofitaly/notes_03_python_in_data_analysis/blob/main/04_data_cleaning_and_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

##Handling missing data

### Handling Missing Data in Pandas

#####Missing data is common in datasets and can be managed using Pandas.

- **Identifying Missing Data**:  
  - `df.isnull()` – Detects missing values.  
  - `df.notna()` – Detects non-missing values.  

- **Removing Missing Data**:  
  - `df.dropna()` – Removes rows or columns with missing values.  

- **Filling Missing Data**:  
  - `df.fillna(value)` – Replaces missing values with a specified value.  
  - `df.fillna(method='ffill')` – Forward fill.  
  - `df.fillna(method='bfill')` – Backward fill.  



####Examples 4.1



*   ex1



In [2]:
# Creating a Pandas Series with numerical values, including missing values (NaN)
df_ex1 = pd.Series([0, 4.5, 10, 2, np.nan, 3, np.nan, 0.5])

# Display the Series
df_ex1

Unnamed: 0,0
0,0.0
1,4.5
2,10.0
3,2.0
4,
5,3.0
6,
7,0.5


In [3]:
df_ex1.isna() # Check for missing values (NaN) in the Series

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,True
5,False
6,True
7,False




*   ex2



In [4]:
# Creating a Pandas Series with string values, including missing values (NaN and None)
df_ex2 = pd.Series(['a', 'b', np.nan, 'd', None])

# Display the Series
df_ex2

Unnamed: 0,0
0,a
1,b
2,
3,d
4,


In [5]:
df_ex2.isna() # Check for missing values (NaN and None) in the Series

Unnamed: 0,0
0,False
1,False
2,True
3,False
4,True




*   ex3


In [6]:
# Creating a Pandas Series with mixed data types, including missing values (NaN)
df_ex3 = pd.Series([1, 'text', np.nan, 3.14, None, True, np.nan, 'data'])

# Display the Series
df_ex3

Unnamed: 0,0
0,1
1,text
2,
3,3.14
4,
5,True
6,
7,data


In [7]:
df_ex3.dropna() # Remove rows with missing values (NaN

Unnamed: 0,0
0,1
1,text
3,3.14
5,True
7,data


In [8]:
df_ex3[df_ex3.notna()] # Remove rows with missing values (NaN)

Unnamed: 0,0
0,1
1,text
3,3.14
5,True
7,data




*   ex4




In [39]:
# Creating a Pandas DataFrame with mixed data types, including missing values (NaN)
df_ex4 = pd.DataFrame({
    'A': [1, np.nan, np.nan, 4, None],
    'B': ['apple', np.nan, 'banana', 'cherry', 'date'],
    'C': [True, np.nan, np.nan, True, False],
    'D': [3.5, np.nan, 7.1, 8.2, None]
})

# Display the DataFrame
df_ex4

Unnamed: 0,A,B,C,D
0,1.0,apple,True,3.5
1,,,,
2,,banana,,7.1
3,4.0,cherry,True,8.2
4,,date,False,


In [40]:
df_ex4.dropna() # Remove rows with missing values (NaN)

Unnamed: 0,A,B,C,D
0,1.0,apple,True,3.5
3,4.0,cherry,True,8.2


In [41]:
df_ex4.dropna(how='all') # Remove rows where all values are missing

Unnamed: 0,A,B,C,D
0,1.0,apple,True,3.5
2,,banana,,7.1
3,4.0,cherry,True,8.2
4,,date,False,




*   ex5


In [55]:
# Creating a DataFrame with numbers and NaN values
df_ex5 = pd.DataFrame({
    'A': [1, 2, np.nan, 4, 5],
    'B': [np.nan, np.nan, np.nan, np.nan, 40],  # Entire column with NaN
    'C': [10, np.nan, 30, 40, 50],
    'D': [np.nan, 20, 30, np.nan, 50]
})

# Display the DataFrame
df_ex5

Unnamed: 0,A,B,C,D
0,1.0,,10.0,
1,2.0,,,20.0
2,,,30.0,30.0
3,4.0,,40.0,
4,5.0,40.0,50.0,50.0


In [56]:
df_ex5.dropna(axis='columns', how='all') # Drops columns where all values are NaN.

Unnamed: 0,A,B,C,D
0,1.0,,10.0,
1,2.0,,,20.0
2,,,30.0,30.0
3,4.0,,40.0,
4,5.0,40.0,50.0,50.0


In [60]:
df_ex5.dropna(thresh=4) # Keep only the rows with at least 4 non-NaN values.

Unnamed: 0,A,B,C,D
4,5.0,40.0,50.0,50.0


In [61]:
df_ex5.fillna(40) # Replace NaN values with 40.

Unnamed: 0,A,B,C,D
0,1.0,40.0,10.0,40.0
1,2.0,40.0,40.0,20.0
2,40.0,40.0,30.0,30.0
3,4.0,40.0,40.0,40.0
4,5.0,40.0,50.0,50.0


In [63]:
df_ex5.fillna({'A': 10, 'B': 20, 'C': 30, 'D':40}) # Fills missing (NaN) values in the specified columns with the given values.

Unnamed: 0,A,B,C,D
0,1.0,20.0,10.0,40.0
1,2.0,20.0,30.0,20.0
2,10.0,20.0,30.0,30.0
3,4.0,20.0,40.0,40.0
4,5.0,40.0,50.0,50.0




*   ex6



In [84]:
# Creates a DataFrame with 7 rows and 5 columns filled with random values sampled from a standard normal distribution.
df_ex6 = pd.DataFrame(np.random.standard_normal((7,5)))

# Display the DataFrame
df_ex6

Unnamed: 0,0,1,2,3,4
0,-0.056739,-0.794713,1.294369,0.911241,-1.426615
1,1.135791,-1.146761,-1.606257,-0.007779,-0.037378
2,1.142149,0.17147,0.517921,-0.902466,-0.866726
3,-1.599306,-0.380107,-0.445923,-0.397514,-0.963549
4,-0.48962,-0.218279,-0.189766,0.950307,-0.485409
5,1.936404,-0.632051,0.392579,-0.523201,-0.893077
6,0.91623,-1.310023,-1.414701,0.776406,0.897667


In [90]:
# Sets specific values in the DataFrame to NaN (missing values)
df_ex6.iloc[:2, 4] = np.nan
df_ex6.iloc[3:, 0] = np.nan

# Display the DataFrame
df_ex6

Unnamed: 0,0,1,2,3,4
0,-0.056739,-0.794713,1.294369,0.911241,
1,1.135791,-1.146761,-1.606257,-0.007779,
2,1.142149,0.17147,0.517921,-0.902466,-0.866726
3,,-0.380107,-0.445923,-0.397514,-0.963549
4,,-0.218279,-0.189766,0.950307,-0.485409
5,,-0.632051,0.392579,-0.523201,-0.893077
6,,-1.310023,-1.414701,0.776406,0.897667


In [93]:
df_ex6.fillna(method='ffill') # Forward fill missing values.

  df_ex6.fillna(method='ffill') # Forward fill missing values.


Unnamed: 0,0,1,2,3,4
0,-0.056739,-0.794713,1.294369,0.911241,
1,1.135791,-1.146761,-1.606257,-0.007779,
2,1.142149,0.17147,0.517921,-0.902466,-0.866726
3,1.142149,-0.380107,-0.445923,-0.397514,-0.963549
4,1.142149,-0.218279,-0.189766,0.950307,-0.485409
5,1.142149,-0.632051,0.392579,-0.523201,-0.893077
6,1.142149,-1.310023,-1.414701,0.776406,0.897667


In [94]:
df_ex6.fillna(method='bfill') # Backward fill

  df_ex6.fillna(method='bfill') # Backward fill


Unnamed: 0,0,1,2,3,4
0,-0.056739,-0.794713,1.294369,0.911241,-0.866726
1,1.135791,-1.146761,-1.606257,-0.007779,-0.866726
2,1.142149,0.17147,0.517921,-0.902466,-0.866726
3,,-0.380107,-0.445923,-0.397514,-0.963549
4,,-0.218279,-0.189766,0.950307,-0.485409
5,,-0.632051,0.392579,-0.523201,-0.893077
6,,-1.310023,-1.414701,0.776406,0.897667


In [95]:
# Fill missing values (NaN) in the DataFrame with the mean of each column.
df_ex6.fillna(df_ex6.mean())

Unnamed: 0,0,1,2,3,4
0,-0.056739,-0.794713,1.294369,0.911241,-0.462219
1,1.135791,-1.146761,-1.606257,-0.007779,-0.462219
2,1.142149,0.17147,0.517921,-0.902466,-0.866726
3,0.7404,-0.380107,-0.445923,-0.397514,-0.963549
4,0.7404,-0.218279,-0.189766,0.950307,-0.485409
5,0.7404,-0.632051,0.392579,-0.523201,-0.893077
6,0.7404,-1.310023,-1.414701,0.776406,0.897667
