<a href="https://colab.research.google.com/github/davidofitaly/notes_03_python_in_data_analysis/blob/main/04_data_cleaning_and_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

##Handling missing data

### Handling Missing Data in Pandas

#####Missing data is common in datasets and can be managed using Pandas.

- **Identifying Missing Data**:  
  - `df.isnull()` – Detects missing values.  
  - `df.notna()` – Detects non-missing values.  

- **Removing Missing Data**:  
  - `df.dropna()` – Removes rows or columns with missing values.  

- **Filling Missing Data**:  
  - `df.fillna(value)` – Replaces missing values with a specified value.  
  - `df.fillna(method='ffill')` – Forward fill.  
  - `df.fillna(method='bfill')` – Backward fill.  



####Examples 4.1



*   ex1



In [None]:
# Creating a Pandas Series with numerical values, including missing values (NaN)
df_ex1 = pd.Series([0, 4.5, 10, 2, np.nan, 3, np.nan, 0.5])

# Display the Series
df_ex1

Unnamed: 0,0
0,0.0
1,4.5
2,10.0
3,2.0
4,
5,3.0
6,
7,0.5


In [None]:
df_ex1.isna() # Check for missing values (NaN) in the Series

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,True
5,False
6,True
7,False




*   ex2



In [None]:
# Creating a Pandas Series with string values, including missing values (NaN and None)
df_ex2 = pd.Series(['a', 'b', np.nan, 'd', None])

# Display the Series
df_ex2

Unnamed: 0,0
0,a
1,b
2,
3,d
4,


In [None]:
df_ex2.isna() # Check for missing values (NaN and None) in the Series

Unnamed: 0,0
0,False
1,False
2,True
3,False
4,True




*   ex3


In [None]:
# Creating a Pandas Series with mixed data types, including missing values (NaN)
df_ex3 = pd.Series([1, 'text', np.nan, 3.14, None, True, np.nan, 'data'])

# Display the Series
df_ex3

Unnamed: 0,0
0,1
1,text
2,
3,3.14
4,
5,True
6,
7,data


In [None]:
df_ex3.dropna() # Remove rows with missing values (NaN

Unnamed: 0,0
0,1
1,text
3,3.14
5,True
7,data


In [None]:
df_ex3[df_ex3.notna()] # Remove rows with missing values (NaN)

Unnamed: 0,0
0,1
1,text
3,3.14
5,True
7,data




*   ex4




In [None]:
# Creating a Pandas DataFrame with mixed data types, including missing values (NaN)
df_ex4 = pd.DataFrame({
    'A': [1, np.nan, np.nan, 4, None],
    'B': ['apple', np.nan, 'banana', 'cherry', 'date'],
    'C': [True, np.nan, np.nan, True, False],
    'D': [3.5, np.nan, 7.1, 8.2, None]
})

# Display the DataFrame
df_ex4

Unnamed: 0,A,B,C,D
0,1.0,apple,True,3.5
1,,,,
2,,banana,,7.1
3,4.0,cherry,True,8.2
4,,date,False,


In [None]:
df_ex4.dropna() # Remove rows with missing values (NaN)

Unnamed: 0,A,B,C,D
0,1.0,apple,True,3.5
3,4.0,cherry,True,8.2


In [None]:
df_ex4.dropna(how='all') # Remove rows where all values are missing

Unnamed: 0,A,B,C,D
0,1.0,apple,True,3.5
2,,banana,,7.1
3,4.0,cherry,True,8.2
4,,date,False,


In [None]:
df_ex4.replace([np.nan, None], 0) # Replace missing values (NaN and None) with 0

Unnamed: 0,A,B,C,D
0,1.0,apple,True,3.5
1,0.0,0,0,0.0
2,0.0,banana,0,7.1
3,4.0,cherry,True,8.2
4,0.0,date,False,0.0




*   ex5


In [None]:
# Creating a DataFrame with numbers and NaN values
df_ex5 = pd.DataFrame({
    'A': [1, 2, np.nan, 4, 5],
    'B': [np.nan, np.nan, np.nan, np.nan, 40],  # Entire column with NaN
    'C': [10, np.nan, 30, 40, 50],
    'D': [np.nan, 20, 30, np.nan, 50]
})

# Display the DataFrame
df_ex5

Unnamed: 0,A,B,C,D
0,1.0,,10.0,
1,2.0,,,20.0
2,,,30.0,30.0
3,4.0,,40.0,
4,5.0,40.0,50.0,50.0


In [None]:
df_ex5.dropna(axis='columns', how='all') # Drops columns where all values are NaN

Unnamed: 0,A,B,C,D
0,1.0,,10.0,
1,2.0,,,20.0
2,,,30.0,30.0
3,4.0,,40.0,
4,5.0,40.0,50.0,50.0


In [None]:
df_ex5.dropna(thresh=4) # Keep only the rows with at least 4 non-NaN values

Unnamed: 0,A,B,C,D
4,5.0,40.0,50.0,50.0


In [None]:
df_ex5.fillna(40) # Replace NaN values with 40

Unnamed: 0,A,B,C,D
0,1.0,40.0,10.0,40.0
1,2.0,40.0,40.0,20.0
2,40.0,40.0,30.0,30.0
3,4.0,40.0,40.0,40.0
4,5.0,40.0,50.0,50.0


In [None]:
df_ex5.fillna({'A': 10, 'B': 20, 'C': 30, 'D':40}) # Fills missing (NaN) values in the specified columns with the given values

Unnamed: 0,A,B,C,D
0,1.0,20.0,10.0,40.0
1,2.0,20.0,30.0,20.0
2,10.0,20.0,30.0,30.0
3,4.0,20.0,40.0,40.0
4,5.0,40.0,50.0,50.0


In [None]:
df_ex5.replace({np.nan: 0}) # Replace NaN values with 0

Unnamed: 0,A,B,C,D
0,1.0,0.0,10.0,0.0
1,2.0,0.0,0.0,20.0
2,0.0,0.0,30.0,30.0
3,4.0,0.0,40.0,0.0
4,5.0,40.0,50.0,50.0




*   ex6



In [None]:
# Creates a DataFrame with 7 rows and 5 columns filled with random values sampled from a standard normal distribution
df_ex6 = pd.DataFrame(np.random.standard_normal((7,5)))

# Display the DataFrame
df_ex6

Unnamed: 0,0,1,2,3,4
0,-0.056739,-0.794713,1.294369,0.911241,-1.426615
1,1.135791,-1.146761,-1.606257,-0.007779,-0.037378
2,1.142149,0.17147,0.517921,-0.902466,-0.866726
3,-1.599306,-0.380107,-0.445923,-0.397514,-0.963549
4,-0.48962,-0.218279,-0.189766,0.950307,-0.485409
5,1.936404,-0.632051,0.392579,-0.523201,-0.893077
6,0.91623,-1.310023,-1.414701,0.776406,0.897667


In [None]:
# Sets specific values in the DataFrame to NaN (missing values)
df_ex6.iloc[:2, 4] = np.nan
df_ex6.iloc[3:, 0] = np.nan

# Display the DataFrame
df_ex6

Unnamed: 0,0,1,2,3,4
0,-0.056739,-0.794713,1.294369,0.911241,
1,1.135791,-1.146761,-1.606257,-0.007779,
2,1.142149,0.17147,0.517921,-0.902466,-0.866726
3,,-0.380107,-0.445923,-0.397514,-0.963549
4,,-0.218279,-0.189766,0.950307,-0.485409
5,,-0.632051,0.392579,-0.523201,-0.893077
6,,-1.310023,-1.414701,0.776406,0.897667


In [None]:
df_ex6.fillna(method='ffill') # Forward fill missing values

  df_ex6.fillna(method='ffill') # Forward fill missing values.


Unnamed: 0,0,1,2,3,4
0,-0.056739,-0.794713,1.294369,0.911241,
1,1.135791,-1.146761,-1.606257,-0.007779,
2,1.142149,0.17147,0.517921,-0.902466,-0.866726
3,1.142149,-0.380107,-0.445923,-0.397514,-0.963549
4,1.142149,-0.218279,-0.189766,0.950307,-0.485409
5,1.142149,-0.632051,0.392579,-0.523201,-0.893077
6,1.142149,-1.310023,-1.414701,0.776406,0.897667


In [None]:
df_ex6.fillna(method='bfill') # Backward fill

  df_ex6.fillna(method='bfill') # Backward fill


Unnamed: 0,0,1,2,3,4
0,-0.056739,-0.794713,1.294369,0.911241,-0.866726
1,1.135791,-1.146761,-1.606257,-0.007779,-0.866726
2,1.142149,0.17147,0.517921,-0.902466,-0.866726
3,,-0.380107,-0.445923,-0.397514,-0.963549
4,,-0.218279,-0.189766,0.950307,-0.485409
5,,-0.632051,0.392579,-0.523201,-0.893077
6,,-1.310023,-1.414701,0.776406,0.897667


In [None]:
# Fill missing values (NaN) in the DataFrame with the mean of each column
df_ex6.fillna(df_ex6.mean())

Unnamed: 0,0,1,2,3,4
0,-0.056739,-0.794713,1.294369,0.911241,-0.462219
1,1.135791,-1.146761,-1.606257,-0.007779,-0.462219
2,1.142149,0.17147,0.517921,-0.902466,-0.866726
3,0.7404,-0.380107,-0.445923,-0.397514,-0.963549
4,0.7404,-0.218279,-0.189766,0.950307,-0.485409
5,0.7404,-0.632051,0.392579,-0.523201,-0.893077
6,0.7404,-1.310023,-1.414701,0.776406,0.897667


###Data transformation

In [None]:
df_ex6.replace(np.nan, 0.5) # Replace NaN values with 0.5

Unnamed: 0,0,1,2,3,4
0,-0.056739,-0.794713,1.294369,0.911241,0.5
1,1.135791,-1.146761,-1.606257,-0.007779,0.5
2,1.142149,0.17147,0.517921,-0.902466,-0.866726
3,0.5,-0.380107,-0.445923,-0.397514,-0.963549
4,0.5,-0.218279,-0.189766,0.950307,-0.485409
5,0.5,-0.632051,0.392579,-0.523201,-0.893077
6,0.5,-1.310023,-1.414701,0.776406,0.897667


###Removing Duplicates

#####In pandas, duplicate values in a DataFrame can be identified and removed using the `duplicated()` and `drop_duplicates()` methods.

- **`duplicated()`**: Returns a boolean Series indicating whether each row is a duplicate (excluding the first occurrence).
- **`drop_duplicates()`**: Removes duplicate rows from the DataFrame. By default, it keeps the first occurrence of each duplicated row.

##### Key Parameters:
- **`subset`**: Specify columns to consider when identifying duplicates.
- **`keep`**:
  - `'first'`: Keep the first occurrence (default).
  - `'last'`: Keep the last occurrence.
  - `False`: Remove all duplicates.

##### Example Usage:
- **Identify duplicates**: `df.duplicated()`
- **Remove duplicates**: `df.drop_duplicates()`


####Examples 4.2



*   ex7



In [None]:
# Create a DataFrame with two columns and some duplicate values
data = {
    'A1': [1, 2, 3, 4, 5, 2, 3],
    'A2': ['a', 'b', 'c', 'd', 'e', 'b', 'c']
}

df_ex7 = pd.DataFrame(data)

# Display the DataFrame
df_ex7


Unnamed: 0,A1,A2
0,1,a
1,2,b
2,3,c
3,4,d
4,5,e
5,2,b
6,3,c


In [None]:
df_ex7.duplicated() # Check for duplicate rows in the DataFrame

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
5,True
6,True


In [None]:
df_ex7.drop_duplicates() # Remove duplicate rows from the DataFrame

Unnamed: 0,A1,A2
0,1,a
1,2,b
2,3,c
3,4,d
4,5,e




* ex8



In [None]:
# Create a DataFrame with three columns
data = {
    'Word': ['five', 'four', 'six', 'five', 'four', 'six', 'five'],
    'Number1': [10, 20, 30, 10, 20, 30, 10],
    'Number2': [100, 200, 300, 100, 200, 300, 100]
}

df_ex8 = pd.DataFrame(data)

# Display the DataFrame
df_ex8


Unnamed: 0,Word,Number1,Number2
0,five,10,100
1,four,20,200
2,six,30,300
3,five,10,100
4,four,20,200
5,six,30,300
6,five,10,100


In [None]:
df_ex8.drop_duplicates(subset=['Word']) # Remove duplicate rows based on the 'Word' column

Unnamed: 0,Word,Number1,Number2
0,five,10,100
1,four,20,200
2,six,30,300


In [None]:
df_ex8.drop_duplicates(subset=['Number1', 'Number2'], keep='last') # Remove duplicate rows based on the 'Number1' column

Unnamed: 0,Word,Number1,Number2
4,four,20,200
5,six,30,300
6,five,10,100


###Transforming Data Using Functions or Mapping



Pandas allows transforming data using custom functions or mappings with the `apply()`, `map()`, and `applymap()` methods.

- **`apply()`**: Applies a function along an axis (rows or columns) of a DataFrame or Series. It can be used to transform or aggregate data.
  - Example: `df['column'].apply(lambda x: x * 2)` applies the function to each element in the specified column.
  
- **`map()`**: Used for element-wise transformations, particularly with Series. It allows mapping each element to a value from a dictionary, Series, or function.
  - Example: `df['column'].map({'A': 1, 'B': 2})` maps values in the column using the provided dictionary.
  
- **`applymap()`**: Applies a function element-wise to an entire DataFrame. It is useful for transforming all values in the DataFrame.
  - Example: `df.applymap(lambda x: x * 2)` multiplies each element in the DataFrame by 2.

##### Key Points:
- **`apply()`**: Works with both Series and DataFrame.
- **`map()`**: Works only with Series.
- **`applymap()`**: Works only with DataFrame (not Series).


####Examples 4.3

In [None]:
# Creating a DataFrame with computer hardware items and their prices
data = {
    'item': ['laptop', 'monitor', 'keyboard', 'mouse', 'headphones'],
    'price': [1200, 250, 80, 25, 100]
}

df_computer_hardware = pd.DataFrame(data)

# Display the DataFrame
df_computer_hardware

Unnamed: 0,item,price
0,laptop,1200
1,monitor,250
2,keyboard,80
3,mouse,25
4,headphones,100


In [None]:
# Creating a dictionary to map products to their companies
product_to_company = {
    'laptop': 'Dell',
    'monitor': 'Samsung',
    'keyboard': 'Logitech',
    'mouse': 'Razer',
    'headphones': 'Sony'
}

# Display the DataFrame
product_to_company

{'laptop': 'Dell',
 'monitor': 'Samsung',
 'keyboard': 'Logitech',
 'mouse': 'Razer',
 'headphones': 'Sony'}

In [None]:
# Maps the values from the 'item' column to the corresponding companies from the 'product_to_company' dictionary
df_computer_hardware['company'] = df_computer_hardware['item'].map(product_to_company)

# Display the DataFrame
df_computer_hardware

Unnamed: 0,item,price,company
0,laptop,1200,Dell
1,monitor,250,Samsung
2,keyboard,80,Logitech
3,mouse,25,Razer
4,headphones,100,Sony


### Discretization and Binning (Bucketization)


#####Discretization is the process of converting continuous data into discrete intervals or bins. This is useful for categorizing continuous values into groups, often to simplify analysis or meet specific requirements.

##### **pd.cut()**

`pd.cut()` is used to segment and sort data values into discrete bins or intervals. It is useful when you want to categorize continuous values into a fixed number of bins.

- **Parameters**:
  - `x`: The data to be binned (usually a series or array).
  - `bins`: The number of bins or an array of bin edges.
  - `right`: Whether bins include the rightmost edge (default is `True`).
  - `labels`: Labels for the resulting bins.

##### **pd.qcut()**

`pd.qcut()` is similar to `pd.cut()`, but instead of defining the bins explicitly, it divides the data into equal-sized quantiles, ensuring that each bin contains approximately the same number of data points.

- **Parameters**:
  - `x`: The data to be binned.
  - `q`: The number of quantiles or bins.
  - `labels`: Labels for the resulting quantiles.
  - `precision`: The precision to use when rounding bin edges.


####Examples 4.4



*   ex1



In [None]:
# Creating a list of heights for 20 individuals
heights = np.random.randint(150, 190, size=20)

# Create a DataFrame
df_heights = pd.DataFrame(heights, columns=["Height"])
df_heights = df_heights.sort_values(by="Height")
df_heights = df_heights.reset_index(drop=True)

# Display the DataFrame
df_heights

Unnamed: 0,Height
0,152
1,153
2,156
3,161
4,165
5,166
6,168
7,168
8,168
9,168


In [None]:
# The code categorizes the 'Height' column from the df_heights DataFrame into specific bins.
heights_categories = pd.cut(df_heights['Height'], bins=[150, 165, 180, 190])

# Display result
heights_categories

Unnamed: 0,Height
0,"(150, 165]"
1,"(150, 165]"
2,"(150, 165]"
3,"(150, 165]"
4,"(150, 165]"
5,"(165, 180]"
6,"(165, 180]"
7,"(165, 180]"
8,"(165, 180]"
9,"(165, 180]"


In [None]:
# The code categorizes the 'Height' column into 3 equal-width bins labeled 'Low', 'Medium', and 'High'
heights_categories = pd.cut(df_heights['Height'], bins=3, labels=['Low', 'Medium', 'High'])

# Display result
heights_categories

Unnamed: 0,Height
0,Low
1,Low
2,Low
3,Low
4,Medium
5,Medium
6,Medium
7,Medium
8,Medium
9,Medium




*   ex2



In [None]:
# Creating a list of heights for 20 individuals
heights = np.random.randint(150, 190, size=20)

# Display result
heights

array([177, 167, 167, 161, 176, 185, 162, 176, 160, 171, 166, 189, 166,
       155, 171, 184, 160, 176, 185, 165])

In [None]:
# The code categorizes the 'heights' values into custom bins
bins = [150, 165, 180, 190]

heights_categories = pd.cut(heights, bins=bins)

# Display result
heights_categories

[(165.0, 180.0], (180.0, 190.0], (165.0, 180.0], NaN, (180.0, 190.0], ..., (180, 190], (150, 165], (165, 180], (165, 180], (150, 165]]
Length: 20
Categories (3, interval[int64, right]): [(150, 165] < (165, 180] < (180, 190]]

In [None]:
# 'heights_categories.categories' displays the bin intervals that were defined for categorizing the height data.
heights_categories.categories

IntervalIndex([(150, 165], (165, 180], (180, 190]], dtype='interval[int64, right]')

In [None]:
# 'heights_categories.codes' shows the integer codes corresponding to the categories assigned to each height value.
heights_categories.codes

array([1, 2, 2, 1, 1, 2, 2, 1, 1, 1, 0, 2, 1, 0, 0, 0, 0, 1, 0, 1],
      dtype=int8)

In [None]:
# 'heights_categories.value_counts()' returns the count of values in each category (bin) of the 'heights_categories' series.
heights_categories.value_counts()

Unnamed: 0,count
"(150, 165]",6
"(165, 180]",10
"(180, 190]",4




*   ex3



In [None]:
# Create an array of 20 random heights between 150 and 190
heights = np.random.randint(150, 190, size=20)

# Define the bins for height categories
bins = [150, 165, 180, 190]

# Define the labels for each bin
labels = ['Short', 'Average', 'Tall']

# Categorize the heights into the defined bins with labels
heights_categories = pd.cut(heights, bins=bins, labels=labels)

# Display the height categories
heights_categories

['Short', 'Average', 'Tall', 'Average', 'Average', ..., 'Average', 'Average', 'Short', 'Average', 'Average']
Length: 20
Categories (3, object): ['Short' < 'Average' < 'Tall']



*  ex4



In [None]:
# Generate an array of 20 random numbers from a standard normal distribution
data = np.random.standard_normal(20)

# Display the generated data
data

array([ 0.90884137, -0.65827838, -0.73192194, -1.01203194, -0.35396679,
        0.13215296,  1.92293834,  0.4417833 , -1.41243866, -0.03497169,
       -0.26140643, -1.78419388,  0.33485956, -2.45263012, -0.24495673,
       -1.01779099,  0.67469098, -0.81149563, -0.50845139, -0.90094009])

In [None]:
# Divide the data into 4 equal-width bins and round the bin edges to 2 decimal places
data_categories = pd.cut(data, 4, precision=2)

# Display the resulting categories
data_categories


[(0.83, 1.92], (-1.36, -0.26], (-1.36, -0.26], (-1.36, -0.26], (-1.36, -0.26], ..., (-1.36, -0.26], (-0.26, 0.83], (-1.36, -0.26], (-1.36, -0.26], (-1.36, -0.26]]
Length: 20
Categories (4, interval[float64, right]): [(-2.46, -1.36] < (-1.36, -0.26] < (-0.26, 0.83] <
                                           (0.83, 1.92]]



*   ex5


In [None]:
# Generate an array of 500 random numbers from a standard normal distribution
data = np.random.standard_normal(500)

# Display the first 10 values and the last 10 values from the generated data
data[:10], data[-10:]


(array([-0.13204241, -1.21562581, -1.67226918,  0.65033791, -0.72098889,
        -0.15254648,  0.12998908, -0.25866841, -0.94004401,  0.85793963]),
 array([ 1.09132439, -1.51092229,  0.26452041, -1.45958656,  0.50090436,
        -1.05473051, -0.18809185,  2.13439433,  0.70444244,  0.47667429]))

In [None]:
# Divide the data into 5 equal-sized quantiles, with precision up to 2 decimal places
quantiles = pd.qcut(data, 5, precision=2)

# Display the quantiles
quantiles


[(-0.22, 0.32], (-3.0599999999999996, -0.83], (-3.0599999999999996, -0.83], (0.32, 0.86], (-0.83, -0.22], ..., (-3.0599999999999996, -0.83], (-0.22, 0.32], (0.86, 3.01], (0.32, 0.86], (0.32, 0.86]]
Length: 500
Categories (5, interval[float64, right]): [(-3.0599999999999996, -0.83] < (-0.83, -0.22] < (-0.22, 0.32] <
                                           (0.32, 0.86] < (0.86, 3.01]]

In [None]:
# Count how many values fall into each quantile
quantiles.value_counts()

Unnamed: 0,count
"(-3.0599999999999996, -0.83]",100
"(-0.83, -0.22]",100
"(-0.22, 0.32]",100
"(0.32, 0.86]",100
"(0.86, 3.01]",100


### Outlier Detection and Filtering in Pandas




#####Outliers are data points that differ significantly from other observations and can distort statistical analyses and models. Below are common methods for detecting and handling outliers using Pandas:

1. **Z-score Method**:
   - The Z-score represents how many standard deviations a data point is away from the mean.
   - A Z-score greater than 3 or less than -3 typically indicates an outlier.

2. **Interquartile Range (IQR) Method**:
   - The IQR is the range between the 25th percentile (Q1) and the 75th percentile (Q3).
   - Outliers are defined as values that fall outside the range of $[Q1 - 1.5 \times IQR, Q3 + 1.5 \times IQR]$.

3. **Pandas `clip()` Method**:
   - The `clip()` function limits values in a DataFrame to a specified range.
   - Any values outside the range are replaced by the nearest boundary.

4. **Conditional Filtering**:
   - This method allows you to filter out outliers based on custom thresholds.
   - Values that exceed a specified threshold can be removed or replaced.



####Examples 4.5

In [None]:
# Create a DataFrame with 1000 rows and 4 columns ('A', 'B', 'C', 'D') filled with random values from a standard normal distribution
df = pd.DataFrame(np.random.standard_normal((1000, 4)), columns =['A', 'B', 'C', 'D'])

# Generate descriptive statistics for the DataFrame, such as count, mean, std, min, 25%, 50%, 75%, max
df.describe()


Unnamed: 0,A,B,C,D
count,1000.0,1000.0,1000.0,1000.0
mean,0.047906,-0.01866,0.009349,-0.022402
std,1.008438,1.011312,1.024456,0.98458
min,-3.424035,-2.499767,-3.546515,-2.977422
25%,-0.558361,-0.71312,-0.647869,-0.685612
50%,0.037775,-0.037558,0.020653,-0.00455
75%,0.716861,0.616328,0.717865,0.631333
max,4.003955,3.663159,3.210987,2.748408




*   Z-score


In [None]:
# Apply Z-score function to all columns
df = df.apply(zscore)

# Filtering rows where any Z-score in the column is greater than 3
df_outliers = df[(df.abs() > 3).any(axis=1)]  # Keep rows with outliers (Z-score > 3)

# Displaying all columns with the outlier values
df_outliers


Unnamed: 0,A,B,C,D
192,1.720137,-3.286746,-0.315657,1.624774
452,-0.179672,-0.366816,-1.053859,3.57725
543,-3.117449,-1.060443,0.097131,0.767447
571,3.41482,-1.078144,-0.377915,-0.98843
586,-1.32751,3.300981,-0.94794,0.109019
662,-2.187964,-1.041919,-0.403505,3.090369
689,-0.838746,0.112426,1.660566,3.009714
942,-0.243911,3.118255,0.10995,-1.0391
952,-0.787894,4.722771,-0.37038,1.313441


In [None]:
# Dropping rows with outliers using their index
df_cleaned = df.drop(df_outliers.index)

# Displaying the cleaned DataFrame
df_cleaned

Unnamed: 0,A,B,C,D
0,-0.944275,-0.557530,0.538362,0.648913
1,-0.629088,-1.580569,-0.589062,-0.348610
2,1.008444,0.020155,-0.239278,0.009866
3,0.585768,-0.291625,-0.079044,-2.021050
4,-0.454586,0.569809,-0.895696,0.533920
...,...,...,...,...
995,0.548682,-0.750385,1.375889,0.317187
996,1.314370,0.685062,-1.561037,-0.284411
997,-0.852938,-0.001205,2.362644,0.164688
998,-0.550651,1.520661,-0.139295,1.175557




*  Interquartile Range (IQR) Method


In [None]:
# Calculating the IQR (Interquartile Range)
Q1 = df.quantile(0.25)  # 25th percentile (Q1)
Q3 = df.quantile(0.75)  # 75th percentile (Q3)
IQR = Q3 - Q1  # Interquartile range (IQR)

# Identifying outliers using the IQR method (values outside 1.5 * IQR from Q1 and Q3)
df_outliers_iqr = df[((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

# Displaying the rows with outliers
df_outliers_iqr

Unnamed: 0,A,B,C,D
57,-1.118218,0.115362,-0.314615,-2.653915
66,-1.585358,0.018277,0.835698,-2.895488
112,-0.040557,-1.263293,-0.202257,-2.831411
149,-1.268101,2.157874,-2.701661,-0.288406
175,1.111288,-0.011938,2.933124,-0.527343
192,1.720137,-3.286746,-0.315657,1.624774
215,-0.656232,-0.638431,-2.990888,0.384048
221,2.881462,0.785899,0.280357,0.163209
252,-0.732792,0.173135,0.557727,2.540374
314,2.89307,0.772861,0.644453,0.031886


In [None]:
# Dropping rows with outliers using their index
df_cleaned = df.drop(df_outliers_iqr.index)

# Displaying the cleaned DataFrame
df_cleaned

Unnamed: 0,A,B,C,D
0,-0.944275,-0.557530,0.538362,0.648913
1,-0.629088,-1.580569,-0.589062,-0.348610
2,1.008444,0.020155,-0.239278,0.009866
3,0.585768,-0.291625,-0.079044,-2.021050
4,-0.454586,0.569809,-0.895696,0.533920
...,...,...,...,...
995,0.548682,-0.750385,1.375889,0.317187
996,1.314370,0.685062,-1.561037,-0.284411
997,-0.852938,-0.001205,2.362644,0.164688
998,-0.550651,1.520661,-0.139295,1.175557
