# Part 1: Importing Required Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Part 2: Handling Missing Values
We'll demonstrate handling missing values by imputing each one with the mean of its column.

In [2]:
# Small two-column sample frame; each column contains exactly one NaN
data = dict(
    A=[1, 2, np.nan, 4],
    B=[np.nan, 2, 3, 4],
)
df = pd.DataFrame(data)

In [3]:
# Show the raw frame, heading first, so the NaN positions are visible
print("Original DataFrame with Missing Values:", df, sep="\n")

Original DataFrame with Missing Values:
     A    B
0  1.0  NaN
1  2.0  2.0
2  NaN  3.0
3  4.0  4.0


In [4]:
# Impute missing values using SimpleImputer.
# strategy='mean' fills each NaN with the mean of the non-missing values in
# the same column.
imputer = SimpleImputer(strategy='mean')
# fit_transform hands back a bare array, so the labels have to be re-attached.
# Passing index=df.index keeps the imputed rows aligned with df; without it
# the result is renumbered 0..n-1, which silently breaks alignment whenever
# df carries a non-default index (e.g. after filtering rows).
# NOTE(review): by default SimpleImputer drops a column that is entirely NaN,
# in which case columns=df.columns would no longer match — confirm inputs
# never contain such a column.
df_imputed = pd.DataFrame(
    imputer.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

In [5]:
# Show the frame once the gaps have been filled
print("\nDataFrame after Imputation (Missing Values Filled):", df_imputed, sep="\n")


DataFrame after Imputation (Missing Values Filled):
          A    B
0  1.000000  3.0
1  2.000000  2.0
2  2.333333  3.0
3  4.000000  4.0


# Part 3: Removing Duplicate Rows

In [6]:
# Sample people table in which the ('Alice', 25) row appears twice
data_with_duplicates = dict(
    Name=['Alice', 'Bob', 'Alice', 'Charlie'],
    Age=[25, 30, 25, 35],
)
df_duplicates = pd.DataFrame(data_with_duplicates)

In [7]:
# Show the table while it still contains the repeated row
print("\nOriginal DataFrame with Duplicates:", df_duplicates, sep="\n")


Original DataFrame with Duplicates:
      Name  Age
0    Alice   25
1      Bob   30
2    Alice   25
3  Charlie   35


In [8]:
# Keep only the first row seen for each 'Name'; later rows with a repeated
# name are dropped even if their other columns differ. The original row
# labels are retained (no index reset).
df_no_duplicates = df_duplicates[~df_duplicates.duplicated(subset='Name', keep='first')]

In [9]:
# Show the de-duplicated table
print("\nDataFrame After Removing Duplicates:", df_no_duplicates, sep="\n")


DataFrame After Removing Duplicates:
      Name  Age
0    Alice   25
1      Bob   30
3  Charlie   35


# Part 4: Converting Data Types

In [10]:
# One-column frame holding the first day of Jan-Mar 2020 as plain strings
data_with_dates = dict(Date=[f'2020-0{month}-01' for month in (1, 2, 3)])
df_dates = pd.DataFrame(data_with_dates)

In [11]:
# Show the frame while 'Date' still holds strings
print("\nOriginal DataFrame with Date Strings:", df_dates, sep="\n")


Original DataFrame with Date Strings:
         Date
0  2020-01-01
1  2020-02-01
2  2020-03-01


In [12]:
# Parse the 'Date' strings into pandas datetime values and write them back
# over the same column (df_dates itself is modified).
df_dates['Date'] = df_dates['Date'].pipe(pd.to_datetime)

In [13]:
# Show the frame now that 'Date' has been parsed
print("\nDataFrame After Converting to Datetime:", df_dates, sep="\n")


DataFrame After Converting to Datetime:
        Date
0 2020-01-01
1 2020-02-01
2 2020-03-01


# Part 5: Handling Outliers Using IQR

In [14]:
# Scores clustered between 50 and 80 plus one extreme value (200)
data_with_outliers = dict(Score=[50, 60, 70, 80, 200])
df_outliers = pd.DataFrame(data_with_outliers)

In [15]:
# First and third quartiles of 'Score'; their difference is the
# interquartile range (IQR).
Q1, Q3 = (df_outliers['Score'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

In [16]:
# Keep rows whose score lies within [Q1 - 1.5*IQR, Q3 + 1.5*IQR];
# both fences are inclusive. Row labels of the kept rows are unchanged.
df_no_outliers = df_outliers[
    df_outliers['Score'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
]

In [17]:
# Show the rows that survived the IQR filter
print("\nDataFrame After Removing Outliers (IQR-based):", df_no_outliers, sep="\n")


DataFrame After Removing Outliers (IQR-based):
   Score
0     50
1     60
2     70
3     80


# Part 6: One-Hot Encoding Categorical Variables

In [18]:
# Single categorical column with three distinct labels (A, B, C)
data_with_categories = dict(Category=['A', 'B', 'A', 'C', 'B'])
df_categories = pd.DataFrame(data_with_categories)

In [None]:
# Replace 'Category' with one indicator column per distinct label
# (named Category_<label>).
df_encoded = pd.get_dummies(
    data=df_categories,
    columns=['Category'],
)

In [21]:
# Show the indicator columns produced by the encoding
print("\nDataFrame After One-Hot Encoding:", df_encoded, sep="\n")


DataFrame After One-Hot Encoding:
   Category_A  Category_B  Category_C
0        True       False       False
1       False        True       False
2        True       False       False
3       False       False        True
4       False        True       False


# Part 7: Normalizing and Standardizing Data

In [22]:
# Two evenly spaced features on different scales (tens vs. hundreds)
data_to_scale = dict(
    Feature1=list(range(10, 60, 10)),
    Feature2=list(range(100, 600, 100)),
)
df_scale = pd.DataFrame(data_to_scale)

In [23]:
# Normalizing the data using Min-Max Scaling: with the default settings each
# column is rescaled linearly so its minimum maps to 0 and its maximum to 1.
scaler = MinMaxScaler()
# fit_transform returns a bare array, so column labels and the row index are
# re-attached here. index=df_scale.index keeps the scaled rows aligned with
# df_scale; omitting it would renumber rows 0..n-1 and break alignment for
# any input that carries a non-default index.
df_normalized = pd.DataFrame(
    scaler.fit_transform(df_scale),
    columns=df_scale.columns,
    index=df_scale.index,
)

In [24]:
# Show the min-max scaled features
print("\nDataFrame After Normalization (Min-Max Scaling):", df_normalized, sep="\n")


DataFrame After Normalization (Min-Max Scaling):
   Feature1  Feature2
0      0.00      0.00
1      0.25      0.25
2      0.50      0.50
3      0.75      0.75
4      1.00      1.00


In [25]:
# Standardizing the data using Z-score Scaling: each column is centred on its
# mean and divided by its standard deviation.
scaler_standard = StandardScaler()
# fit_transform returns a bare array, so column labels and the row index are
# re-attached here. index=df_scale.index keeps the standardized rows aligned
# with df_scale; omitting it would renumber rows 0..n-1 and break alignment
# for any input that carries a non-default index.
df_standardized = pd.DataFrame(
    scaler_standard.fit_transform(df_scale),
    columns=df_scale.columns,
    index=df_scale.index,
)

In [26]:
# Show the z-scored features
print("\nDataFrame After Standardization (Z-score Scaling):", df_standardized, sep="\n")


DataFrame After Standardization (Z-score Scaling):
   Feature1  Feature2
0 -1.414214 -1.414214
1 -0.707107 -0.707107
2  0.000000  0.000000
3  0.707107  0.707107
4  1.414214  1.414214


# Part 8: Dropping Irrelevant Columns

In [27]:
# Three-person table with Name, Age and Salary columns
data_with_irrelevant_columns = dict(
    Name=['Alice', 'Bob', 'Charlie'],
    Age=[25, 30, 35],
    Salary=[50000, 60000, 70000],
)
df_irrelevant = pd.DataFrame(data_with_irrelevant_columns)

In [28]:
# Show the table with all three columns present
print("\nOriginal DataFrame with Irrelevant Columns:", df_irrelevant, sep="\n")


Original DataFrame with Irrelevant Columns:
      Name  Age  Salary
0    Alice   25   50000
1      Bob   30   60000
2  Charlie   35   70000


In [None]:
# Build a copy of the table without 'Salary'; df_irrelevant is left untouched
df_cleaned = df_irrelevant.drop('Salary', axis='columns')

In [30]:
# Show the table after 'Salary' has been removed
print("\nDataFrame After Dropping Irrelevant Columns:", df_cleaned, sep="\n")


DataFrame After Dropping Irrelevant Columns:
      Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35
