In [1]:
#### Q1. What is the difference between Ordinal Encoding and Label Encoding? Provide an example of when youmight choose one over the other.

# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

# Example dataset
data = {
    'Color': ['Red', 'Green', 'Blue', 'Green', 'Blue'],
    'Size': ['Small', 'Medium', 'Large', 'Medium', 'Small']
}

# Create DataFrame
df = pd.DataFrame(data)

# Label Encoding for 'Color'
label_encoder = LabelEncoder()
df['Color_Label_Encoded'] = label_encoder.fit_transform(df['Color'])

# Ordinal Encoding for 'Size'
size_mapping = {'Small': 1, 'Medium': 2, 'Large': 3}
ordinal_encoder = OrdinalEncoder(categories=[['Small', 'Medium', 'Large']])
df['Size_Ordinal_Encoded'] = ordinal_encoder.fit_transform(df[['Size']])

# Display the DataFrame
df



Unnamed: 0,Color,Size,Color_Label_Encoded,Size_Ordinal_Encoded
0,Red,Small,2,0.0
1,Green,Medium,1,1.0
2,Blue,Large,0,2.0
3,Green,Medium,1,1.0
4,Blue,Small,0,0.0


In [2]:
#### Q2. Explain how Target Guided Ordinal Encoding works and provide an example of when you might use it in a machine learning project.

import pandas as pd

# Example dataset
data = {
    'Neighborhood': ['A', 'B', 'C', 'A', 'B', 'C', 'A', 'C'],
    'House_Price': [200000, 250000, 300000, 210000, 260000, 310000, 220000, 320000]
}

# Create DataFrame
df = pd.DataFrame(data)

# Calculate the mean house price for each neighborhood
mean_price = df.groupby('Neighborhood')['House_Price'].mean()

# Sort the neighborhoods by mean price
sorted_neighborhoods = mean_price.sort_values().index

# Map each neighborhood to an ordinal value
ordinal_mapping = {neighborhood: i for i, neighborhood in enumerate(sorted_neighborhoods, 1)}

# Apply the mapping to the 'Neighborhood' feature
df['Neighborhood_Encoded'] = df['Neighborhood'].map(ordinal_mapping)

# Display the DataFrame
df



Unnamed: 0,Neighborhood,House_Price,Neighborhood_Encoded
0,A,200000,1
1,B,250000,2
2,C,300000,3
3,A,210000,1
4,B,260000,2
5,C,310000,3
6,A,220000,1
7,C,320000,3


In [13]:
#### Q3. Define covariance and explain why it is important in statistical analysis. How is covariance calculated?

import numpy as np
import pandas as pd

# Example dataset
data = {
    'X': [2, 4, 6, 8, 10],
    'Y': [1, 3, 5, 7, 9]
}

# Create DataFrame
df = pd.DataFrame(data)

# Calculate the mean of X and Y
mean_X = np.mean(df['X'])
mean_Y = np.mean(df['Y'])

# Calculate the covariance manually
covariance_manual = np.sum((df['X'] - mean_X) * (df['Y'] - mean_Y)) / len(df)

# Calculate the covariance using numpy
covariance_np = np.cov(df['X'], df['Y'])[0, 1]

# Display the results
print(f"Covariance (manual calculation): {covariance_manual}")
print(f"Covariance (numpy calculation): {covariance_np}")


Covariance (manual calculation): 8.0
Covariance (numpy calculation): 10.0


In [14]:
#### Q4. For a dataset with the following categorical variables: Color (red, green, blue), Size (small, medium,large), and Material (wood, metal, plastic), perform label encoding using Python's scikit-learn library.Show your code and explain the output.

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Example dataset
data = {
    'Color': ['red', 'green', 'blue', 'green', 'red'],
    'Size': ['small', 'medium', 'large', 'small', 'large'],
    'Material': ['wood', 'metal', 'plastic', 'metal', 'wood']
}

# Create DataFrame
df = pd.DataFrame(data)

# Display the original DataFrame
print("Original DataFrame:")
print(df)




Original DataFrame:
   Color    Size Material
0    red   small     wood
1  green  medium    metal
2   blue   large  plastic
3  green   small    metal
4    red   large     wood


In [15]:
### Q5. Calculate the covariance matrix for the following variables in a dataset: Age, Income, and Education level. Interpret the results.

import numpy as np
import pandas as pd

# Example dataset
data = {
    'Age': [25, 32, 45, 29, 40],
    'Income': [50000, 60000, 80000, 55000, 75000],
    'Education Level': [2, 3, 4, 2, 3]  # 1: High School, 2: Bachelor, 3: Master, 4: PhD
}

# Create DataFrame
df = pd.DataFrame(data)

# Calculate the covariance matrix
cov_matrix = df.cov()

# Display the covariance matrix
cov_matrix


Unnamed: 0,Age,Income,Education Level
Age,66.7,105250.0,6.3
Income,105250.0,167500000.0,9750.0
Education Level,6.3,9750.0,0.7


In [16]:
#### Q6. You are working on a machine learning project with a dataset containing several categorical variables, including "Gender" (Male/Female), "Education Level" (High School/Bachelor's/Master's/PhD),and "Employment Status" (Unemployed/Part-Time/Full-Time). Which encoding method would you use for each variable, and why?

import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Example dataset
data = {
    'Gender': ['Male', 'Female', 'Female', 'Male', 'Female'],
    'Education Level': ['Bachelor\'s', 'Master\'s', 'PhD', 'High School', 'Master\'s'],
    'Employment Status': ['Full-Time', 'Part-Time', 'Full-Time', 'Unemployed', 'Part-Time']
}

# Create DataFrame
df = pd.DataFrame(data)

# Initialize OneHotEncoder for Gender and Employment Status
one_hot_encoder = OneHotEncoder(sparse=False, drop='first')

# Fit and transform the categorical columns
gender_encoded = one_hot_encoder.fit_transform(df[['Gender']])
employment_status_encoded = one_hot_encoder.fit_transform(df[['Employment Status']])

# Initialize OrdinalEncoder for Education Level
ordinal_encoder = OrdinalEncoder(categories=[['High School', 'Bachelor\'s', 'Master\'s', 'PhD']])

# Fit and transform the Education Level column
education_level_encoded = ordinal_encoder.fit_transform(df[['Education Level']])

# Create DataFrame for the encoded features
encoded_df = pd.DataFrame(gender_encoded, columns=['Gender_Female'])
encoded_df = encoded_df.join(pd.DataFrame(employment_status_encoded, columns=['Employment Status_Part-Time', 'Employment Status_Full-Time']))
encoded_df['Education Level'] = education_level_encoded

# Display the encoded DataFrame
encoded_df







Unnamed: 0,Gender_Female,Employment Status_Part-Time,Employment Status_Full-Time,Education Level
0,1.0,0.0,0.0,1.0
1,0.0,1.0,0.0,2.0
2,0.0,0.0,0.0,3.0
3,1.0,0.0,1.0,0.0
4,0.0,1.0,0.0,2.0


In [19]:
### Q7. You are analyzing a dataset with two continuous variables, "Temperature" and "Humidity", and two categorical variables, "Weather Condition" (Sunny/Cloudy/Rainy) and "Wind Direction" (North/South/ East/West). Calculate the covariance between each pair of variables and interpret the results.

import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# Example dataset
data = {
    'Temperature': [30, 22, 25, 28, 35],
    'Humidity': [80, 70, 75, 85, 60],
    'Weather Condition': ['Sunny', 'Cloudy', 'Rainy', 'Sunny', 'Cloudy'],
    'Wind Direction': ['North', 'South', 'East', 'West', 'North']
}

# Create DataFrame
df = pd.DataFrame(data)

# Initialize OneHotEncoder for Weather Condition and Wind Direction






ValueError: input_features is not equal to feature_names_in_