In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf

In [None]:
# Q1
# Handling Missing Values
# Fill missing values in the age column with the mean value of that column

student_data = pd.read_csv('../datasets/student_data - Sheet1.csv')
print('Original data:')
print(student_data)

# Assign the filled Series back to the column rather than calling
# fillna(inplace=True) on student_data['age']: the chained in-place form acts
# on an intermediate Series, raises a FutureWarning on pandas 2.x and leaves
# the frame untouched once Copy-on-Write is active.
age_mean = student_data['age'].mean()
student_data['age'] = student_data['age'].fillna(age_mean)
print('\nResult:')
print(student_data)

In [None]:
# Q2
# Removing Duplicate Rows
# Write a python script to remove any duplicate row from student_data

# Compare entire rows: limiting the check to the name column would also throw
# away distinct records that merely share a name. The first occurrence of each
# duplicated row is kept (drop_duplicates default).
student_data = student_data.drop_duplicates()
print(student_data)

In [None]:
# Q3
# Converting Data Types
# Turn the price column of the sales csv from string into float.

sales_data = pd.read_csv('../datasets/sales - Sheet1.csv')
print('Original:')
print(sales_data)

# Remove every ' USD' occurrence from the price values (regex replacement),
# then cast what is left to float.
price_without_unit = sales_data['price'].replace(to_replace=' USD', value='', regex=True)
sales_data['price'] = price_without_unit.astype(float)
print('\nResult:')
print(sales_data)


In [None]:
# Q4
# Renaming Columns
# Rename the employee csv columns "name", "age" and "salary" to
# "employee_name", "employee_age" and "employee_salary".

employee_data = pd.read_csv('../datasets/employee_data - Sheet1.csv')
print("Original:")
print(employee_data)

# Old header -> new header.
column_renames = {
    'name': 'employee_name',
    'age': 'employee_age',
    'salary': 'employee_salary',
}
employee_data = employee_data.rename(columns=column_renames)
print('\nResult:')
print(employee_data)

In [None]:
# Q5
# Label Encoding
# Replace the text labels in the status column of employee_data with numeric
# codes: Active -> 1, Inactive -> 0

# Any label missing from this lookup comes out of Series.map as NaN.
status_codes = {'Active': 1, 'Inactive': 0}
encoded_status = employee_data['status'].map(status_codes)
employee_data['status'] = encoded_status
print(employee_data)


In [None]:
# Q6
# Dropping Columns
# Remove the address column from student_data

columns_to_drop = ['address']
student_data = student_data.drop(columns=columns_to_drop)
print(student_data)

In [None]:
# Q7
# Correlation Heatmap Generation for the Titanic Dataset
# Load the Dataset:
# Handle Missing Values:
# Fill missing values in the age column with the median.
# Fill missing values in the embarked column with the mode.
# Use pandas to generate the correlation matrix for the numerical columns in the dataset.
# Visualize the correlation matrix using a heatmap from seaborn.

titanic_df = sns.load_dataset('titanic')
print(titanic_df)

# Write the filled columns back by assignment. Calling fillna(inplace=True) on
# titanic_df[col] operates on an intermediate Series: it raises a FutureWarning
# on pandas 2.x and leaves the frame untouched once Copy-on-Write is active.
titanic_df['age'] = titanic_df['age'].fillna(titanic_df['age'].median())
# mode() returns a Series (several values can tie); [0] takes its first entry.
titanic_df['embarked'] = titanic_df['embarked'].fillna(titanic_df['embarked'].mode()[0])

# Only columns with a numeric dtype take part in the correlation matrix.
numeric_cols = titanic_df.select_dtypes(include=["number"])
corr_matrix = numeric_cols.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Titanic: correlation of numeric columns")
plt.show()



In [None]:
# Q8
# Add a Normalized Salary Column for employee data
# Min-max scale the employee_salary column of employee_data with
# Normalized Salary = (salary - Min Salary) / (Max Salary - Min Salary)
# and store the result in a new normalized_salary column.

salary_column = employee_data['employee_salary']
lowest_salary = salary_column.min()
highest_salary = salary_column.max()
employee_data['normalized_salary'] = (salary_column - lowest_salary) / (highest_salary - lowest_salary)
print(employee_data)

In [None]:
# Q9
# Handling Outliers Using Standard Deviation
# Keep only the employee_data rows whose salary lies within three standard
# deviations of the mean salary (both bounds included).

salaries = employee_data['employee_salary']
mean_salary = salaries.mean()
std_salary = salaries.std()
lower_bound = mean_salary - 3 * std_salary
upper_bound = mean_salary + 3 * std_salary

# Series.between includes both end points by default, i.e. >= lower and <= upper.
within_bounds = salaries.between(lower_bound, upper_bound)
employee_data_no_outliers = employee_data[within_bounds]

print(employee_data_no_outliers)

In [None]:
# Q10
# Linear Regression
# Import the iris dataset and perform linear regression to model the
# relationship between petal length and petal width for the versicolor species.

iris = sns.load_dataset('iris')

versicolor = iris[iris['species'] == 'versicolor']
model = smf.ols(formula="petal_width ~ petal_length", data=versicolor).fit()
# summary is a method and has to be called; printing the attribute itself only
# shows the bound-method repr instead of the regression table.
print(model.summary())

plt.scatter(
    versicolor["petal_length"],
    versicolor["petal_width"],
    alpha=0.7,
    label="Data points"
)

# Fitted values evaluated at the observed petal lengths.
x_vals = versicolor["petal_length"]
y_vals = model.predict(versicolor)

plt.plot(x_vals, y_vals, color="red", label="Regression line")

plt.xlabel("Petal Length")
plt.ylabel("Petal Width")
plt.title("Linear Regression: Petal Length vs Petal Width (Versicolor)")
plt.legend()
plt.show()
