## 1. Load the dataset and display the first 5 rows.

In [None]:
import pandas as pd

# Read the salary data from CSV into the notebook-wide DataFrame
df = pd.read_csv(filepath_or_buffer='salary_dataset.csv')
# Preview the first five rows (5 is head()'s default)
df.head(n=5)

## 2. Display the summary statistics of the dataset.

In [None]:
# Summary statistics; the quartiles listed here are describe()'s defaults
df.describe(percentiles=[0.25, 0.5, 0.75])

## 3. Check for missing values in the dataset.

In [None]:
# Count missing entries per column (isna is the same check as isnull)
df.isna().sum(axis=0)

## 4. Drop rows with any missing values.

In [None]:
# Remove, in place, every row that has at least one missing value
df.dropna(axis=0, how='any', inplace=True)
df.head()

## 5. Fill missing values with the mean for numerical columns.

In [None]:
# Compute means over numeric columns only: on recent pandas, df.mean() raises
# TypeError when the frame also contains text columns.
numeric_means = df.mean(numeric_only=True)
# fillna with a Series fills each column from the entry with the matching label,
# so non-numeric columns are left untouched.
df.fillna(numeric_means, inplace=True)
df.head()

## 6. Fill missing values with the mode for categorical columns.

In [None]:
# Fill gaps in the non-numeric columns only, using each column's most frequent value.
categorical_cols = df.select_dtypes(exclude='number').columns
column_modes = df[categorical_cols].mode()
# mode() yields an empty frame when there is nothing to summarise; indexing its
# first row would raise IndexError, so skip the fill in that case.
if not column_modes.empty:
    df[categorical_cols] = df[categorical_cols].fillna(column_modes.iloc[0])
df.head()

## 7. Create a new column 'Salary_Range' by categorizing 'Base_Salary' into Low, Medium, High.

In [None]:
# Bin edges: (0..60000] -> Low, (60000..100000] -> Medium, above 100000 -> High.
# The top edge is open-ended so salaries above 150000 are labelled 'High'
# instead of silently becoming NaN.
bins = [0, 60000, 100000, float('inf')]
labels = ['Low', 'Medium', 'High']
# include_lowest=True keeps a salary of exactly 0 inside the 'Low' bin.
df['Salary_Range'] = pd.cut(df['Base_Salary'], bins=bins, labels=labels, include_lowest=True)
df.head()

## 8. Calculate the correlation matrix for numerical columns.

In [None]:
# Restrict to numeric columns: on recent pandas, corr() raises when the frame
# contains text or categorical columns.
df.corr(numeric_only=True)

## 9. Plot a histogram for the 'Total_Compensation' column.

In [None]:
# Histogram of total compensation using 20 bins
df.loc[:, 'Total_Compensation'].hist(bins=20)

## 10. Plot a bar chart for the 'Department' column.

In [None]:
# Number of rows per department, drawn as a bar chart
department_counts = df['Department'].value_counts()
department_counts.plot.bar()

## 11. Plot a scatter plot between 'Years_of_Experience' and 'Base_Salary'.

In [None]:
# Scatter plot: experience on the x-axis, base salary on the y-axis
df.plot(kind='scatter', x='Years_of_Experience', y='Base_Salary')

## 12. Encode the 'Education_Level' column using one-hot encoding.

In [None]:
# Replace 'Education_Level' with one indicator column per distinct value
columns_to_encode = ['Education_Level']
df = pd.get_dummies(data=df, columns=columns_to_encode)
df.head()

## 13. Normalize the 'Base_Salary' column using Min-Max scaling.

In [None]:
from sklearn.preprocessing import MinMaxScaler

# (0, 1) is MinMaxScaler's default output range, spelled out for clarity
scaler = MinMaxScaler(feature_range=(0, 1))
# The scaler expects a 2-D input, hence the double brackets
salary_frame = df[['Base_Salary']]
df['Base_Salary'] = scaler.fit_transform(salary_frame)
df.head()

## 14. Standardize the 'Performance_Rating' column.

In [None]:
from sklearn.preprocessing import StandardScaler

# Centering and unit-variance scaling are both StandardScaler defaults
scaler = StandardScaler(with_mean=True, with_std=True)
# The scaler expects a 2-D input, hence the double brackets
rating_frame = df[['Performance_Rating']]
df['Performance_Rating'] = scaler.fit_transform(rating_frame)
df.head()

## 15. Create a pivot table showing the average 'Base_Salary' for each 'Role'.

In [None]:
# Mean 'Base_Salary' per 'Role'
pd.pivot_table(df, values='Base_Salary', index='Role', aggfunc='mean')

## 16. Group the data by 'Gender' and calculate the mean 'Total_Compensation'.

In [None]:
# Mean total compensation within each gender group
df.groupby(by='Gender')['Total_Compensation'].agg('mean')

## 17. Filter the dataset for employees with 'Total_Compensation' greater than 100000.

In [None]:
# Keep only the rows whose total compensation is strictly above 100000
high_compensation_mask = df['Total_Compensation'].gt(100000)
df_filtered = df.loc[high_compensation_mask]
df_filtered.head()

## 18. Sort the dataset by 'Years_of_Experience' in descending order.

In [None]:
# Order rows from most to least experienced and preview the top of the result
by_experience = df.sort_values(by='Years_of_Experience', ascending=False)
by_experience.head()

## 19. Create a new column 'Bonus_Percentage' by dividing 'Bonus' by 'Base_Salary'.

In [None]:
# 'Base_Salary' was min-max scaled in an earlier cell, so the lowest salary is
# exactly 0 and dividing by it would produce inf. Treat zero denominators as
# missing so the ratio is NaN there instead.
# NOTE(review): because the denominator is the scaled salary, this ratio is not a
# true bonus percentage -- confirm whether the unscaled salary should be used.
safe_base_salary = df['Base_Salary'].replace(0, float('nan'))
df['Bonus_Percentage'] = df['Bonus'] / safe_base_salary
df.head()

## 20. Replace all instances of 'Other' in the 'Gender' column with 'Non-binary'.

In [None]:
# Assign the result back to the column. Calling replace(..., inplace=True) on
# df['Gender'] operates on an intermediate object, which pandas warns about and
# which does not update df at all when Copy-on-Write is enabled.
df['Gender'] = df['Gender'].replace('Other', 'Non-binary')
df.head()

## 21. Rename the column 'Base_Salary' to 'Annual_Salary'.

In [None]:
# Old column name -> new column name
column_renames = {'Base_Salary': 'Annual_Salary'}
df.rename(columns=column_renames, inplace=True)
df.head()

## 22. Drop the 'City' column from the dataset.

In [None]:
# Remove the 'City' column in place (axis=1 selects columns)
df.drop(labels=['City'], axis=1, inplace=True)
df.head()

## 23. Check for duplicate rows in the dataset.

In [None]:
# Count rows that repeat an earlier row exactly (the first occurrence is not counted)
df.duplicated(keep='first').sum()

## 24. Drop any duplicate rows found.

In [None]:
# Remove repeated rows in place, keeping the first occurrence of each
df.drop_duplicates(keep='first', inplace=True)
df.head()

## 25. Create a new DataFrame containing only 'Employee_ID', 'Name', and 'Total_Compensation'.

In [None]:
# Columns carried into the reduced DataFrame
subset_columns = ['Employee_ID', 'Name', 'Total_Compensation']
df_subset = df[subset_columns]
df_subset.head()

## 26. Merge the new DataFrame with the original dataset on 'Employee_ID'.

In [None]:
# Columns present on both sides (other than the join key) would come out of the
# merge duplicated as '<name>_x' / '<name>_y'. Drop them from the right-hand
# frame so each column appears once in the result.
overlapping_columns = [
    col for col in df_subset.columns
    if col != 'Employee_ID' and col in df.columns
]
df_merged = pd.merge(df_subset, df.drop(columns=overlapping_columns), on='Employee_ID')
df_merged.head()

## 27. Create a cross-tabulation of 'Department' and 'Gender'.

In [None]:
# Row counts for every Department (rows) x Gender (columns) combination
pd.crosstab(index=df['Department'], columns=df['Gender'])

## 28. Create a pivot table showing the count of employees by 'Education_Level' and 'Role'.

In [None]:
# 'Education_Level' was one-hot encoded in an earlier cell, so rebuild a single
# label per row from the indicator columns instead of pivoting on just the
# Bachelor's indicator.
education_prefix = 'Education_Level_'
education_dummy_cols = [col for col in df.columns if str(col).startswith(education_prefix)]
education_level = (
    df[education_dummy_cols]
    .astype(int)                      # indicators may be bool; idxmax needs numbers
    .idxmax(axis=1)                   # name of the indicator column that is set
    .str[len(education_prefix):]      # strip the prefix to recover the level label
    .rename('Education_Level')
)
# Count of employees for every Education_Level x Role pair (absent pairs show 0)
pd.crosstab(education_level, df['Role'])

## 29. Replace missing values in 'Performance_Rating' with the median.

In [None]:
# Assign the filled column back. fillna(..., inplace=True) on df['Performance_Rating']
# acts on an intermediate object, which pandas warns about and which leaves df
# unchanged when Copy-on-Write is enabled.
rating_median = df['Performance_Rating'].median()
df['Performance_Rating'] = df['Performance_Rating'].fillna(rating_median)
df.head()

## 30. Calculate the Z-score for the 'Annual_Salary' column.

In [None]:
from scipy.stats import zscore

# Z-score of each value in 'Annual_Salary', stored in a new column
annual_salary = df['Annual_Salary']
df['Annual_Salary_Zscore'] = zscore(annual_salary)
df.head()

## 31. Filter out outliers in the 'Annual_Salary' column based on Z-score.

In [None]:
# Keep rows whose salary z-score lies strictly between -3 and 3
within_three_sigma = df['Annual_Salary_Zscore'].abs().lt(3)
df_no_outliers = df[within_three_sigma]
df_no_outliers.head()

## 32. Apply a lambda function to the 'Total_Compensation' column to categorize into 'Low', 'Medium', 'High'.

In [None]:
# Below 75000 -> 'Low'; from 75000 up to (not including) 125000 -> 'Medium'; otherwise 'High'
df['Total_Compensation_Category'] = df['Total_Compensation'].apply(
    lambda amount: 'Low' if amount < 75000 else ('Medium' if amount < 125000 else 'High')
)
df.head()

## 33. Create a new column 'Years_to_Retirement' by subtracting 'Age' from 65.

In [None]:
# rsub(65) computes 65 - Age for every row
df['Years_to_Retirement'] = df['Age'].rsub(65)
df.head()

## 34. Extract the first letter from 'Name' and create a new column 'Name_Initial'.

In [None]:
# First character of each name
df['Name_Initial'] = df['Name'].str.get(0)
df.head()

## 35. Convert the 'Role' column to a categorical data type.

In [None]:
# Convert 'Role' to pandas' categorical dtype, then list every column's dtype
role_as_category = df['Role'].astype('category')
df['Role'] = role_as_category
df.dtypes

## 36. Create a box plot for 'Annual_Salary' across different 'Department'.

In [None]:
# Box plot of 'Annual_Salary', with one box per 'Department' value
df.boxplot(column='Annual_Salary', by='Department')

## 37. Create a line plot showing the trend of 'Total_Compensation' over 'Years_of_Experience'.

In [None]:
# Average total compensation at each experience level, drawn as a line
mean_comp_by_experience = df.groupby('Years_of_Experience')['Total_Compensation'].mean()
mean_comp_by_experience.plot(kind='line')

## 38. Create a heatmap for the correlation matrix.

In [None]:
import seaborn as sns

# Restrict the correlation matrix to numeric columns: on recent pandas, corr()
# raises when the frame contains text or categorical columns.
correlation_matrix = df.corr(numeric_only=True)
# annot=True writes each coefficient inside its cell
sns.heatmap(correlation_matrix, annot=True)

## 39. Filter the dataset to include only employees with 'Education_Level' as 'Master's'.

In [None]:
# Rows whose Master's indicator column (created by the one-hot encoding) is set
has_masters = df["Education_Level_Master's"].eq(1)
df_masters = df.loc[has_masters]
df_masters.head()

## 40. Replace outliers in 'Bonus' with the median value.

In [None]:
# Bonuses above the 99th percentile are overwritten with the median bonus
bonus_upper_cutoff = df['Bonus'].quantile(0.99)
median_value = df['Bonus'].median()
exceeds_cutoff = df['Bonus'] > bonus_upper_cutoff
df.loc[exceeds_cutoff, 'Bonus'] = median_value
df.head()

## 41. Calculate the percentage of missing values in each column.

In [None]:
# Fraction of missing entries per column, expressed as a percentage
missing_percent = df.isna().mean().mul(100)
missing_percent

## 42. Reorder the columns so that 'Employee_ID' is the first column.

In [None]:
# Put 'Employee_ID' first and keep every other column in its current order.
remaining_columns = [col for col in df.columns if col != 'Employee_ID']
# copy() makes the reordered frame independent of the original, so adding
# columns to it later does not trigger SettingWithCopyWarning.
df = df[['Employee_ID'] + remaining_columns].copy()
df.head()

## 43. Create a column 'High_Performer' which is True if 'Performance_Rating' > 4, else False.

In [None]:
# 'Performance_Rating' was standardized in an earlier cell, so comparing it with 4
# directly would compare z-scores rather than ratings. Undo the scaling first.
# NOTE(review): relies on `scaler` still being the StandardScaler fitted on
# 'Performance_Rating' when this cell runs -- confirm the cells are run in order.
original_rating = scaler.inverse_transform(df[['Performance_Rating']]).ravel()
# Round away floating-point noise so a rating of exactly 4 is not treated as > 4.
df['High_Performer'] = original_rating.round(6) > 4
df.head()

## 44. Split the dataset into training (80%) and testing (20%) sets based on 'Employee_ID'.

In [None]:
from sklearn.model_selection import train_test_split

# 20% of the rows go to the test set; the fixed seed makes the shuffle repeatable
# (shuffle=True is the default, spelled out here)
train, test = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)
(train.shape, test.shape)

## 45. Create a pipeline to preprocess the 'Annual_Salary' and 'Performance_Rating' columns.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Single-step pipeline that standardizes its input columns
pipeline = Pipeline(steps=[('scaler', StandardScaler())])

# Columns to run through the pipeline; the results overwrite the originals
columns_to_scale = ['Annual_Salary', 'Performance_Rating']
df[columns_to_scale] = pipeline.fit_transform(df[columns_to_scale])
df.head()

## 46. Export the cleaned dataset to a new CSV file.

In [None]:
# Write the DataFrame to CSV without the index column
df.to_csv(path_or_buf='cleaned_salary_dataset.csv', index=False)

## 47. Save the dataset in Excel format with multiple sheets based on 'Department'.

In [None]:
# Excel rejects sheet names longer than 31 characters or containing any of
# [ ] : * ? / \ -- swap those characters for '_' and truncate the name.
invalid_sheet_chars = str.maketrans({char: '_' for char in '[]:*?/\\'})

with pd.ExcelWriter('salary_by_department.xlsx') as writer:
    # groupby splits the frame once instead of re-filtering it per department;
    # sort=False keeps the departments in order of first appearance.
    for department, department_rows in df.groupby('Department', sort=False):
        sheet_name = str(department).translate(invalid_sheet_chars)[:31]
        department_rows.to_excel(writer, sheet_name=sheet_name, index=False)

## 48. Create a summary report of the dataset including key statistics and visualizations.

In [None]:
# Descriptive statistics covering every column, numeric or not
summary_report = df.describe(include='all')
summary_report

## 49. Write a function to automate the data cleaning process for this dataset.

In [None]:
def clean_data(df):
    """Drop incomplete rows and exact duplicate rows from a DataFrame, in place.

    Rows containing any missing value are removed first; exact duplicate rows
    are then removed, keeping the first occurrence of each.

    No mean-imputation step is performed: once every row with a missing value
    has been dropped there is nothing left to fill, and computing ``df.mean()``
    on a frame with text columns raises ``TypeError`` on recent pandas.

    Args:
        df: The pandas DataFrame to clean. It is modified in place; pass a
            copy to leave the caller's data untouched.

    Returns:
        The same DataFrame object that was passed in, after cleaning.
    """
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    return df

# Clean a copy so the notebook's main DataFrame is left untouched
working_copy = df.copy()
cleaned_df = clean_data(working_copy)
cleaned_df.head()

## 50. Create a new column 'Compensation_Growth_Potential' using a formula on 'Years_of_Experience' and 'Performance_Rating'.

In [None]:
# Element-wise product of experience and performance rating
df['Compensation_Growth_Potential'] = df['Years_of_Experience'].mul(df['Performance_Rating'])
df.head()