## 1. Load the dataset and display the first 5 rows.

In [None]:
import pandas as pd

# Read the source CSV into the working DataFrame and preview its first rows.
csv_path = 'training_dataset_unique_ids.csv'
df = pd.read_csv(csv_path)
df.head(5)

## 2. Display the summary statistics of the dataset.

In [None]:
# Descriptive statistics for the columns pandas summarises by default.
summary_stats = df.describe()
summary_stats

## 3. Check for missing values in the dataset.

In [None]:
# Number of missing entries in each column.
missing_counts = df.isna().sum()
missing_counts

## 4. Drop rows with any missing values.

In [None]:
# Remove, in place, every row that has at least one missing value.
df.dropna(axis=0, how='any', inplace=True)
df.head(5)

## 5. Fill missing values with the mean for numerical columns.

In [None]:
# Impute gaps in numeric columns with each column's mean.
# numeric_only=True restricts the mean to numeric columns; without it,
# pandas >= 2.0 raises TypeError when the frame also holds string columns.
# fillna with a Series matches on column name, so columns that have no
# entry in the mean Series are left untouched.
df.fillna(df.mean(numeric_only=True), inplace=True)
df.head()

## 6. Fill missing values with the mode for categorical columns.

In [None]:
# Impute gaps in non-numeric columns with each column's most frequent value.
# Restricting the mode to non-numeric columns keeps numeric gaps from being
# filled with a mode, which is outside the scope of this step.
non_numeric_cols = df.select_dtypes(exclude='number').columns
column_modes = df[non_numeric_cols].mode()
# mode() yields no rows when there is nothing to summarise (empty frame or
# no non-numeric columns); iloc[0] would raise IndexError in that case.
if not column_modes.empty:
    df.fillna(column_modes.iloc[0], inplace=True)
df.head()

## 7. Create a new column 'Age_Group' based on 'Age' using bins.

In [None]:
# Right-closed age bands: [0, 18], (18, 35], (35, 50], (50, 65], (65, inf).
# The open-ended last edge keeps ages above 80 in 'Elder' instead of NaN,
# and include_lowest=True keeps an age of exactly 0 in 'Child'.
bins = [0, 18, 35, 50, 65, float('inf')]
labels = ['Child', 'Young_Adult', 'Adult', 'Senior_Adult', 'Elder']
df['Age_Group'] = pd.cut(df['Age'], bins=bins, labels=labels, include_lowest=True)
df.head()

## 8. Calculate the correlation matrix for numerical columns.

In [None]:
# Pairwise correlation of the numeric columns. numeric_only=True is required
# on pandas >= 2.0, where corr() no longer silently skips string/category
# columns and raises on them instead.
df.corr(numeric_only=True)

## 9. Plot a histogram for the 'Annual_Income' column.

In [None]:
# Distribution of 'Annual_Income' across 20 equal-width bins.
income_values = df['Annual_Income']
income_values.hist(bins=20)

## 10. Plot a bar chart for the 'Product_Category' column.

In [None]:
# Bar chart of how often each product category occurs.
category_counts = df['Product_Category'].value_counts()
category_counts.plot.bar()

## 11. Plot a scatter plot between 'Annual_Income' and 'Spending_Score'.

In [None]:
# Scatter of income (x axis) against spending score (y axis).
df.plot(kind='scatter', x='Annual_Income', y='Spending_Score')

## 12. Encode the 'Gender' column using label encoding.

In [None]:
from sklearn.preprocessing import LabelEncoder
# Learn the distinct 'Gender' values, then overwrite the column with their
# integer codes.
le = LabelEncoder()
le.fit(df['Gender'])
gender_codes = le.transform(df['Gender'])
df['Gender'] = gender_codes
df.head(5)

## 13. Encode the 'Product_Category' column using one-hot encoding.

In [None]:
# Replace 'Product_Category' with one indicator column per category value.
one_hot_columns = ['Product_Category']
df = pd.get_dummies(data=df, columns=one_hot_columns)
df.head(5)

## 14. Normalize the 'Annual_Income' column using Min-Max scaling.

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Rescale 'Annual_Income' with Min-Max scaling and overwrite the column.
scaler = MinMaxScaler()
scaled_income = scaler.fit_transform(df[['Annual_Income']])
df['Annual_Income'] = scaled_income
df.head(5)

## 15. Standardize the 'Spending_Score' column.

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise 'Spending_Score' and overwrite the column.
# Note: this rebinds the notebook-level name `scaler`.
scaler = StandardScaler()
standardized_scores = scaler.fit_transform(df[['Spending_Score']])
df['Spending_Score'] = standardized_scores
df.head(5)

## 16. Create a pivot table showing the average 'Purchase_Amount' for each 'City'.

In [None]:
# Mean 'Purchase_Amount' per city, one row per city.
city_purchase_means = df.pivot_table(index='City', values='Purchase_Amount', aggfunc='mean')
city_purchase_means

## 17. Group the data by 'Loyalty_Status' and calculate the mean 'Annual_Income'.

In [None]:
# Mean of the 'Annual_Income' column within each loyalty status.
income_by_loyalty = df.groupby('Loyalty_Status')['Annual_Income']
income_by_loyalty.mean()

## 18. Filter the dataset for customers with 'Annual_Income' greater than 50,000.

In [None]:
# 'Annual_Income' in df was Min-Max scaled to [0, 1] in step 14, so comparing
# it against 50,000 can never match. Compare against the raw incomes from the
# source file instead; reindex aligns them with the rows still present in df
# (the steps above drop rows but never relabel the index).
raw_income = pd.read_csv('training_dataset_unique_ids.csv', usecols=['Annual_Income'])['Annual_Income']
df_filtered = df[raw_income.reindex(df.index) > 50000]
df_filtered.head()

## 19. Sort the dataset by 'Spending_Score' in descending order.

In [None]:
# Rows ordered from highest to lowest 'Spending_Score'; show the top five.
by_spending_desc = df.sort_values('Spending_Score', ascending=False)
by_spending_desc.head(5)

## 20. Create a new column 'Income_per_Year' by dividing 'Annual_Income' by 'Membership_Years'.

In [None]:
# A membership of 0 years would make the ratio +/-inf (or NaN for 0/0) and
# distort later statistics; treat those rows as undefined (NaN) instead.
membership_years = df['Membership_Years'].replace(0, float('nan'))
df['Income_per_Year'] = df['Annual_Income'] / membership_years
df.head()

## 21. Replace all instances of 'Cash' in the 'Payment_Method' column with 'Other'.

In [None]:
# Assign the result back to the column. Calling replace(..., inplace=True) on
# df['Payment_Method'] mutates an intermediate object; under pandas
# Copy-on-Write that change never reaches df, and pandas 2.x warns about it.
df['Payment_Method'] = df['Payment_Method'].replace('Cash', 'Other')
df.head()

## 22. Rename the column 'Annual_Income' to 'Income'.

In [None]:
# Rename the income column in place.
column_renames = {'Annual_Income': 'Income'}
df.rename(columns=column_renames, inplace=True)
df.head(5)

## 23. Drop the 'Complaint_Count' column from the dataset.

In [None]:
# Remove the 'Complaint_Count' column in place.
df.drop('Complaint_Count', axis=1, inplace=True)
df.head(5)

## 24. Check for duplicate rows in the dataset.

In [None]:
# Count rows that repeat an earlier row exactly.
duplicate_mask = df.duplicated()
duplicate_mask.sum()

## 25. Drop any duplicate rows found.

In [None]:
# Drop repeated rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep='first', inplace=True)
df.head(5)

## 26. Create a new DataFrame containing only 'Customer_ID', 'Age', and 'City'.

In [None]:
# New frame holding only the identifier, age and city columns.
selected_columns = ['Customer_ID', 'Age', 'City']
df_subset = df[selected_columns]
df_subset.head(5)

## 27. Merge the new DataFrame with the original dataset on 'Customer_ID'.

In [None]:
# Inner join of the subset with the full frame on 'Customer_ID'.
# 'Age' and 'City' exist on both sides, so pandas suffixes them _x / _y.
df_merged = df_subset.merge(df, on='Customer_ID')
df_merged.head(5)

## 28. Create a cross-tabulation of 'Gender' and 'Loyalty_Status'.

In [None]:
# Frequency table: 'Gender' values as rows, 'Loyalty_Status' values as columns.
gender_by_loyalty = pd.crosstab(index=df['Gender'], columns=df['Loyalty_Status'])
gender_by_loyalty

## 29. Create a pivot table showing the count of customers by 'City' and 'Product_Category_Clothing'.

In [None]:
# Row counts per city, split by the 'Product_Category_Clothing' indicator;
# combinations that never occur are shown as 0.
city_clothing_counts = df.pivot_table(
    columns='Product_Category_Clothing',
    index='City',
    fill_value=0,
    aggfunc='size',
)
city_clothing_counts

## 30. Replace missing values in 'Spending_Score' with the median.

In [None]:
# Assign the filled column back. fillna(..., inplace=True) on
# df['Spending_Score'] mutates an intermediate object; under pandas
# Copy-on-Write that change never reaches df, and pandas 2.x warns about it.
spending_median = df['Spending_Score'].median()
df['Spending_Score'] = df['Spending_Score'].fillna(spending_median)
df.head()

## 31. Calculate the Z-score for the 'Income' column.

In [None]:
from scipy.stats import zscore
# Store the z-score of every 'Income' value in a new column.
income_column = df['Income']
df['Income_Zscore'] = zscore(income_column)
df.head(5)

## 32. Filter out outliers in the 'Income' column based on Z-score.

In [None]:
# Keep rows whose income z-score lies strictly between -3 and 3.
within_three_sigma = df['Income_Zscore'].abs() < 3
df_no_outliers = df[within_three_sigma]
df_no_outliers.head(5)

## 33. Apply a lambda function to the 'Age' column to categorize into 'Young', 'Adult', 'Senior'.

In [None]:
def categorize_age(age):
    """Return 'Young' below 30, 'Adult' below 60, otherwise 'Senior'."""
    if age < 30:
        return 'Young'
    if age < 60:
        return 'Adult'
    return 'Senior'


# Label every customer with the age category of their 'Age' value.
df['Age_Category'] = df['Age'].apply(categorize_age)
df.head(5)

## 34. Create a new column 'Total_Spent' as 'Purchase_Amount' multiplied by 'Membership_Years'.

In [None]:
# Total spend = purchase amount multiplied by years of membership.
df['Total_Spent'] = df['Purchase_Amount'].mul(df['Membership_Years'])
df.head(5)

## 36. Convert the 'Payment_Method' column to a categorical data type.

In [None]:
# Store 'Payment_Method' with pandas' categorical dtype, then list all dtypes.
payment_as_category = df['Payment_Method'].astype('category')
df['Payment_Method'] = payment_as_category
df.dtypes

## 37. Create a box plot for 'Purchase_Amount' across different 'Product_Category_Electronics'.

In [None]:
# One box of 'Purchase_Amount' per value of the Electronics indicator column.
df.boxplot(by='Product_Category_Electronics', column='Purchase_Amount')

## 38. Create a line plot showing the trend of 'Spending_Score' over 'Membership_Years'.

In [None]:
# Mean 'Spending_Score' for each membership length, drawn as a line.
mean_score_by_years = df.groupby('Membership_Years')['Spending_Score'].mean()
mean_score_by_years.plot(kind='line')

## 39. Create a heatmap for the correlation matrix.

In [None]:
import seaborn as sns
# Annotated heatmap of the numeric columns' correlations. numeric_only=True
# is required on pandas >= 2.0: df still holds string/category columns here
# (e.g. 'City', 'Age_Group'), and corr() raises on them otherwise.
sns.heatmap(df.corr(numeric_only=True), annot=True)

## 40. Filter the dataset to include only customers from 'New York'.

In [None]:
# Rows whose 'City' is exactly 'New York'.
is_new_york = df['City'].eq('New York')
df_ny = df[is_new_york]
df_ny.head(5)

## 41. Replace outliers in 'Purchase_Amount' with the median value.

In [None]:
# Overwrite purchase amounts above the 99th percentile with the column median.
# Both statistics are taken before any value is replaced.
purchase_amounts = df['Purchase_Amount']
median_value = purchase_amounts.median()
upper_cutoff = purchase_amounts.quantile(0.99)
above_cutoff = purchase_amounts > upper_cutoff
df.loc[above_cutoff, 'Purchase_Amount'] = median_value
df.head(5)

## 42. Calculate the percentage of missing values in each column.

In [None]:
# Share of missing entries per column, as a percentage.
missing_fraction = df.isna().mean()
missing_percent = missing_fraction.mul(100)
missing_percent

## 43. Reorder the columns so that 'Customer_ID' is the first column.

In [None]:
# Move 'Customer_ID' to the front; all other columns keep their order.
remaining_columns = [name for name in df.columns if name != 'Customer_ID']
df = df[['Customer_ID', *remaining_columns]]
df.head(5)

## 44. Create a column 'High_Spender' which is True if 'Purchase_Amount' > 1500, else False.

In [None]:
# Boolean flag: True where 'Purchase_Amount' is strictly above 1500.
df['High_Spender'] = df['Purchase_Amount'].gt(1500)
df.head(5)

## 45. Split the dataset into training (80%) and testing (20%) sets based on 'Customer_ID'.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
split_frames = train_test_split(df, test_size=0.2, random_state=42)
train, test = split_frames
(train.shape, test.shape)

## 46. Create a pipeline to preprocess the 'Income' and 'Spending_Score' columns.

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Single-step pipeline that standardises the two numeric feature columns and
# writes the result back over them.
feature_columns = ['Income', 'Spending_Score']
pipeline = Pipeline(steps=[('scaler', StandardScaler())])
scaled_features = pipeline.fit_transform(df[feature_columns])
df[feature_columns] = scaled_features
df.head(5)

## 47. Export the cleaned dataset to a new CSV file.

In [None]:
# Write the current frame to disk without the index column.
output_path = 'cleaned_dataset.csv'
df.to_csv(output_path, index=False)

## 48. Save the dataset in Excel format with multiple sheets based on 'City'.

In [None]:
# Characters Excel forbids in sheet names; each is replaced with '_'.
invalid_sheet_chars = str.maketrans({ch: '_' for ch in '[]:*?/\\'})
with pd.ExcelWriter('dataset_by_city.xlsx') as writer:
    # groupby splits the frame in a single pass (instead of re-scanning it
    # once per city) and skips rows whose 'City' is missing; sort=False keeps
    # the cities in first-appearance order.
    for city, city_rows in df.groupby('City', sort=False):
        # Excel caps sheet names at 31 characters.
        sheet_name = str(city).translate(invalid_sheet_chars)[:31]
        city_rows.to_excel(writer, sheet_name=sheet_name, index=False)

## 49. Create a summary report of the dataset including key statistics and visualizations.

In [None]:
# Summary statistics covering every column, numeric or not.
full_summary = df.describe(include='all')
full_summary

## 50. Write a function to automate the data cleaning process for this dataset.

In [None]:
def clean_data(df):
    """Clean a DataFrame in place and return it.

    Rows containing any missing value are dropped, exact duplicate rows are
    removed (the first occurrence is kept), and any numeric gaps that remain
    are filled with the column mean.

    Args:
        df: The DataFrame to clean. It is modified in place; pass a copy
            to keep the original untouched.

    Returns:
        The same DataFrame object that was passed in, after cleaning.
    """
    df.dropna(inplace=True)
    df.drop_duplicates(inplace=True)
    # numeric_only=True restricts the mean to numeric columns; without it,
    # pandas >= 2.0 raises TypeError when the frame also holds string columns.
    # After dropna this is a safety net and normally changes nothing.
    df.fillna(df.mean(numeric_only=True), inplace=True)
    return df

# Clean a copy so the working DataFrame itself is left unchanged.
working_copy = df.copy()
cleaned_df = clean_data(working_copy)
cleaned_df.head(5)