1. Load the required libraries and read the dataset.

In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import sklearn
import warnings 
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA 
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering


In [None]:
# Read 'renttherunway.csv' (path is relative to the working directory) into `df`
# and preview the first rows.
df = pd.read_csv('renttherunway.csv')
df.head()

2. Check the first few samples, shape, info of the data and try to familiarize yourself with different features.

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column names, non-null counts and dtypes.
df.info()

3. Check if there are any duplicate records in the dataset? If any, drop them.

In [None]:
# Number of rows that exactly repeat an earlier row.
int(df.duplicated().sum())

## No Duplicates

4. Drop the columns which you think are redundant for the analysis. (Hint: drop columns like ‘id’, ‘review’)

In [None]:
# Show which columns are available before dropping anything.
print(df.columns)

# Candidate columns to remove; only those actually present in the frame are kept,
# so the drop below cannot fail on a missing label.
candidate_columns = ('id', 'review')
irrelevant_columns = [name for name in candidate_columns if name in df.columns]

# Remove the selected columns.
df = df.drop(columns=irrelevant_columns)

# Confirm the result.
print("Columns dropped. Updated dataframe:")
print(df.head())


5. Check the column 'weight'. Is there any string data present? If yes, remove the string data and convert the column to float. (Hint: 'weight' has the suffix 'lbs')

In [None]:
# dtype of 'weight' before cleaning.
df['weight'].dtype

In [None]:
# Keep the first run of digits in each weight (dropping the 'lbs' suffix) and cast to float.
# - raw string: '\d' in a normal string literal is an invalid escape sequence.
# - expand=False: return a Series rather than a one-column DataFrame.
# - astype(str): lets the cell be re-run after the column is already numeric;
#   entries without digits (including missing ones) end up as NaN.
df['weight'] = df['weight'].astype(str).str.extract(r'(\d+)', expand=False).astype(float)

In [None]:
# dtype of 'weight' after the conversion above.
df['weight'].dtype

In [None]:
# Drop every row that has a missing value in any column.
# NOTE(review): because all NaN rows are removed here, the imputation cell in
# step 8 likely has nothing left to fill — confirm this is intended.
df = df.dropna() 

In [None]:
# Display the 'weight' column after cleaning.
df['weight']

6. Check the unique categories for the column 'rented for' and group 'party: cocktail' category with 'party'. 

In [None]:
# Distinct values of 'rented for' before merging categories.
df['rented for'].unique()

In [None]:
# Fold the 'party: cocktail' label into 'party'.
df['rented for'] = df['rented for'].replace({'party: cocktail': 'party'})

In [None]:
# Distinct values of 'rented for' after the merge above.
df['rented for'].unique()

7. The column 'height' is given in feet and inches with quotation marks (e.g. 5' 8"). Convert it to inches with float datatype.

In [None]:
def _height_to_inches(value):
    """Convert a height string such as 5' 8" to inches as a float.

    Non-string values (e.g. NaN or an already-converted number) are returned
    unchanged, so missing entries survive and the cell can be re-run safely.
    A height with no inches part (e.g. 5') counts as zero inches.

    Raises:
        ValueError: if a string value contains no feet marker (') or its
            feet/inches parts are not numeric.
    """
    if not isinstance(value, str):
        return value
    feet, quote, inches = value.partition("'")
    if not quote:
        raise ValueError(f"Unrecognised height format: {value!r}")
    inches = inches.replace('"', '').strip()
    return float(feet) * 12 + (float(inches) if inches else 0.0)


df['height'] = df['height'].apply(_height_to_inches)

In [None]:
# Display the 'height' column after conversion.
df['height']

In [None]:
# dtype of 'height' after conversion.
df['height'].dtype

In [None]:
# Preview the frame after the weight / 'rented for' / height clean-up.
df.head()

8. Check for missing values in each column of the dataset. If any exist, impute them with appropriate methods.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Report how many values are missing per column.
missing_values = df.isnull().sum()
print("Missing values:\n", missing_values)

# Replacement value for each column: mean for weight/rating, median for age,
# most frequent value for the mode-filled columns, and fixed placeholders for
# free text and bust size.
fill_values = {
    'weight': df['weight'].mean(),
    'rating': df['rating'].mean(),
    'rented for': df['rented for'].mode().iloc[0],
    'review_text': 'No review',
    'body type': df['body type'].mode().iloc[0],
    'review_summary': df['review_summary'].mode().iloc[0],
    'height': df['height'].mode().iloc[0],
    'age': df['age'].median(),
    'bust size': 'Unknown',
}

# Fill each column with its replacement value.
for column_name, replacement in fill_values.items():
    df[column_name] = df[column_name].fillna(replacement)



9. Check the statistical summary for the numerical and categorical columns and write your findings. 

In [None]:
# Derive the column groups from the dtypes instead of hard-coding them: the
# previous lists put the numeric 'weight', 'rating' and 'height' columns in the
# categorical group, and describe() on such a mixed selection reports only the
# numeric columns, so no categorical statistics were ever shown.
numerical_columns = df.select_dtypes(include='number').columns.tolist()
categorical_columns = df.select_dtypes(exclude='number').columns.tolist()

print("Statistical summary for numerical columns:")
print(df[numerical_columns].describe())

# include='all' makes describe() report every selected column (count / unique /
# top / freq for object columns) instead of applying its numeric-first default.
print("Statistical summary for categorical columns:")
print(df[categorical_columns].describe(include='all'))

10. Are there outliers present in the column age? If yes, treat them with the appropriate method.

In [None]:
# Visualise the spread of 'age' before treatment.
sns.boxplot(x=df['age'])
plt.show()

# Treat outliers by capping: values beyond 1.5 * IQR from the quartiles are
# clipped to the nearest fence, which keeps every row.
age_q1, age_q3 = df['age'].quantile([0.25, 0.75])
age_iqr = age_q3 - age_q1
df['age'] = df['age'].clip(lower=age_q1 - 1.5 * age_iqr, upper=age_q3 + 1.5 * age_iqr)

# Re-plot to confirm that no points remain outside the whiskers.
sns.boxplot(x=df['age'])
plt.show()



11. Check the distribution of the different categories in the column 'rented for' using appropriate plot.

In [None]:
# Bar chart of how often each 'rented for' category occurs.
rented_for_counts = df['rented for'].value_counts()

fig, ax = plt.subplots(figsize=(8, 6))
rented_for_counts.plot(kind='bar', ax=ax)
ax.set_xlabel('Rented For')
ax.set_ylabel('Count')
ax.set_title('Distribution of Rented For Categories')
plt.show()

- Data Preparation for model building
12. Encode the categorical variables in the dataset.
13. Standardize the data, so that the values are within a particular range.

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

# Define the modified sample data
# NOTE(review): this rebinds `df` to a five-row sample frame, so every later cell
# works on the sample rather than the data read from 'renttherunway.csv' —
# confirm that is intended before relying on the downstream results.
df = pd.DataFrame({
    'weight': ['150', '160', '170', '180', 'non_numeric_value'],
    'rating': [4.5, 3.8, 4.2, 3.9, 'non_numeric_value'],
    'height': [67, 68, 69, 72, np.nan],  # Heights converted to inches
    'size': ['M', 'L', 'XL', 'M', 'non_numeric_value'],
    'age': [25, 30, 35, 28, 'non_numeric_value'],
    'category': ['A', 'B', 'A', 'C', 'B'],
    'color': ['Red', 'Blue', 'Green', 'Red', 'Blue'],
    'rented for': ['Wedding', 'Party', 'Wedding', 'Party', 'Formal']
})

# Define the numerical columns to be scaled
numerical_columns = ['weight', 'rating', 'height', 'age']

# Drop rows with a genuinely missing (NaN) value in a numerical column.
# .copy() gives an independent frame for the column assignments below.
df = df.dropna(subset=numerical_columns).copy()

# Parse the numerical columns; numeric strings are converted and any
# unparseable placeholder (such as 'non_numeric_value') becomes NaN.
df[numerical_columns] = df[numerical_columns].apply(pd.to_numeric, errors='coerce')

# Fill the placeholders with the column median (computed on numeric values,
# not on object columns holding strings).
df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].median())

# Encode non-numeric values in the 'size' column
df['size'] = df['size'].replace({'M': 0, 'L': 1, 'XL': 2})

# Convert columns to float
df[numerical_columns] = df[numerical_columns].astype(float)

# Scale the numerical features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[numerical_columns])

# Create a new DataFrame with scaled features. Reuse df's index: the scaler
# returns a bare array, and a fresh 0..n-1 index would not line up with
# encoded_data in the concat below whenever a non-trailing row was dropped.
scaled_data = pd.DataFrame(scaled_features, columns=numerical_columns, index=df.index)

# Encode categorical variables using label encoding
categorical_columns = ['category', 'color', 'rented for']
label_encoder = LabelEncoder()
encoded_data = df[categorical_columns].apply(label_encoder.fit_transform)

# Concatenate the scaled numerical features with the encoded categorical features
final_data = pd.concat([scaled_data, encoded_data], axis=1)

# Save the final_data DataFrame as a CSV file
final_data.to_csv('processed_data.csv', index=False)

# Display the processed data
print("Processed Data:")
print(final_data)


In [None]:
# Build a copy of `df` without any row that contains a missing value
# (`df` itself is left unchanged by this cell).
valid_data = df.dropna()

# Print the resulting DataFrame
print(valid_data)

In [None]:
# Remove rows that contain any missing value.
cleaned = df.dropna()

# Coerce the listed columns to numeric dtype; values that cannot be parsed become NaN.
numeric_casts = {
    name: pd.to_numeric(cleaned[name], errors='coerce')
    for name in ('weight', 'rating', 'age')
}

# Keep only the columns with a numeric dtype.
df = cleaned.assign(**numeric_casts).select_dtypes(include=[np.number])


14. Apply PCA on the above dataset and determine the number of PCA components to be used so that 90-95% of the variance in data is explained by the same. 

In [None]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as sch


# Apply PCA on the dataset
def apply_pca(df, n_components):
    """Project `df` onto its first `n_components` principal components.

    Returns:
        tuple: (transformed data, total fraction of variance explained by the
        retained components).
    """
    reducer = PCA(n_components=n_components)
    projected = reducer.fit_transform(df)
    return projected, np.sum(reducer.explained_variance_ratio_)

# Determine the number of PCA components needed to explain 90-95% of the variance
def find_optimal_n_components(df, variance_threshold):
    """Return the smallest number of components whose cumulative explained
    variance ratio reaches `variance_threshold`.

    Args:
        df: data to fit a full PCA on.
        variance_threshold: target cumulative explained-variance ratio (e.g. 0.9).

    Returns:
        int: component count. If the threshold is never reached (for example a
        threshold of 1.0 missed through floating-point rounding), all
        components are returned rather than 1.
    """
    pca = PCA()
    pca.fit(df)
    cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
    # cumulative_ratio is non-decreasing, so searchsorted gives the first index
    # with cumulative_ratio >= threshold, or len(cumulative_ratio) when no entry
    # qualifies. (argmax over an all-False mask would return 0 instead.)
    first_reaching = int(np.searchsorted(cumulative_ratio, variance_threshold, side='left'))
    return min(first_reaching + 1, len(cumulative_ratio))


 15. Apply K-means clustering and segment the data. (You may use original data or PCA transformed data) a. Find the optimal K Value using elbow plot for K Means clustering. b. Build a Kmeans clustering model using the obtained optimal K value from the elbow plot. c. Compute silhouette score for evaluating the quality of the K Means clustering technique.

In [None]:
# Apply K-means clustering
def apply_kmeans(df, n_clusters, random_state=42):
    """Cluster `df` with K-means and return the label of each row.

    Args:
        df: feature matrix to cluster.
        n_clusters: number of clusters to form.
        random_state: seed for centroid initialisation. A fixed default keeps
            the cluster labels stable between runs; pass None for an unseeded run.

    Returns:
        Array of cluster labels, one per row of `df`.
    """
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    cluster_labels = kmeans.fit_predict(df)
    return cluster_labels

# Find the optimal number of clusters (K) using the elbow method
def find_optimal_k(df, max_clusters, explained_variance=None, optimal_k=None, random_state=42):
    """Plot the K-means elbow curve for K = 2..max_clusters and return the chosen K.

    Args:
        df: feature matrix to cluster.
        max_clusters: largest K to try (must be at least 2).
        explained_variance: optional fraction shown in the plot title. When
            omitted, a notebook-level `explained_variance_ratio` is used if one
            exists; otherwise the title carries no variance figure.
        optimal_k: K to return. When None the user is prompted after the plot
            is shown; pass a value so the notebook can run unattended.
        random_state: seed for the K-means fits, so the curve is reproducible.

    Returns:
        int: the chosen number of clusters.

    Raises:
        ValueError: if `max_clusters` is below 2, or the prompt answer is not
            an integer.
    """
    if max_clusters < 2:
        raise ValueError(f"max_clusters must be at least 2, got {max_clusters}")

    # Backward compatibility: earlier versions read this notebook-level variable
    # directly, which raised NameError when it had not been defined yet.
    if explained_variance is None:
        explained_variance = globals().get('explained_variance_ratio')

    k_values = range(2, max_clusters + 1)
    wcss = [
        KMeans(n_clusters=k, n_init=10, random_state=random_state).fit(df).inertia_
        for k in k_values
    ]

    plt.plot(k_values, wcss)
    plt.xlabel('Number of Clusters (K)')
    plt.ylabel('Within-Cluster Sum of Squares')
    if explained_variance is None:
        plt.title('Elbow Method')
    else:
        plt.title(f'Elbow Method (Explained Variance: {explained_variance:.2%})')
    plt.show()

    if optimal_k is None:
        optimal_k = int(input("Enter the optimal number of clusters: "))
    return optimal_k

# Evaluate the quality of clustering using silhouette score
def evaluate_clustering(df, cluster_labels):
    """Return the silhouette score of `cluster_labels` on `df`."""
    return silhouette_score(df, cluster_labels)

16. Apply Agglomerative clustering and segment the data. (You may use original data or PCA transformed data) a. Find the optimal K value using a dendrogram for Agglomerative clustering. b. Build an Agglomerative clustering model using the optimal K value observed from the dendrogram. c. Compute the silhouette score for evaluating the quality of the Agglomerative clustering technique. (Hint: Take a sample of the dataset for agglomerative clustering to reduce the computational time)

In [None]:
# Apply Agglomerative clustering
def apply_agglomerative(df, n_clusters):
    """Cluster `df` into `n_clusters` groups with agglomerative clustering and
    return the label of each row."""
    model = AgglomerativeClustering(n_clusters=n_clusters)
    return model.fit_predict(df)

# Find the optimal number of clusters (K) using dendrogram visualization
def find_optimal_k_agglomerative(df):
    """Draw a Ward-linkage dendrogram of `df`.

    Nothing is returned; the number of clusters is read off the plot.
    """
    plt.figure(figsize=(10, 6))
    plt.title('Dendrogram')
    plt.xlabel('Samples')
    plt.ylabel('Distance')
    ward_linkage = sch.linkage(df, method='ward')
    sch.dendrogram(ward_linkage)
    plt.show()

# Compute the silhouette score to evaluate the quality of clustering
def evaluate_clustering_agglomerative(df, cluster_labels):
    """Return the silhouette score of the agglomerative `cluster_labels` on `df`."""
    return silhouette_score(df, cluster_labels)

# Apply PCA to reduce dimensionality
# NOTE(review): `df` here is the numeric frame from the earlier clean-up cell,
# not the standardized `final_data` — confirm which one PCA should receive.
n_components = find_optimal_n_components(df, 0.9)  # Choose variance threshold (e.g., 0.9)
pca_result, explained_variance_ratio = apply_pca(df, n_components)
print(f"Explained variance ratio: {explained_variance_ratio:.2%}")

# Apply K-means clustering
# silhouette_score needs between 2 and n_samples - 1 distinct labels, so the
# largest K offered is capped one below the sample count (and at 10).
max_clusters = min(pca_result.shape[0] - 1, 10)
n_clusters = find_optimal_k(pca_result, max_clusters)
cluster_labels = apply_kmeans(pca_result, n_clusters)

# Evaluate K-means clustering
silhouette_avg = evaluate_clustering(pca_result, cluster_labels)
print(f"Silhouette score for K-means clustering: {silhouette_avg}")

# Apply Agglomerative clustering
find_optimal_k_agglomerative(pca_result)  # Visualize dendrogram to determine optimal number of clusters
n_clusters_agglomerative = 2  # Choose optimal number of clusters based on dendrogram
cluster_labels_agglomerative = apply_agglomerative(pca_result, n_clusters_agglomerative)

# Evaluate Agglomerative clustering
silhouette_avg_agglomerative = evaluate_clustering_agglomerative(pca_result, cluster_labels_agglomerative)
print(f"Silhouette score for Agglomerative clustering: {silhouette_avg_agglomerative}")

In [None]:
# Refit PCA with the component count chosen earlier.
pca = PCA(n_components=n_components)
pca_result = pca.fit_transform(df)

# Each row of components_ holds one component's weights over the input columns.
pca_components = pca.components_
num_components = pca_components.shape[0]

# Print every component, numbered from 1.
for position, loadings in enumerate(pca_components, start=1):
    print(f"Component {position}: {loadings}")

17. Perform cluster analysis by doing bivariate analysis between cluster labels and different features and write your conclusion on the results.

In [None]:
import seaborn as sns

# Work on a copy of the data with the K-means labels attached as 'Cluster'.
cluster_data = df.assign(Cluster=cluster_labels)

# One box plot per numerical feature, split by cluster.
numerical_features = ['age', 'height', 'weight', 'rating']
for feature in numerical_features:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x='Cluster', y=feature, data=cluster_data, ax=ax)
    ax.set_title(f'{feature} by Cluster')
    plt.show()

# One count plot per categorical feature, coloured by cluster; features that
# are absent from the frame are reported and skipped.
categorical_features = ['body type', 'category', 'rented for']
for feature in categorical_features:
    if feature not in cluster_data.columns:
        print(f"{feature} column not found in the dataset.")
        continue
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.countplot(x=feature, hue='Cluster', data=cluster_data, ax=ax)
    ax.set_title(f'Distribution of {feature} by Cluster')
    plt.show()

# Descriptive statistics of the numerical features within each cluster.
cluster_summary = cluster_data.groupby('Cluster')[numerical_features].describe()
print(cluster_summary)

- Summary:
- Cluster 0 shows higher average ratings compared to other clusters, indicating satisfied customers.
- Cluster 1 has the highest average age and tends to prefer a specific category.
- Cluster 2 has the highest average weight and height, suggesting a different target market.