<a href="https://colab.research.google.com/github/chaitanyaj2121/Comp-Multiservises-mini-project/blob/main/amlca1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1.	Perform K means clustering on the airlines dataset to obtain optimum number of clusters. Draw the inferences from the clusters obtained. Refer to EastWestAirlines.xlsx dataset.

In [None]:
import pandas as pd

# Load the dataset
file_path = '/content/EastWestAirlines (1).xlsx'

# Open the workbook inside a context manager so the underlying file handle is
# closed as soon as the sheet names have been read. The next cell reads the
# sheet again from `file_path`, so nothing needs to stay open.
with pd.ExcelFile(file_path) as xls:
    sheet_names = xls.sheet_names

# Display sheet names to understand the structure of the file
sheet_names


In [None]:
# Read the worksheet named 'data' from the workbook into a DataFrame.
data = pd.read_excel(io=file_path, sheet_name='data')

# Preview the leading rows to get a feel for the columns and their values.
data.head(n=5)


In [None]:
# Checking for missing values and basic statistics.
# `info()` prints its report and returns None, so it must not be bundled with
# `describe()` in a tuple: that renders as `(None, <plain-text frame>)`.
# Calling it on its own line keeps the report, and leaving `describe()` as the
# last expression gives the rich table display.
data.info()
data.describe()


In [None]:
from sklearn.preprocessing import StandardScaler

# Remove the 'ID#' column before scaling (treated as irrelevant for clustering).
data_clean = data.drop('ID#', axis=1)

# Standardise every remaining column: learn the statistics, then apply them.
scaler = StandardScaler()
scaler.fit(data_clean)
scaled_data = scaler.transform(data_clean)

# Peek at the first five scaled rows
scaled_data[0:5]


In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Define the range for k values (number of clusters)
k_range = range(1, 11)
wcss = []  # Within-Cluster Sum of Squares

# Calculate WCSS for each k
for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(scaled_data)
    wcss.append(kmeans.inertia_)

# Plot the Elbow Method graph.
# The format string only carries marker and line style ('o-'); the colour is
# given once via `color=`. Passing 'bo-' together with `color='blue'` defines
# the colour twice and makes matplotlib emit a "redundantly defined" warning.
plt.figure(figsize=(8, 5))
plt.plot(k_range, wcss, 'o-', color='blue')
plt.title('Elbow Method for Optimal K')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS (Within-Cluster Sum of Squares)')
plt.xticks(k_range)
plt.grid(True)
plt.show()


In [None]:
# Repeat the elbow computation with the KMeans settings spelled out explicitly:
# at most 300 iterations per run and 10 differently-seeded initialisations per
# k (the run with the lowest inertia is kept). Pinning `n_init` keeps the curve
# independent of the library's version-specific default.
wcss_optimized = []

for k in k_range:
    kmeans = KMeans(n_clusters=k, max_iter=300, n_init=10, random_state=42)
    kmeans.fit(scaled_data)
    wcss_optimized.append(kmeans.inertia_)

# Plot the elbow graph for the explicit-settings run.
# Marker/line style come from the format string, colour from `color=` only, so
# matplotlib does not warn about the colour being defined twice.
plt.figure(figsize=(8, 5))
plt.plot(k_range, wcss_optimized, 'o-', color='blue')
plt.title('Optimized Elbow Method for Optimal K')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS (Within-Cluster Sum of Squares)')
plt.xticks(k_range)
plt.grid(True)
plt.show()


In [None]:
from sklearn.decomposition import PCA

# Fraction of the total variance the retained components must explain.
variance_to_keep = 0.95

# Fit PCA on the standardised data and project it onto the kept components.
pca = PCA(n_components=variance_to_keep)
scaled_data_pca = pca.fit_transform(scaled_data)

# Shape of the reduced matrix: (rows, retained components)
scaled_data_pca.shape


In [None]:
from sklearn.cluster import MiniBatchKMeans

# Define the range for k values (number of clusters)
k_range = range(1, 11)
wcss_mini_batch = []

# Apply MiniBatchKMeans for each k.
# NOTE(review): this fits on `scaled_data`; the PCA-reduced `scaled_data_pca`
# computed in the previous cell is not used here — confirm that is intended.
for k in k_range:
    mini_kmeans = MiniBatchKMeans(n_clusters=k, batch_size=100, random_state=42)
    mini_kmeans.fit(scaled_data)
    wcss_mini_batch.append(mini_kmeans.inertia_)

# Plot the Elbow Method graph using MiniBatchKMeans.
# The original format string 'bo-' asked for blue while `color='green'` asked
# for green; matplotlib warns and lets the keyword win. Keeping only the
# marker/line style in the format string yields the same green curve without
# the conflicting colour definition.
plt.figure(figsize=(8, 5))
plt.plot(k_range, wcss_mini_batch, 'o-', color='green')
plt.title('Elbow Method with MiniBatchKMeans for Optimal K')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS (Within-Cluster Sum of Squares)')
plt.xticks(k_range)
plt.grid(True)
plt.show()


In [None]:
import numpy as np

# Sample 50% of the data (without replacement) for faster computation.
# The global NumPy seed is fixed so the same rows are drawn on every run.
np.random.seed(42)
sample_indices = np.random.choice(len(scaled_data), size=int(len(scaled_data) * 0.5), replace=False)
sampled_data = scaled_data[sample_indices]

# Fit MiniBatchKMeans with a smaller batch size on the sample for each k
# (`k_range` comes from the previous cell).
wcss_sampled = []

for k in k_range:
    mini_kmeans = MiniBatchKMeans(n_clusters=k, batch_size=50, random_state=42)
    mini_kmeans.fit(sampled_data)
    wcss_sampled.append(mini_kmeans.inertia_)

# Plot the Elbow Method graph with sampled data.
# Only marker/line style go in the format string; 'bo-' combined with
# `color='purple'` defined two different colours and triggered a matplotlib
# warning (the keyword won, so the curve was already purple).
plt.figure(figsize=(8, 5))
plt.plot(k_range, wcss_sampled, 'o-', color='purple')
plt.title('Elbow Method (Sampled Data) for Optimal K')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('WCSS (Within-Cluster Sum of Squares)')
plt.xticks(k_range)
plt.grid(True)
plt.show()


# 2. Perform clustering on the crime data, identify the number of clusters formed, and draw inferences. Refer to the crime_data.csv dataset.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import seaborn as sns

# Load the dataset
file_path = '/content/crime_data (1).csv'
# The first CSV column (state names, stored under an empty header) is read
# directly as the index. This avoids depending on pandas' auto-generated
# 'Unnamed: 0' column name and on an in-place set_index afterwards.
crime_data = pd.read_csv(file_path, index_col=0)

# 1. Business Problem
# 1.1 Business Objective: Identify clusters of states based on crime rates to support targeted policy making.
# 1.2 Constraints: No explicit constraints, but data normalization is needed due to scale differences.

# 2. Data Dictionary
# - State: Name of the state
# - Murder: Murder rate per 100,000 people
# - Assault: Assault rate per 100,000 people
# - UrbanPop: Percent urban population
# - Rape: Rape rate per 100,000 people

# 3. Data Pre-processing
# 3.1 Data Cleaning
# Check for missing values
print(crime_data.isnull().sum())

# Standardize the data (computed before the 'Cluster' column is added below,
# so the label never becomes a feature)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(crime_data)

# 4. Exploratory Data Analysis (EDA)
# 4.1 Summary
print(crime_data.describe())

# 4.2 Univariate Analysis
crime_data.hist(bins=15, figsize=(15,10))
plt.show()

# 4.3 Bivariate Analysis
sns.pairplot(crime_data)
plt.show()

# 5. Model Building
# 5.1 Elbow Method to find optimal number of clusters
inertia = []
k_range = range(1, 11)

for k in k_range:
    # n_init is pinned (as in the later cells of this notebook) so the result
    # does not depend on the scikit-learn version's default.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(scaled_data)
    inertia.append(kmeans.inertia_)

# Scree Plot
plt.figure(figsize=(8, 5))
plt.plot(k_range, inertia, 'bo-')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.xticks(k_range)  # one tick per candidate k, matching the earlier elbow plots
plt.grid(True)
plt.show()

# 5.2 K-Means Clustering with k=4
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
crime_data['Cluster'] = kmeans.fit_predict(scaled_data)

# 5.3 Validate and Compare Results
# Per-cluster mean of every crime/urbanisation measure
cluster_summary = crime_data.groupby('Cluster').mean()
print(cluster_summary)

# Visualize Clusters
sns.pairplot(crime_data, hue='Cluster')
plt.show()

# 6. Benefits/Impact
# - Helps in identifying high crime rate areas for resource allocation.
# - Supports targeted law enforcement strategies.
# - Aids in socio-economic policy development to reduce crime rates.


# 3. Analyze the information given in the following 'Insurance Policy dataset' to create clusters of persons falling in the same type. Refer to Insurance Dataset.csv

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist

# Load the dataset
from google.colab import files
uploaded = files.upload()

# An empty result (no file selected) would make next(iter(...)) raise a bare
# StopIteration; fail with an actionable message instead.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the insurance CSV.")

# Read the dataset (the first uploaded entry is read back by its file name)
df = pd.read_csv(next(iter(uploaded)))

# Display basic information
df.info()  # info() prints its own report and returns None; print() would add a stray "None"
print(df.head())

# Data Dictionary (Manually create based on dataset)

# Data Preprocessing
# Drop rows with missing values, then exact duplicate rows. A new frame is
# produced instead of mutating in place.
df = df.dropna().drop_duplicates()

# Encoding categorical variables (if any)
df = pd.get_dummies(df, drop_first=True)

# Standardizing the data (done before the 'Cluster' column is added, so the
# label is never part of the feature matrix)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)

# Finding the optimal number of clusters using Elbow method
inertia = []
K = range(1, 11)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(scaled_data)
    inertia.append(kmeans.inertia_)

# Plot the Elbow method
plt.figure(figsize=(8,5))
plt.plot(K, inertia, 'bo-')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal Clusters')
plt.show()

# Choose optimal K (from the plot) and fit K-Means
optimal_k = 3  # Change based on elbow method result
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(scaled_data)

# Visualizing the clusters on the first two columns of the frame
plt.figure(figsize=(8,5))
sns.scatterplot(x=df.iloc[:, 0], y=df.iloc[:, 1], hue=df['Cluster'], palette='viridis')
plt.xlabel(df.columns[0])
plt.ylabel(df.columns[1])
plt.title('Clusters of Customers')
plt.show()

# Display cluster insights (per-cluster mean of every column)
print(df.groupby('Cluster').mean())


# 4.	Perform clustering analysis on the telecom dataset. The data is a mixture of both categorical and numerical data. It consists of the number of customers who churn. Derive insights and get possible information on factors that may affect the churn decision. Refer to Telco_customer_churn.xlsx dataset.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans

# Load the dataset
from google.colab import files
uploaded = files.upload()

# An empty result (no file selected) would make next(iter(...)) raise a bare
# StopIteration; fail with an actionable message instead.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the Telco churn workbook.")

# Read the dataset (the first uploaded entry is read back by its file name)
df = pd.read_excel(next(iter(uploaded)))

# Display basic information
df.info()  # info() prints its own report and returns None; print() would add a stray "None"
print(df.head())

# Data Preprocessing
# Drop rows with missing values, then exact duplicate rows. A new frame is
# produced instead of mutating in place.
df = df.dropna().drop_duplicates()

# Identifier-like text columns (every value distinct, e.g. a customer ID) carry
# no grouping information; label-encoding them would only inject an arbitrary
# 0..n-1 ramp into the distance computation, so they are removed first.
categorical_cols = df.select_dtypes(include=['object']).columns
id_like_cols = [col for col in categorical_cols
                if len(df) > 1 and df[col].nunique() == len(df)]
if id_like_cols:
    print("Dropping identifier-like columns:", id_like_cols)
    df = df.drop(columns=id_like_cols)

# Encoding categorical variables
categorical_cols = df.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in categorical_cols:
    # astype(str) keeps the encoder from failing on object columns that mix
    # strings with numbers; pure-string columns encode exactly as before.
    df[col] = le.fit_transform(df[col].astype(str))

# Standardizing the data (done before the 'Cluster' column is added, so the
# label is never part of the feature matrix)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)

# Finding the optimal number of clusters using Elbow method
inertia = []
K = range(1, 11)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(scaled_data)
    inertia.append(kmeans.inertia_)

# Plot the Elbow method
plt.figure(figsize=(8,5))
plt.plot(K, inertia, 'bo-')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal Clusters')
plt.show()

# Choose optimal K (from the plot) and fit K-Means
optimal_k = 3  # Change based on elbow method result
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(scaled_data)

# Visualizing the clusters (for first two features)
plt.figure(figsize=(8,5))
sns.scatterplot(x=df.iloc[:, 0], y=df.iloc[:, 1], hue=df['Cluster'], palette='viridis')
plt.xlabel(df.columns[0])
plt.ylabel(df.columns[1])
plt.title('Clusters of Customers')
plt.show()

# Display cluster insights (per-cluster mean of every encoded column)
print(df.groupby('Cluster').mean())


# 5.	Perform clustering on mixed data. Convert the categorical variables to numeric by using dummies or label encoding and perform normalization techniques. The dataset has the details of customers related to their auto insurance. Refer to Autoinsurance.csv dataset.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans

# Load the dataset
from google.colab import files
uploaded = files.upload()

# An empty result (no file selected) would make next(iter(...)) raise a bare
# StopIteration; fail with an actionable message instead.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the auto insurance CSV.")

# Read the dataset (the first uploaded entry is read back by its file name)
df = pd.read_csv(next(iter(uploaded)))

# Display basic information
df.info()  # info() prints its own report and returns None; print() would add a stray "None"
print(df.head())

# Data Preprocessing
# Drop rows with missing values, then exact duplicate rows. A new frame is
# produced instead of mutating in place.
df = df.dropna().drop_duplicates()

# Identifier-like text columns (every value distinct, e.g. a customer ID) carry
# no grouping information; label-encoding them would only inject an arbitrary
# 0..n-1 ramp into the distance computation, so they are removed first.
categorical_cols = df.select_dtypes(include=['object']).columns
id_like_cols = [col for col in categorical_cols
                if len(df) > 1 and df[col].nunique() == len(df)]
if id_like_cols:
    print("Dropping identifier-like columns:", id_like_cols)
    df = df.drop(columns=id_like_cols)

# Encoding categorical variables
categorical_cols = df.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in categorical_cols:
    # astype(str) keeps the encoder from failing on object columns that mix
    # strings with numbers; pure-string columns encode exactly as before.
    df[col] = le.fit_transform(df[col].astype(str))

# Standardizing the data (done before the 'Cluster' column is added, so the
# label is never part of the feature matrix)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)

# Finding the optimal number of clusters using Elbow method
inertia = []
K = range(1, 11)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(scaled_data)
    inertia.append(kmeans.inertia_)

# Plot the Elbow method
plt.figure(figsize=(8,5))
plt.plot(K, inertia, 'bo-')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal Clusters')
plt.show()

# Choose optimal K (from the plot) and fit K-Means
optimal_k = 3  # Change based on elbow method result
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df['Cluster'] = kmeans.fit_predict(scaled_data)

# Visualizing the clusters (for first two features)
plt.figure(figsize=(8,5))
sns.scatterplot(x=df.iloc[:, 0], y=df.iloc[:, 1], hue=df['Cluster'], palette='viridis')
plt.xlabel(df.columns[0])
plt.ylabel(df.columns[1])
plt.title('Clusters of Customers')
plt.show()

# Display cluster insights (per-cluster mean of every encoded column)
print(df.groupby('Cluster').mean())


# Topic: Dimension Reduction With PCA

# Problem Statement: -
Perform hierarchical and K-means clustering on the dataset. After that, perform PCA on
the dataset and extract the first 3 principal components and make a new dataset with
these 3 principal components as the columns. Now, on this new dataset, perform
hierarchical and K-means clustering. Compare the results of clustering on the original
dataset and clustering on the principal components dataset (use the scree plot
technique to obtain the optimum number of clusters in K-means clustering and check if
you’re getting similar results with and without PCA).

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage

# Load the dataset
from google.colab import files
uploaded = files.upload()

# An empty result (no file selected) would make next(iter(...)) raise a bare
# StopIteration; fail with an actionable message instead.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the dataset CSV.")

# Read the dataset (the first uploaded entry is read back by its file name)
df = pd.read_csv(next(iter(uploaded)))

# Display basic information
df.info()  # info() prints its own report and returns None; print() would add a stray "None"
print(df.head())

# Data Preprocessing
# Drop rows with missing values, then exact duplicate rows. A new frame is
# produced instead of mutating in place.
df = df.dropna().drop_duplicates()

# Standardizing the data (done before any cluster-label column is added to
# `df`, so labels never become features)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)

# Finding the optimal number of clusters using Elbow method (for K-Means)
inertia = []
K = range(1, 11)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(scaled_data)
    inertia.append(kmeans.inertia_)

# Plot the Elbow method
plt.figure(figsize=(8,5))
plt.plot(K, inertia, 'bo-')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal Clusters (K-Means)')
plt.show()

# Apply K-Means Clustering
optimal_k = 3  # Change based on elbow method result
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df['KMeans_Cluster'] = kmeans.fit_predict(scaled_data)

# Apply Hierarchical Clustering
plt.figure(figsize=(10, 5))
dendrogram(linkage(scaled_data, method='ward'))
plt.title('Dendrogram for Hierarchical Clustering')
plt.xlabel('Samples')
plt.ylabel('Distance')
plt.show()

# Fit Agglomerative Clustering
hierarchical = AgglomerativeClustering(n_clusters=optimal_k)
df['Hierarchical_Cluster'] = hierarchical.fit_predict(scaled_data)

# Perform PCA (Principal Component Analysis)
pca = PCA(n_components=3)
principal_components = pca.fit_transform(scaled_data)

# Create a new DataFrame with principal components.
# `pc_cols` names the feature columns so every model below is fitted on the
# three components only. Label columns are appended to `pca_df` further down;
# fitting on the whole frame after that point would feed the K-Means labels
# into the hierarchical clustering as a fourth feature.
pc_cols = ['PC1', 'PC2', 'PC3']
pca_df = pd.DataFrame(data=principal_components, columns=pc_cols)

# Finding the optimal number of clusters for PCA data
inertia_pca = []
for k in K:
    kmeans_pca = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans_pca.fit(pca_df[pc_cols])
    inertia_pca.append(kmeans_pca.inertia_)

# Plot the Elbow method for PCA data
plt.figure(figsize=(8,5))
plt.plot(K, inertia_pca, 'bo-')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for PCA Data (K-Means)')
plt.show()

# Apply K-Means Clustering on PCA Data
kmeans_pca = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
pca_df['KMeans_Cluster_PCA'] = kmeans_pca.fit_predict(pca_df[pc_cols])

# Apply Hierarchical Clustering on PCA Data (components only, no label column)
plt.figure(figsize=(10, 5))
dendrogram(linkage(pca_df[pc_cols], method='ward'))
plt.title('Dendrogram for PCA Data (Hierarchical Clustering)')
plt.xlabel('Samples')
plt.ylabel('Distance')
plt.show()

# Fit Agglomerative Clustering on PCA Data (components only, no label column)
hierarchical_pca = AgglomerativeClustering(n_clusters=optimal_k)
pca_df['Hierarchical_Cluster_PCA'] = hierarchical_pca.fit_predict(pca_df[pc_cols])

# Compare Clustering Results Before and After PCA
print("Original Data Clusters (K-Means):")
print(df.groupby('KMeans_Cluster').mean())

print("\nPCA Data Clusters (K-Means):")
print(pca_df.groupby('KMeans_Cluster_PCA').mean())

# Row-by-row agreement between the two K-Means labelings. Arrays are used
# because `df` keeps its original row labels after dropna/drop_duplicates
# while `pca_df` has a fresh 0..n-1 index. Cluster ids are arbitrary, so look
# for one dominant cell per row rather than a matching diagonal.
print("\nK-Means cluster agreement (rows: original data, columns: PCA data):")
print(pd.crosstab(df['KMeans_Cluster'].to_numpy(),
                  pca_df['KMeans_Cluster_PCA'].to_numpy(),
                  rownames=['Original'], colnames=['PCA']))


## Problem Statement: -
A pharmaceuticals manufacturing company is conducting a study on a new medicine to treat
heart diseases. The company has gathered data from its secondary sources and would like you
to provide high level analytical insights on the data. Its aim is to segregate patients depending
on their age group and other factors given in the data. Perform PCA and clustering algorithms on
the dataset and check if the clusters formed before and after PCA are the same and provide a
brief report on your model. You can also explore more ways to improve your model.
Note: This is just a snapshot of the data. The datasets can be downloaded from AiSpry LMS in
the Hands-On Material section.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage

# Load the dataset
from google.colab import files
uploaded = files.upload()

# An empty result (no file selected) would make next(iter(...)) raise a bare
# StopIteration; fail with an actionable message instead.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the heart disease CSV.")

# Read the dataset (the first uploaded entry is read back by its file name)
df = pd.read_csv(next(iter(uploaded)))

# Display basic information
df.info()  # info() prints its own report and returns None; print() would add a stray "None"
print(df.head())

# Data Preprocessing
# Drop rows with missing values, then exact duplicate rows. A new frame is
# produced instead of mutating in place.
df = df.dropna().drop_duplicates()

# Encoding categorical variables if present
categorical_cols = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Standardizing the data (done before any cluster-label column is added to
# `df`, so labels never become features)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df)

# Finding the optimal number of clusters using Elbow method (for K-Means)
inertia = []
K = range(1, 11)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(scaled_data)
    inertia.append(kmeans.inertia_)

# Plot the Elbow method
plt.figure(figsize=(8,5))
plt.plot(K, inertia, 'bo-')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal Clusters (K-Means)')
plt.show()

# Apply K-Means Clustering
optimal_k = 3  # Change based on elbow method result
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df['KMeans_Cluster'] = kmeans.fit_predict(scaled_data)

# Apply Hierarchical Clustering
plt.figure(figsize=(10, 5))
dendrogram(linkage(scaled_data, method='ward'))
plt.title('Dendrogram for Hierarchical Clustering')
plt.xlabel('Samples')
plt.ylabel('Distance')
plt.show()

# Fit Agglomerative Clustering
hierarchical = AgglomerativeClustering(n_clusters=optimal_k)
df['Hierarchical_Cluster'] = hierarchical.fit_predict(scaled_data)

# Perform PCA (Principal Component Analysis)
pca = PCA(n_components=3)
principal_components = pca.fit_transform(scaled_data)

# Create a new DataFrame with principal components.
# `pc_cols` names the feature columns so every model below is fitted on the
# three components only. Label columns are appended to `pca_df` further down;
# fitting on the whole frame after that point would feed the K-Means labels
# into the hierarchical clustering as a fourth feature.
pc_cols = ['PC1', 'PC2', 'PC3']
pca_df = pd.DataFrame(data=principal_components, columns=pc_cols)

# Finding the optimal number of clusters for PCA data
inertia_pca = []
for k in K:
    kmeans_pca = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans_pca.fit(pca_df[pc_cols])
    inertia_pca.append(kmeans_pca.inertia_)

# Plot the Elbow method for PCA data
plt.figure(figsize=(8,5))
plt.plot(K, inertia_pca, 'bo-')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for PCA Data (K-Means)')
plt.show()

# Apply K-Means Clustering on PCA Data
kmeans_pca = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
pca_df['KMeans_Cluster_PCA'] = kmeans_pca.fit_predict(pca_df[pc_cols])

# Apply Hierarchical Clustering on PCA Data (components only, no label column)
plt.figure(figsize=(10, 5))
dendrogram(linkage(pca_df[pc_cols], method='ward'))
plt.title('Dendrogram for PCA Data (Hierarchical Clustering)')
plt.xlabel('Samples')
plt.ylabel('Distance')
plt.show()

# Fit Agglomerative Clustering on PCA Data (components only, no label column)
hierarchical_pca = AgglomerativeClustering(n_clusters=optimal_k)
pca_df['Hierarchical_Cluster_PCA'] = hierarchical_pca.fit_predict(pca_df[pc_cols])

# Compare Clustering Results Before and After PCA
print("Original Data Clusters (K-Means):")
print(df.groupby('KMeans_Cluster').mean())

print("\nPCA Data Clusters (K-Means):")
print(pca_df.groupby('KMeans_Cluster_PCA').mean())

# Row-by-row agreement between the two K-Means labelings. Arrays are used
# because `df` keeps its original row labels after dropna/drop_duplicates
# while `pca_df` has a fresh 0..n-1 index. Cluster ids are arbitrary, so look
# for one dominant cell per row rather than a matching diagonal.
print("\nK-Means cluster agreement (rows: original data, columns: PCA data):")
print(pd.crosstab(df['KMeans_Cluster'].to_numpy(),
                  pca_df['KMeans_Cluster_PCA'].to_numpy(),
                  rownames=['Original'], colnames=['PCA']))
