In [None]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import hvplot.pandas
import seaborn as sns
import matplotlib.pyplot as plt
import streamlit as st
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import make_pipeline
from sqlalchemy import create_engine, MetaData

# Load dataframe and reassess 

In [None]:
# Database setup: open the SQLite file that holds the coffee reviews
engine = create_engine("sqlite:///Coding/Data_Engineering.db")

# Reflect the existing schema and pick out the coffee_data table
metadata = MetaData()
metadata.reflect(bind=engine)
coffee_data = metadata.tables['coffee_data']

# Fetch every row. The context manager closes the connection even when the
# query raises, which the manual connect()/close() pair did not guarantee.
with engine.connect() as conn:
    result = conn.execute(coffee_data.select()).fetchall()

# Convert each row to a dictionary keyed by column name
data = [dict(row._asdict()) for row in result]

# Create a DataFrame with one row per coffee
coffee_data_df = pd.DataFrame(data)

In [None]:
# Check the table has been loaded by displaying the resulting DataFrame
coffee_data_df

## Remove null values for modelling and training 

In [None]:
# Select the five sensory rating columns used as clustering features.
# NOTE(review): no target is created and no rows are dropped in this cell,
# so any nulls in these columns are still present afterwards.
coffee_data_df_features = coffee_data_df[['aroma','body','flavor','acid','aftertaste']]

# Elbow Curve review 

In [None]:
# Feature columns for clustering; this list also fixes the column order
# that the scaler, PCA and KMeans models are fitted with further down.
features = ['aroma', 'flavor', 'acid', 'body', 'aftertaste'] 

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per feature,
# transposed so that each feature is a row
summary_stats = coffee_data_df_features[features].describe().T

# Display the summary statistics
print(summary_stats)

- <b> Aroma: </b> Interpretation: The average aroma rating is around 8.47, with a moderate level of variability (standard deviation of 0.70). Ratings range from 4.0 to 10.0, with the majority falling between 8.0 and 9.0.
- <b> Flavor: </b>  Interpretation: The average flavor rating is around 8.62, with a moderate level of variability (standard deviation of 0.73). Ratings range from 1.0 to 10.0, with the majority falling between 8.0 and 9.0.
- <b> Acid: </b> Interpretation: The average acidity rating is around 7.97, with a moderate level of variability (standard deviation of 0.71). Ratings range from 3.0 to 10.0, with the majority of ratings at 8.0.
- <b> Body: </b> Interpretation: The average body rating is around 8.16, with a relatively low level of variability (standard deviation of 0.63). Ratings range from 6.0 to 10.0, with the majority falling between 8.0 and 9.0.
- <b> Aftertaste: </b> Interpretation: The average aftertaste rating is around 7.97, with a moderate level of variability (standard deviation of 0.71). Ratings range from 3.0 to 10.0, with the majority of ratings at 8.0.

These consistently high scores are likely because the overall ratings in the dataset are all over 60, which means these coffees are probably on the higher end of coffee ratings.

In [None]:
# Create the StandardScaler instance (fitted in the next cell)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
# Fit the scaler on the full feature set (there is no train/test split here)
# and standardise the features in the same step
coffee_data_scaled =scaler.fit_transform(coffee_data_df_features[features])

In [None]:
# Candidate cluster counts and the inertia measured for each of them
inertia = []
k_values = list(range(1, 11))

# Fit one KMeans model per k on the scaled features and record its inertia
for k in k_values:
    k_model = KMeans(n_clusters=k, n_init=10, random_state=1)
    k_model.fit(coffee_data_scaled)
    inertia.append(k_model.inertia_)

# Plot the Elbow Curve using the explicit figure/axes interface
fig, ax = plt.subplots()
ax.plot(k_values, inertia, marker='o')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia')
ax.set_title('Elbow Curve')
plt.show()


# PCA Application 

In [None]:
# Project the scaled features onto their principal components.
# n_components=5 equals the number of input features, so no dimension is
# actually dropped here; the data is only re-expressed in component space.
pca = PCA(n_components=5)
coffee_data_pca = pca.fit_transform(coffee_data_scaled)

In [None]:
# Share of the total variance captured by each principal component
explained_variance_ratio = pca.explained_variance_ratio_

# Print the explained variance ratio for each component
print("Explained Variance Ratio for Each Component:")
print(explained_variance_ratio)

### Summary: 
- <b> Component 1 (60.95%): </b> This component explains a significant portion of the dataset's variance, suggesting that it captures the most critical patterns or features in the original data. It can be considered the most dominant factor in the reduced-dimensional representation.

- <b> Component 2 (20.51%): </b> While not as influential as the first component, the second component still contributes substantially to the overall variance. It captures additional patterns that are orthogonal to those captured by the first component.

- <b> Component 3 (11.98%): </b> This component contributes less to the overall variance but still captures unique patterns in the data that were not accounted for by the first two components.

- <b> Component 4 (6.51%): </b> This component explains a smaller portion of the variance, capturing patterns that are less significant in the overall structure of the data.

- <b> Component 5 (0.60%): </b> The fifth component explains a very small amount of the total variance. In some cases, such a component may be considered noise or might not contribute significantly to the understanding of the dataset.

In [None]:
# Running total of explained variance across the ordered components
cumulative_explained_variance = np.cumsum(explained_variance_ratio)

# Print the cumulative explained variance
print("Cumulative Explained Variance:")
print(cumulative_explained_variance)


In [None]:
# Plot the cumulative explained variance against the number of components.
# The running total is recomputed so this cell does not depend on the
# previous one having been run.
cumulative_explained_variance = np.cumsum(explained_variance_ratio)
component_counts = range(1, len(cumulative_explained_variance) + 1)

fig, ax = plt.subplots()
ax.plot(component_counts, cumulative_explained_variance, marker='o')
ax.set_xlabel('Number of Principal Components')
ax.set_ylabel('Cumulative Explained Variance')
ax.set_title('Cumulative Explained Variance vs. Number of Components')
plt.show()

This showed that the first three components capture approximately 93.44% (60.95% + 20.51% + 11.98%) of the total variance which is a generally satisfactory amount for a dataset this small.

In [None]:
# Create the PCA DataFrame.
# Each principal component is a linear combination of all five scaled
# features, so the columns are labelled PC1..PC5 rather than with the raw
# feature names, which would wrongly suggest a one-to-one mapping.
pca_column_names = [f"PC{i}" for i in range(1, coffee_data_pca.shape[1] + 1)]
coffee_pca_df = pd.DataFrame(
    coffee_data_pca,
    columns=pca_column_names
)

# Review the PCA DataFrame
coffee_pca_df.head()

In [None]:
# Inertia per candidate k, this time measured on the PCA-transformed data.
# `k` is kept as a module-level list because the plotting cell below reuses
# it for the x-axis ticks.
inertia = []
k = list(range(1, 11))

# Fit one KMeans model per candidate k and record its `inertia_`
for i in k:
    k_model = KMeans(n_clusters=i, n_init=10, random_state=1)  # n_init set explicitly
    k_model.fit(coffee_pca_df)
    inertia.append(k_model.inertia_)

# Collect k and the corresponding inertia into a DataFrame for plotting
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)

In [None]:
# Review the first rows of the k / inertia table
print("PCA Elbow Curve DataFrame:")
print(df_elbow.head())

In [None]:
# Plot the PCA Elbow Curve as an interactive hvplot line chart,
# with one x-axis tick per candidate k
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title=" PCA Elbow Curve", 
    xticks=k
)

In [None]:
# Cluster the scaled features into three segments and store the label of
# each coffee on the main DataFrame.
# n_init is set explicitly, as in every other KMeans call in this notebook,
# so the number of restarts does not depend on the installed scikit-learn
# version's default.
kmeans_model = KMeans(n_clusters=3, n_init=10, random_state=42)
coffee_data_df['coffee_segments'] = kmeans_model.fit_predict(coffee_data_scaled)

In [None]:
# Three-cluster KMeans model trained on the PCA components
model = KMeans(n_clusters=3, n_init=10, random_state=0)
model.fit(coffee_pca_df)

# Cluster label for every coffee
k_3 = model.predict(coffee_pca_df)

# New DataFrame: the PCA components plus the cluster label of each row
# (assign returns a copy, so coffee_pca_df itself is left untouched)
coffee_info_pca_predictions_df = coffee_pca_df.assign(coffee_segments=k_3)

In [None]:
# Pair plot of the PCA components: one scatter plot per pair of components,
# coloured by the KMeans segment each coffee was assigned to
sns.pairplot(
    coffee_info_pca_predictions_df,
    hue='coffee_segments',  # color by clusters
    palette='viridis',      # colormap
    diag_kind='kde',         # use kernel density estimates on the diagonal
    height=2.5
)

plt.show()

In [None]:
# Save the models. The directory is created first so the cell also works on
# a fresh checkout where `models/` does not exist yet.
MODEL_DIR = Path("models")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

joblib.dump(model, MODEL_DIR / "kmeans_model.joblib")
joblib.dump(pca, MODEL_DIR / "pca_model.joblib")
# The PCA and KMeans models were fitted on standardised features, so the
# fitted scaler is persisted too; without it new inputs cannot be put on the
# same scale at prediction time.
joblib.dump(scaler, MODEL_DIR / "scaler.joblib")

# Load the persisted models back, as a downstream consumer would
loaded_model = joblib.load(MODEL_DIR / "kmeans_model.joblib")
loaded_pca = joblib.load(MODEL_DIR / "pca_model.joblib")
loaded_scaler = joblib.load(MODEL_DIR / "scaler.joblib")

## Encode target and include one-hot encoding into dataframe 

In [None]:
# Label-encode the rating column into an integer target
from sklearn.preprocessing import LabelEncoder 

# Encode the 'rating' column and store the codes in a new 'target' column
label_encoder = LabelEncoder()
coffee_data_df['target'] = label_encoder.fit_transform(coffee_data_df['rating'])

# Drop the original 'rating' column. The drop is in place, so re-running this
# cell without reloading the data fails on the 'rating' lookup above.
coffee_data_df.drop('rating', axis=1, inplace=True)
coffee_data_df.head()

In [None]:
# Move 'target' to the front for ease of access: pop removes the column
# from the DataFrame in place and hands it back ...
target_column = coffee_data_df.pop('target')

# ... and insert puts it back as the first column
coffee_data_df.insert(0, 'target', target_column)

In [None]:
# Select the five rating columns to experiment with one-hot encoding
coffee_test = coffee_data_df[['aroma','body','flavor','acid','aftertaste']]

# Disabled: null removal is currently not applied
# # Remove null values  
# coffee_data_df = coffee_data_df.dropna()

In [None]:
# Rating columns of coffee_test that are treated as categorical here
categorical_columns = ['aroma', 'body', 'flavor', 'acid', 'aftertaste']

# One-hot encode them: get_dummies creates one 0/1 integer column per
# distinct value found in each listed column
coffee_test_encoded = pd.get_dummies(coffee_test[categorical_columns], columns=categorical_columns, dtype=int)

In [None]:
# Append the one-hot encoded columns next to the original rating columns.
# This rebinds coffee_test, so re-running the cell appends them again.
coffee_test = pd.concat([coffee_test, coffee_test_encoded], axis=1)

In [None]:
# Preview the ratings together with their one-hot encoded columns
coffee_test.head()

Generally, a one-hot encoded dataframe would be able to give us user preference information. Because this dataset is being used for a recommender system and contains no user input, there was no y variable that we could assess against. Therefore, the function test relied on the cosine_similarity function applied to the user input.

In future projects, user input and user ratings would be valuable information to collect.

## Attempt use of Vectorisation 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Replace missing descriptions with an empty string
# (this overwrites the desc_1 column of coffee_data_df)
coffee_data_df['desc_1'] = coffee_data_df['desc_1'].fillna('')

# Make sure every description is a string before vectorising
desc_1_text = coffee_data_df['desc_1'].astype(str)

# Bag-of-words count matrix for desc_1: one row per coffee, converted from
# the vectoriser's sparse output to a dense array
X_desc_1 = CountVectorizer().fit_transform(desc_1_text).toarray()

print("Matrix for desc_1:")
print(X_desc_1)


In [None]:
def calculate_cosine_similarity(matrix, row_idx1, row_idx2):
    """Return the cosine similarity between two rows of ``matrix``.

    Both rows are reshaped to ``(1, n_columns)`` because
    ``cosine_similarity`` expects 2-D inputs; the single entry of the
    resulting 1x1 matrix is returned.
    """
    first_row = matrix[row_idx1, :].reshape(1, -1)
    second_row = matrix[row_idx2, :].reshape(1, -1)
    return cosine_similarity(first_row, second_row)[0, 0]

# Compute every pairwise similarity in a single vectorised call instead of
# one scikit-learn call per pair. `cos_sim` is the full (n_coffees, n_coffees)
# similarity matrix, indexed by row position in X_desc_1.
cos_sim = cosine_similarity(X_desc_1)

# Walk the upper triangle (idx1 < idx2) so each unordered pair is reported
# once and a coffee is never compared with itself.
first_positions, second_positions = np.triu_indices(cos_sim.shape[0], k=1)
for idx1, idx2 in zip(first_positions, second_positions):
    print(f'Cosine Similarity between coffee_id {idx1 + 1} and coffee_id {idx2 + 1}: {cos_sim[idx1, idx2]}')
            


In [None]:
def find_top_similar_items(cosine_similarity_matrix, top_n=5):
    """Print the ``top_n`` most similar other items for every item.

    Parameters
    ----------
    cosine_similarity_matrix : array-like of shape (n_items, n_items)
        Pairwise similarity scores; entry ``[i, j]`` is the similarity
        between item ``i`` and item ``j``.
    top_n : int, default 5
        How many neighbours to list per item.

    Raises
    ------
    ValueError
        If the input is not a square 2-D matrix.
    """
    similarity = np.asarray(cosine_similarity_matrix)
    if similarity.ndim != 2 or similarity.shape[0] != similarity.shape[1]:
        raise ValueError(
            "cosine_similarity_matrix must be a square 2-D matrix, "
            f"got shape {similarity.shape}"
        )

    # Iterate through rows
    for i in range(similarity.shape[0]):
        # Rank all items by descending similarity, then drop the item itself:
        # its self-similarity would otherwise always take one of the slots.
        ranked = similarity[i].argsort()[::-1]
        neighbours = ranked[ranked != i][:top_n]

        # Print the results
        print(f'Top {top_n} most similar items to Document {i + 1}:')
        for j in neighbours:
            print(f'\tDocument {j + 1} - Similarity: {similarity[i, j]}')
            
# Find top 5 most similar items for each document.
# The function needs the full pairwise similarity matrix, so it is computed
# here from the description count matrix.
desc_similarity_matrix = cosine_similarity(X_desc_1)
find_top_similar_items(desc_similarity_matrix)

# Testing function 

In [None]:
# Generate test data: five synthetic users, each rating the five attributes
# with an integer from 1 to 10. A seeded generator keeps the test inputs
# (and therefore the recommendations below) identical on every run.
rng = np.random.default_rng(42)
test_rating_columns = ['aroma', 'body', 'flavor', 'acid', 'aftertaste']
coffee_test = pd.DataFrame({
    column: rng.integers(1, 11, 5) for column in test_rating_columns
})

In [None]:
# Pick one random test user; squeeze turns the single-row frame into a Series.
# NOTE(review): no random_state is passed, so the pick changes between runs,
# and this module-level test_rec is not referenced by the later cells visible
# here (recommend_coffees only uses a parameter of the same name).
test_rec = coffee_test.sample(1, replace=False).squeeze()

In [None]:
# Recommender function
def coffee_recommender(aroma, flavor, acid, body, aftertaste, model, pca, data, top_n=5):
    """Return the ``top_n`` coffees most similar to the given taste ratings.

    The ratings are projected with ``pca``, assigned to a cluster by
    ``model``, and the coffees of that cluster are ranked by cosine
    similarity to the input. Both sides of the similarity are compared in
    the projected space, so like is compared with like.

    Parameters
    ----------
    aroma, flavor, acid, body, aftertaste : number
        The user's rating for each attribute.
    model : fitted clusterer exposing ``predict``
    pca : fitted transformer exposing ``transform``
        It is applied directly to the ratings, so it must include any
        scaling the clusterer was trained with.
    data : pandas.DataFrame
        Must contain the five rating columns. Its ``coffee_segments``
        column is (over)written with each row's predicted cluster, which is
        a side effect visible to the caller.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``data`` from the matching cluster, sorted
        by descending ``similarity_factor`` (added as an extra column).
    """
    feature_columns = ['aroma', 'flavor', 'acid', 'body', 'aftertaste']

    # Project the user's ratings and find the cluster they fall in
    user_components = np.asarray(pca.transform([[aroma, flavor, acid, body, aftertaste]]))
    cluster_label = model.predict(user_components)[0]

    # Project every coffee the same way and label it with its cluster
    catalogue_components = np.asarray(pca.transform(data[feature_columns]))
    segments = model.predict(catalogue_components)
    data['coffee_segments'] = segments

    # Copy the matching rows so the new column is not written onto a slice
    in_cluster = segments == cluster_label
    recommended_coffees = data.loc[in_cluster].copy()

    if in_cluster.any():
        # One vectorised call; row 0 holds the user's similarity to each coffee
        similarities = cosine_similarity(user_components, catalogue_components[in_cluster])[0]
    else:
        # No coffee shares the user's cluster: return an empty, well-formed frame
        similarities = np.array([], dtype=float)
    recommended_coffees['similarity_factor'] = similarities

    return recommended_coffees.nlargest(top_n, 'similarity_factor')


In [None]:
# Function to return the recommended coffee
def recommend_coffees(test_rec, coffee_df, model, pca):
    """Print the recommendations for one user's ratings.

    ``test_rec`` supplies the five ratings by key; ``coffee_df``, ``model``
    and ``pca`` are passed straight through to ``coffee_recommender``. At
    most five recommendations are printed and nothing is returned.
    """
    aroma = test_rec['aroma']
    flavor = test_rec['flavor']
    acid = test_rec['acid']
    body = test_rec['body']
    aftertaste = test_rec['aftertaste']

    recommended_coffees = coffee_recommender(aroma, flavor, acid, body, aftertaste, model, pca, coffee_df)

    # Only these descriptive columns are shown, and only for the first five rows
    display_columns = ['name', 'roaster', 'roast', 'country_of_origin', 'desc_1', 'desc_2']
    top_five = recommended_coffees[display_columns].head(5)

    for rank, (_, coffee) in enumerate(top_five.iterrows(), start=1):
        print(f"Recommendation #{rank}")
        print(f"Name: {coffee['name']}")
        print(f"Roaster: {coffee['roaster']}")
        print(f"Roast: {coffee['roast']}")
        print(f"Country of Origin: {coffee['country_of_origin']}")
        print(f"Description 1: {coffee['desc_1']}")
        print(f"Description 2: {coffee['desc_2']}")
        print("\n")

# The PCA and KMeans models were fitted on standardised ratings, so the
# fitted scaler is chained in front of the PCA before it is handed to the
# recommender. Without this, raw 1-10 ratings are projected as if they were
# already standardised and land in the wrong cluster.
scaled_pca = make_pipeline(scaler, loaded_pca)

# Call the function with the test data
for index, row in coffee_test.iterrows():
    print(f"\nUser Input: {row.to_dict()}")
    recommend_coffees(row, coffee_data_df, loaded_model, scaled_pca)