In [None]:
# Import dependencies.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# Import checkpoint dependencies
import os
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
# Load the prices table from the resources folder and preview its first rows.
prices_csv = Path('../resources/prices_sorted_type_geo_yr_wk.csv')
prices_df = pd.read_csv(prices_csv)
prices_df.head()

In [None]:
# Report the (rows, columns) size of the raw prices table.
print(prices_df.shape)

In [None]:
# List the column labels available in the raw data.
prices_df.columns

# Pre-Processing

In [None]:
# finding null values
def findNull(values):
    """Print the shape of ``values`` and then every non-finite entry.

    Each entry that fails ``np.isfinite`` (NaN, +inf, -inf) is printed as
    ``<position> <value>``, where position is its 0-based iteration index.

    Parameters
    ----------
    values
        Object exposing ``.shape`` and iterable over numeric scalars.
    """
    print(values.shape)
    for position, item in enumerate(values):
        if np.isfinite(item):
            continue
        print(position, item)

In [None]:
# Report how many missing values each column holds.
null_counts = prices_df.isnull().sum()
for col, n_missing in null_counts.items():
    print(f"Column {col} has {n_missing} null values")
    

In [None]:
# Non-null count per column, before rows with missing values are dropped.
prices_df.count()

In [None]:
# Element-wise boolean mask: True wherever a value is missing.
prices_df.isnull()

In [None]:
# Discard every row that contains at least one missing value (rebinds prices_df).
prices_df = prices_df.dropna()

In [None]:
# Scatter every column of the cleaned frame against price_inc, one figure per column.
price_inc_series = prices_df['price_inc']
for col in prices_df.columns:
    axes = plt.gca()
    axes.scatter(prices_df[col], price_inc_series)
    axes.set_xlabel(col)
    axes.set_ylabel('price_inc')
    plt.show()

In [None]:
# Non-null count per column, re-checked after rows with missing values were dropped.
prices_df.count()

## Duplicate check

In [None]:
# Count fully duplicated rows in the cleaned frame.
#
# Parked idea (not executed): bucket avg_price into four categories for "NB"
# (presumably Naive Bayes - confirm), e.g.
#   pre_prices_df = prices_df.copy()
#   price_bins  = [0.44, 1.14, 1.37, 1.63, 3.17]
#   group_names = ["0", "1", "2", "3" ]
#   prices_df["price_cat"] = pd.cut(prices_df['avg_price'], price_bins, labels=group_names)
# Intended categories:
#   0 -> 0.44 <= price < 1.14
#   1 -> 1.14 <= price < 1.37
#   2 -> 1.37 <= price < 1.63
#   3 -> price >= 1.63
# Ratio features that were planned:
#   4046_units/total_volume  4225_units/total_volume  4770_units/total_volume
#   s_bags/total_bags  l_bags/total_bags  xl_bags/total_bags

print(f"Duplicated entries {prices_df.duplicated().sum()}")

prices_df.head()

## Transforming string cols

In [None]:
# type con organic
def change_type_string(str):
    """Encode a product-type label as an integer flag.

    Parameters
    ----------
    str
        The raw label (``"conventional"`` or anything else), or an
        already-encoded flag (0 or 1). The parameter name shadows the
        built-in ``str``; it is kept so existing callers keep working.

    Returns
    -------
    int
        1 for ``"conventional"``, 0 for any other label. A value that is
        already 0 or 1 is returned unchanged.
    """
    # Pass already-encoded flags through so that applying the encoder a second
    # time (e.g. re-running the notebook cell) does not turn every 1 into 0.
    if str in (0, 1):
        return str
    return 1 if str == "conventional" else 0
    
# Replace the 'type' labels with the value change_type_string returns for each element.
prices_df['type'] = prices_df['type'].apply(change_type_string)

prices_df.head()

## Creating ratios 

In [None]:
# Each *_units column and total_bags expressed as a fraction of total_volume.
for ratio_col, source_col in (
    ("4046_ratio_units", "4046_units"),
    ("4225_ratio_units", "4225_units"),
    ("4770_ratio_units", "4770_units"),
    ("total_ratio_bags", "total_bags"),
):
    prices_df[ratio_col] = prices_df[source_col] / prices_df['total_volume']

# Each bag-size column expressed as a fraction of total_bags.
for ratio_col, source_col in (
    ("s_ratio_bags", "s_bags"),
    ("l_ratio_bags", "l_bags"),
    ("xl_ratio_bags", "xl_bags"),
):
    prices_df[ratio_col] = prices_df[source_col] / prices_df['total_bags']


prices_df.head()

## Creating percents 

In [None]:
# Same shares as the ratio columns, expressed in percent.
# The multiplication by 100 happens before the division, as in 100*a/b.
for percent_col, source_col in (
    ("4046_per_units", "4046_units"),
    ("4225_per_units", "4225_units"),
    ("4770_per_units", "4770_units"),
    ("total_per_bags", "total_bags"),
):
    prices_df[percent_col] = 100 * prices_df[source_col] / prices_df['total_volume']

for percent_col, source_col in (
    ("s_per_bags", "s_bags"),
    ("l_per_bags", "l_bags"),
    ("xl_per_bags", "xl_bags"),
):
    prices_df[percent_col] = 100 * prices_df[source_col] / prices_df['total_bags']

prices_df.head()

## Create dummy variables for text features

In [None]:
# One-hot encode the listed columns; all other columns are carried over unchanged.
# NOTE(review): 'date' is encoded too, which yields one indicator column per
# distinct date - confirm this is intended.
prices_ml_df = pd.get_dummies(data=prices_df, columns=['year_month', 'geography', 'date', 'type'])
prices_ml_df.head()

## Dropping cols

In [None]:
# Second model frame: drop only 'timeframe' from prices_df (prices_df itself is
# left untouched), then one-hot encode the same categorical columns.
categorical_cols = ['year_month', 'geography', 'date', 'type']
prices_nd_df = prices_df.drop(columns=['timeframe'])
prices_ml_nd_df = pd.get_dummies(prices_nd_df, columns=categorical_cols)


prices_ml_nd_df.head()

In [None]:
# Remove 'timeframe' and the raw volume / bag counts from the model frame.
columns_to_drop = [
    'timeframe', 'total_volume',
    '4046_units', '4225_units', '4770_units',
    'total_bags', 's_bags', 'l_bags', 'xl_bags',
]
prices_ml_df = prices_ml_df.drop(columns=columns_to_drop)
prices_ml_df.head()

In [None]:
# Non-null count per column, now including the derived ratio and percent columns.
prices_df.count()

In [None]:
# Drop rows whose derived ratio/percent columns are NaN (a 0/0 division yields NaN).
prices_df = prices_df.dropna()
# The model frames were derived from prices_df before this cleanup, so they
# still hold those rows. Filter them as well; otherwise KMeans and PCA, which
# reject NaN input, would receive them.
prices_ml_df = prices_ml_df.dropna()
prices_ml_nd_df = prices_ml_nd_df.dropna()
prices_df.count()

In [None]:
# Column labels of the model frame after encoding and column removal.
prices_ml_df.columns

In [None]:
# Report the (rows, columns) size of the model frame.
print(prices_ml_df.shape)

In [None]:
# Keep an independent snapshot of the model frame; a later cell adds a "class"
# column to prices_ml_df, and the clustering experiments start from this copy.
prices_ml_df_copy = prices_ml_df.copy()
prices_ml_df.head()

In [None]:
# Scatter every model-frame column against price_inc, one figure per column.
price_inc_values = prices_ml_df['price_inc']

for col in prices_ml_df.columns:
    feature_values = prices_ml_df[col]
    plt.scatter(feature_values, price_inc_values)
    plt.ylabel('price_inc')
    plt.xlabel(col)
    plt.show()

## K Means 

In [None]:
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans

In [None]:
# Initialise K-Means with K = 2 (the author expects two groups: price increase
# or not). random_state=5 fixes the seed used for centroid initialisation.
model = KMeans(n_clusters=2, random_state=5)
model

In [None]:
# Fit K-Means using every column of prices_ml_df as a feature.
model.fit(prices_ml_df)

In [None]:
# Assign each row to its closest fitted centroid and print the label array.
predictions = model.predict(prices_ml_df)
print(predictions)

In [None]:
# Store each row's cluster label in a new "class" column (mutates prices_ml_df).
prices_ml_df["class"] = model.labels_
prices_ml_df.head()

## Visualize the Results

In [None]:
import plotly.express as px
import hvplot.pandas

# Scatter of two unit-ratio features from prices_ml_df, grouped by cluster label.
prices_ml_df.hvplot.scatter(x="4046_ratio_units", y="4225_ratio_units", by="class")



In [None]:
# 3D scatter of the three unit-ratio features; colour and symbol encode the
# cluster label, marker size encodes avg_price.
fig = px.scatter_3d(
    prices_ml_df,
    x="4046_ratio_units",
    y="4225_ratio_units",
    z="4770_ratio_units",
    color="class",
    symbol="class",
    size="avg_price",
    width=800,
)
fig.update_layout(legend={"x": 0, "y": 1})
fig.show()

In [None]:
# Plot a different feature pair; no `by` argument, so points are not grouped by cluster.
prices_ml_df.hvplot.scatter(x="4770_ratio_units", y="total_ratio_bags")

In [None]:
# Cluster a dataset in place (plotting is left to the caller).
def test_cluster_amount(df, clusters):
    """Fit K-Means on ``df`` and write the labels back into it.

    Parameters
    ----------
    df
        Frame whose columns are all used as features. It is modified in
        place: a ``"class"`` column with each row's cluster label is added.
    clusters
        Number of clusters passed to ``KMeans`` as ``n_clusters``.

    Returns
    -------
    None
    """
    kmeans = KMeans(n_clusters=clusters, random_state=5).fit(df)
    df["class"] = kmeans.labels_
    

In [None]:
# Cluster a fresh copy of the feature snapshot with K = 2 and plot two features by cluster.
df = prices_ml_df_copy.copy()
test_cluster_amount(df, 2)
df.hvplot.scatter(x="4770_ratio_units", y="total_ratio_bags", by="class")

In [None]:
# 3D view of the clustering currently stored in `df`: two ratio features against avg_price.
fig = px.scatter_3d(df, x="4770_ratio_units", y="total_ratio_bags", z="avg_price",
                    color="class", symbol="class", width=800)
# Place the legend at x=0, y=1.
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [None]:
# Cluster a fresh copy of the feature snapshot with K = 3 and plot two features by cluster.
df = prices_ml_df_copy.copy()
test_cluster_amount(df, 3)
df.hvplot.scatter(x="4770_ratio_units", y="total_ratio_bags", by="class")

In [None]:
# 3D view of the clustering currently stored in `df`: two ratio features against avg_price.
fig = px.scatter_3d(df, x="4770_ratio_units", y="total_ratio_bags", z="avg_price",
                    color="class", symbol="class", width=800)
# Place the legend at x=0, y=1.
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [None]:
# Cluster a fresh copy of the feature snapshot with K = 4 and plot two features by cluster.
df = prices_ml_df_copy.copy()
test_cluster_amount(df, 4)
df.hvplot.scatter(x="4770_ratio_units", y="total_ratio_bags", by="class")

In [None]:
# 3D view of the clustering currently stored in `df`: two ratio features against avg_price.
fig = px.scatter_3d(df, x="4770_ratio_units", y="total_ratio_bags", z="avg_price",
                    color="class", symbol="class", width=800)
# Place the legend at x=0, y=1.
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [None]:
# Cluster a fresh copy of the feature snapshot with K = 5 and plot two features by cluster.
df = prices_ml_df_copy.copy()
test_cluster_amount(df, 5)
df.hvplot.scatter(x="4770_ratio_units", y="total_ratio_bags", by="class")

In [None]:
# 3D view of the clustering currently stored in `df`: two ratio features against avg_price.
fig = px.scatter_3d(df, x="4770_ratio_units", y="total_ratio_bags", z="avg_price",
                    color="class", symbol="class", width=800)
# Place the legend at x=0, y=1.
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [None]:
# Cluster a fresh copy of the feature snapshot with K = 6 and plot two features by cluster.
df = prices_ml_df_copy.copy()
test_cluster_amount(df, 6)
df.hvplot.scatter(x="4770_ratio_units", y="total_ratio_bags", by="class")

In [None]:
# 3D view of the clustering currently stored in `df`: two ratio features against avg_price.
fig = px.scatter_3d(df, x="4770_ratio_units", y="total_ratio_bags", z="avg_price",
                    color="class", symbol="class", width=800)
# Place the legend at x=0, y=1.
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

## Elbow plots 

In [None]:
# Inputs for the elbow search: an empty result list, candidate K values 1..10,
# and a fresh copy of the feature snapshot.
inertia = []
k = list(range(1, 11))
df = prices_ml_df_copy.copy()

In [None]:
# Looking for the best K: fit K-Means for every candidate K and record its inertia.
# The list is rebuilt here so that re-running this cell does not append a second
# batch of values and leave `inertia` longer than `k`.
inertia = []
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(df)
    inertia.append(km.inertia_)

In [None]:
# Define a DataFrame to plot the Elbow Curve using hvPlot:
# one row per candidate K with the inertia recorded for it.
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

### The elbow curve suggests K = 6 or 7

In [None]:
def get_clusters(k, data):
    """Cluster ``data`` with K-Means and return a labelled copy.

    Parameters
    ----------
    k
        Number of clusters, passed to ``KMeans`` as ``n_clusters``.
    data
        Frame whose columns are all used as features. It is not modified.

    Returns
    -------
    A copy of ``data`` with an extra ``"class"`` column holding the cluster
    label assigned to each row.
    """
    # Work on a copy so the caller's frame never gains the "class" column.
    data = data.copy()

    # random_state=0 fixes the seed used for centroid initialisation.
    model = KMeans(n_clusters=k, random_state=0)
    model.fit(data)

    # labels_ holds the assignment of every fitted row, so no separate
    # predict() pass over the same data is needed.
    data["class"] = model.labels_

    return data


In [None]:
# Cluster the feature snapshot with K = 6 and plot two features by cluster.
df = prices_ml_df_copy.copy()
df_6 = get_clusters(6, df)
df_6.hvplot.scatter(x="4770_ratio_units", y="total_ratio_bags", by="class")




In [None]:
# 3D view of the K = 6 clustering: two ratio features against avg_price.
fig = px.scatter_3d(df_6, x="4770_ratio_units", y="total_ratio_bags", z="avg_price",
                    color="class", symbol="class", width=800)
# Place the legend at x=0, y=1.
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

In [None]:
# Cluster the feature snapshot with K = 7 and plot two features by cluster.
df = prices_ml_df_copy.copy()
df_7 = get_clusters(7, df)
df_7.hvplot.scatter(x="4770_ratio_units", y="total_ratio_bags", by="class")



In [None]:
# 3D view of the K = 7 clustering: two ratio features against avg_price.
fig = px.scatter_3d(df_7, x="4770_ratio_units", y="total_ratio_bags", z="avg_price",
                    color="class", symbol="class", width=800)
# Place the legend at x=0, y=1.
fig.update_layout(legend=dict(x=0, y=1))
fig.show()

## PCA - Principal Component Analysis

In [None]:
# Preview the frame that will be fed to PCA.
prices_ml_nd_df.head()

In [None]:
# Column labels of the frame that will be fed to PCA.
prices_ml_nd_df.columns

In [None]:
# Rescale the raw unit / bag columns by fixed divisors, then drop total_volume.
column_divisors = {
    "4046_units": 100,
    "4225_units": 1000,
    "4770_units": 10,
    "total_bags": 100,
    "s_bags": 100,
}
for column_name, divisor in column_divisors.items():
    prices_ml_nd_df[column_name] = prices_ml_nd_df[column_name] / divisor

prices_ml_nd_df = prices_ml_nd_df.drop(columns=['total_volume'])

prices_ml_nd_df.head()

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import hvplot.pandas

In [None]:
# Initialize a two-component PCA model and standardise every column of the
# frame with StandardScaler; print the first five scaled rows as a check.
pca = PCA(n_components=2)
prices_scaled = StandardScaler().fit_transform(prices_ml_nd_df)
print(prices_scaled[0:5])

In [None]:
# Fit PCA on the standardised matrix and project every row onto the two components.
prices_pca = pca.fit_transform(prices_scaled)

In [None]:
# Wrap the PCA output in a DataFrame with one named column per component.
df_prices_pca = pd.DataFrame(data=prices_pca, columns=["principal component 1", "principal component 2"])
df_prices_pca.head()

In [None]:
# Elbow search on the PCA projection: candidate K values 1..10.
k = list(range(1, 11))
inertia = []

# Fit one K-Means model per candidate and keep its inertia.
for i in k:
    km = KMeans(n_clusters=i, random_state=0).fit(df_prices_pca)
    inertia.append(km.inertia_)

# Plot inertia against K.
elbow_data = dict(k=k, inertia=inertia)
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", xticks=k, title="Elbow Curve")

In [None]:
# Cluster on the two principal-component columns only. Selecting them
# explicitly keeps a "class" column left over from a previous run of a
# clustering cell out of the feature set, so this cell can be re-run safely.
pca_features = df_prices_pca[["principal component 1", "principal component 2"]]

# Initialize the K-means model
model = KMeans(n_clusters=3, random_state=0)

# Fit the model
model.fit(pca_features)

# Predict clusters
predictions = model.predict(pca_features)

# Add the predicted class columns
df_prices_pca["class"] = model.labels_
df_prices_pca.head()

In [None]:
# Plot the two principal components, grouped by the current "class" labels.
df_prices_pca.hvplot.scatter(
    x="principal component 1",
    y="principal component 2",
    hover_cols=["class"],
    by="class",
)

In [None]:
# Cluster on the two principal-component columns only. df_prices_pca already
# carries a "class" column from the K = 3 clustering; fitting on the whole
# frame would treat those old labels as a third feature.
pca_features = df_prices_pca[["principal component 1", "principal component 2"]]

# Initialize the K-means model
model = KMeans(n_clusters=6, random_state=0)

# Fit the model
model.fit(pca_features)

# Predict clusters
predictions = model.predict(pca_features)

# Add the predicted class columns (overwrites the previous labels)
df_prices_pca["class"] = model.labels_
df_prices_pca.head()

In [None]:
# Plot the two principal components, grouped by the current "class" labels.
df_prices_pca.hvplot.scatter(
    x="principal component 1",
    y="principal component 2",
    hover_cols=["class"],
    by="class",
)

## Dendrogram

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import hvplot.pandas

import plotly.figure_factory as ff

# Use only the principal components as features. df_prices_pca already carries
# a "class" column from the earlier clustering cells; passing the whole frame
# would let those labels influence both the dendrogram and the clustering.
pca_features = df_prices_pca[["principal component 1", "principal component 2"]]

# Create the dendrogram (create_dendrogram is documented as taking an array)
fig = ff.create_dendrogram(pca_features.to_numpy(), color_threshold=0)
fig.update_layout(width=800, height=500)
fig.show()

# Agglomerative clustering into three groups on the same features
agg = AgglomerativeClustering(n_clusters=3)
model = agg.fit(pca_features)

# Add the predicted class columns (overwrites the previous labels)
df_prices_pca["class"] = model.labels_
df_prices_pca.head()

df_prices_pca.hvplot.scatter(
        x="principal component 1",
        y="principal component 2",
        hover_cols=["class"],
        by="class",
    )


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import AgglomerativeClustering
import hvplot.pandas

import plotly.figure_factory as ff

# Use only the principal components as features. df_prices_pca already carries
# a "class" column from the earlier clustering cells; passing the whole frame
# would let those labels influence both the dendrogram and the clustering.
pca_features = df_prices_pca[["principal component 1", "principal component 2"]]

# Create the dendrogram (create_dendrogram is documented as taking an array)
fig = ff.create_dendrogram(pca_features.to_numpy(), color_threshold=0)
fig.update_layout(width=800, height=500)
fig.show()

# Agglomerative clustering into three groups on the same features
agg = AgglomerativeClustering(n_clusters=3)
model = agg.fit(pca_features)

# Add the predicted class columns (overwrites the previous labels)
df_prices_pca["class"] = model.labels_
df_prices_pca.head()

df_prices_pca.hvplot.scatter(
        x="principal component 1",
        y="principal component 2",
        hover_cols=["class"],
        by="class",
    )