# 1. Load and clean the data

In [None]:
import matplotlib.pyplot as plt
import plotly.express as px
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import euclidean_distances
from sklearn_extra.cluster import KMedoids
import seaborn as sns

# Scale the data using min-max scaler
from sklearn.preprocessing import MinMaxScaler
#import sillhouette score
from sklearn.metrics import silhouette_score

In [None]:
# Load the dataset and preview its first rows to check it was read correctly.
# Any curation the data needs is applied in the cells that follow.
data = pd.read_csv(filepath_or_buffer='Pokemon.csv')
data.head(5)


In [None]:
# Some 'Type 2' values are missing; label them with the string 'None'.
# The result is assigned back to the column instead of calling fillna(inplace=True)
# on data['Type 2']: the in-place form is chained assignment, which newer pandas
# warns about and which no longer updates `data` once Copy-on-Write is enabled.
data['Type 2'] = data['Type 2'].fillna('None')

# 2. Exploratory analysis

In [None]:
# Plot the correlation matrix of the base stats and draw a scatter matrix with Plotly.
stat_columns = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']

# Correlation matrix: the task asks for it alongside the scatter matrix, so it is
# drawn here as an annotated heatmap with the color scale fixed to [-1, 1].
corr_fig, corr_ax = plt.subplots(figsize=(7, 6))
sns.heatmap(data[stat_columns].corr(), annot=True, fmt='.2f', cmap='coolwarm', vmin=-1, vmax=1, ax=corr_ax)
corr_ax.set_title('Correlation matrix of base stats')
plt.show()

# Pairwise scatter plots of the same stats, colored by the 'Legendary' column.
fig = px.scatter_matrix(data, dimensions=stat_columns, color='Legendary')
fig.show()


In [None]:
# Disabled seaborn alternative to the Plotly scatter matrix, kept for reference.
# It is written as comments rather than a triple-quoted string, because a bare
# string is evaluated by the notebook and echoed as the cell's output.
# sns.pairplot(data, hue='Legendary')
# plt.show()

In [None]:
# Box plot comparing the distribution of 'Total' across the values of 'Legendary'.
fig = px.box(data_frame=data, x='Legendary', y='Total')
fig.show()


In [None]:
# Scatter of 'Total' against 'Generation' for every row, colored by 'Legendary',
# with one line per group showing the mean 'Total' in each generation.
fig = px.scatter(data, x='Generation', y='Total', color='Legendary')

data_common = data[data['Legendary'] == False]
data_legendary = data[data['Legendary'] == True]

# The per-generation means come from a groupby, so the generations are taken from
# the data (sorted by the groupby) instead of a hard-coded 1..6 range.
for subset, trace_name in ((data_common, 'Average Total'), (data_legendary, 'Average Total Legendary')):
    mean_total = subset.groupby('Generation')['Total'].mean()
    fig.add_scatter(x=mean_total.index, y=mean_total.values, mode='lines', name=trace_name)

fig.show()



# K-Means

In [None]:
#K-means clustering
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
# Count the distinct (non-missing) values in the 'Type 1' column.
n_types = data['Type 1'].nunique(dropna=True)
n_types

In [None]:
# Build the clustering features from a copy of the dataset, leaving `data` untouched.
# (To cluster only non-legendary rows, filter data_copy on Legendary == False here.)
data_copy = data.copy()

# Identifier, name, type, generation, legendary-flag and total columns are excluded.
non_feature_columns = ['#', 'Name', 'Generation', 'Legendary', 'Total', 'Type 1', 'Type 2']
data_new = data_copy.drop(columns=non_feature_columns)

# One-hot encode whatever object-typed columns are still present.
categorial_features = data_new.select_dtypes(include=['object']).columns
encoded_features = pd.get_dummies(data_new, columns=categorial_features)

# Min-max scale every feature; the result is a NumPy array.
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(encoded_features)

In [None]:

# Sweep the number of clusters and record, for each value, the mean Euclidean
# distance of the samples to their assigned centroid (elbow curve) and the
# silhouette score.
# NOTE(review): n_init is not set, so the number of initialisations follows the
# installed scikit-learn default; pin it if results must match across versions.
K = 80  # exclusive upper bound: the sweep covers 2..K-1 clusters
distances_list = []
sillhouette_list = []

n_samples = data.shape[0]
for n_clusters in range(2, K):
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    data_new['Cluster'] = kmeans.fit_predict(data_scaled)

    # Accumulate, cluster by cluster, the distances of the members to their centroid.
    total_distance = 0
    for label, centroid in enumerate(kmeans.cluster_centers_):
        members = data_scaled[data_new['Cluster'] == label]
        total_distance += np.linalg.norm(members - centroid, axis=1).sum()

    sillhouette_list.append(silhouette_score(data_scaled, data_new['Cluster']))
    distances_list.append(total_distance / n_samples)


In [None]:
# Elbow curve: mean distance to the assigned centroid for each number of clusters.
elbow_fig, elbow_ax = plt.subplots()
elbow_ax.plot(np.arange(2, K), distances_list)
elbow_ax.set_xlabel('Number of clusters')
elbow_ax.set_ylabel('Intracluster distance')
elbow_ax.set_xticks(np.arange(1, K, 5))
elbow_ax.grid()
elbow_ax.set_title('Elbow method')
plt.show()


In [None]:
# Plot the silhouette score obtained for each number of clusters.
plt.plot(np.arange(2, K), sillhouette_list)
plt.xlabel('Number of clusters')
# The curve holds silhouette scores, so the axis is labelled accordingly
# (it was previously labelled 'Intracluster distance').
plt.ylabel('Silhouette score')
plt.xticks(np.arange(1, K, 5))
plt.grid()
plt.title('Silhouette score')
plt.show()

In [None]:
# Rebuild the feature table from scratch so it carries no 'Cluster' column from
# earlier cells. (To cluster only non-legendary rows, filter data_copy on
# Legendary == False here.)
data_copy = data.copy()

# Identifier, name, type, generation, legendary-flag and total columns are excluded.
non_feature_columns = ['#', 'Name', 'Generation', 'Legendary', 'Total', 'Type 1', 'Type 2']
data_new = data_copy.drop(columns=non_feature_columns)

# One-hot encode whatever object-typed columns are still present.
categorial_features = data_new.select_dtypes(include=['object']).columns
encoded_features = pd.get_dummies(data_new, columns=categorial_features)

# Min-max scale every feature; the result is a NumPy array.
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(encoded_features)

In [None]:
# Fit the final K-means model with 10 clusters and store each row's label
# in the 'Cluster' column of data_new.
k = 10
kmeans = KMeans(n_clusters=k, random_state=42)
data_new['Cluster'] = kmeans.fit(data_scaled).labels_

In [None]:
import plotly.graph_objects as go

In [None]:
# Overlay the mean feature profile of every cluster on a single radar chart.
fig = go.Figure()
clusters = data_new['Cluster'].unique()

feature_frame = data_new.drop('Cluster', axis=1)
for cluster in clusters:
    # Mean of each feature over the rows assigned to this cluster.
    cluster_means = feature_frame[data_new['Cluster'] == cluster].mean()
    fig.add_trace(
        go.Scatterpolar(
            r=cluster_means.values,
            theta=cluster_means.index,
            fill='toself',
            name=f'Cluster {cluster}',
        )
    )

# Fixed radial range so every trace is drawn on the same 0-220 scale; legend hidden.
fig.update_layout(
    polar={'radialaxis': {'visible': True, 'range': [0, 220]}},
    showlegend=False,
)

fig.show()

In [None]:
# Draw one radar chart per cluster showing its mean feature profile; the title
# also reports how many rows fall in the cluster.
clusters = data_new['Cluster'].unique()

for cluster in clusters:
    in_cluster = data_new['Cluster'] == cluster
    cluster_data = data_new.loc[in_cluster].drop(columns='Cluster')

    # Two-column frame (axis name, mean value) passed to line_polar.
    feature_means = cluster_data.mean()
    radar_data = pd.DataFrame({'theta': feature_means.index, 'r': feature_means.values})

    fig = px.line_polar(
        radar_data,
        r='r',
        theta='theta',
        line_close=True,
        range_r=[0, 220],
        title=f'Cluster {cluster} Radar Graph, {cluster_data.shape[0]} pokemons',
    )
    fig.show()


In [None]:
# Draw a scatter matrix with Plotly, using the cluster label as the color.
# The task statement asks for Plotly's scatter_matrix, so it is used here
# rather than seaborn's pairplot.
cluster_feature_columns = [column for column in data_new.columns if column != 'Cluster']

# Cast the labels to strings so Plotly assigns one discrete color per cluster
# instead of mapping the integer labels onto a continuous color scale.
cluster_plot_data = data_new.assign(Cluster=data_new['Cluster'].astype(str))

fig = px.scatter_matrix(cluster_plot_data, dimensions=cluster_feature_columns, color='Cluster')
fig.show()

# K-Medoids

In [None]:
# Rebuild the feature table from scratch so it carries no 'Cluster' column from
# the K-means cells. (To cluster only non-legendary rows, filter data_copy on
# Legendary == False here.)
data_copy = data.copy()

# Identifier, name, type, generation, legendary-flag and total columns are excluded.
non_feature_columns = ['#', 'Name', 'Generation', 'Legendary', 'Total', 'Type 1', 'Type 2']
data_new = data_copy.drop(columns=non_feature_columns)

# One-hot encode whatever object-typed columns are still present.
categorial_features = data_new.select_dtypes(include=['object']).columns
encoded_features = pd.get_dummies(data_new, columns=categorial_features)

# Min-max scale every feature; the result is a NumPy array.
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(encoded_features)

In [None]:
# Sweep the number of clusters for K-medoids and record, for each value, the mean
# Euclidean distance of the samples to their assigned medoid (elbow curve) and the
# silhouette score.
K = 20  # exclusive upper bound: the sweep covers 2..K-1 clusters

distances_list = []
sillhouette_list = []

for j in range(2, K):
    kmediod = KMedoids(n_clusters=j, random_state=42)
    # Stored on data_new so later cells can read the labels of the last fit.
    data_new['Cluster'] = kmediod.fit_predict(data_scaled)

    total_distance = 0
    for i in range(j):
        cluster_points = data_scaled[data_new['Cluster'] == i]
        cluster_center = kmediod.cluster_centers_[i]
        # Plain sum of member-to-medoid distances. The per-cluster sum used to be
        # squared (`np.sum(...)**2`), which made the value depend on how the points
        # were split across clusters and no longer yielded a mean distance when
        # divided by the sample count below.
        total_distance += np.sum(np.linalg.norm(cluster_points - cluster_center, axis=1))

    # Silhouette score of this clustering.
    sillhouette_list.append(silhouette_score(data_scaled, data_new['Cluster']))

    mean_distance = total_distance / data_new.shape[0]
    distances_list.append(mean_distance)

In [None]:
# Elbow curve: the recorded intracluster distance for each number of clusters.
elbow_fig, elbow_ax = plt.subplots()
elbow_ax.plot(np.arange(2, K), distances_list)
elbow_ax.set_xlabel('Number of clusters')
elbow_ax.set_ylabel('Intracluster distance')
elbow_ax.set_title('Elbow method')
plt.show()


In [None]:
# Plot the silhouette score obtained for each number of clusters.
plt.plot(np.arange(2, K), sillhouette_list)
plt.xlabel('Number of clusters')
# The curve holds silhouette scores, so the axis is labelled accordingly
# (it was previously labelled 'Intracluster distance').
plt.ylabel('Silhouette score')
plt.xticks(np.arange(1, K, 5))
plt.grid()
plt.title('Silhouette score')
plt.show()

In [None]:
# Overlay the mean feature profile of every cluster on a single radar chart.
# NOTE(review): data_new['Cluster'] still holds the labels written by the last
# iteration of the sweep (K - 1 clusters); no model is refitted with a chosen
# number of clusters before this cell. Confirm this is the clustering to plot.
fig = go.Figure()
clusters = data_new['Cluster'].unique()

feature_frame = data_new.drop('Cluster', axis=1)
for cluster in clusters:
    # Mean of each feature over the rows assigned to this cluster.
    cluster_means = feature_frame[data_new['Cluster'] == cluster].mean()
    fig.add_trace(
        go.Scatterpolar(
            r=cluster_means.values,
            theta=cluster_means.index,
            fill='toself',
            name=f'Cluster {cluster}',
        )
    )

# Fixed radial range so every trace is drawn on the same 0-220 scale; legend hidden.
fig.update_layout(
    polar={'radialaxis': {'visible': True, 'range': [0, 220]}},
    showlegend=False,
)

fig.show()

# K-MEDIAN

In [None]:
# Rebuild the feature table from scratch so it carries no 'Cluster' column from
# the previous section. (To cluster only non-legendary rows, filter data_copy on
# Legendary == False here.)
data_copy = data.copy()

# Identifier, name, type, generation, legendary-flag and total columns are excluded.
non_feature_columns = ['#', 'Name', 'Generation', 'Legendary', 'Total', 'Type 1', 'Type 2']
data_new = data_copy.drop(columns=non_feature_columns)

# One-hot encode whatever object-typed columns are still present.
categorial_features = data_new.select_dtypes(include=['object']).columns
encoded_features = pd.get_dummies(data_new, columns=categorial_features)

# Min-max scale every feature; the result is a NumPy array.
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(encoded_features)

In [None]:
# Sweep the number of clusters and record, for each value, the mean Euclidean
# distance of the samples to their assigned cluster center (elbow curve) and the
# silhouette score.
# NOTE(review): this section is headed K-MEDIAN, but the model fitted here is
# KMedoids with the same arguments as in the K-MEDIOD section. Confirm whether
# a k-medians model was intended.
K = 20  # exclusive upper bound: the sweep covers 2..K-1 clusters

distances_list = []
sillhouette_list = []

n_samples = data_new.shape[0]
for n_clusters in range(2, K):
    kmediod = KMedoids(n_clusters=n_clusters, random_state=42)
    data_new['Cluster'] = kmediod.fit_predict(data_scaled)

    # Accumulate, cluster by cluster, the distances of the members to their center.
    total_distance = 0
    for label, center in enumerate(kmediod.cluster_centers_):
        members = data_scaled[data_new['Cluster'] == label]
        total_distance += np.linalg.norm(members - center, axis=1).sum()

    sillhouette_list.append(silhouette_score(data_scaled, data_new['Cluster']))
    distances_list.append(total_distance / n_samples)

In [None]:
# Elbow curve: the recorded intracluster distance for each number of clusters.
# The x values are derived from K, as in the other plots, rather than a hard-coded
# 20, so they stay aligned with distances_list if the sweep's upper bound changes.
plt.plot(np.arange(2, K), distances_list)
plt.xlabel('Number of clusters')
plt.ylabel('Intracluster distance')
plt.title('Elbow method')
plt.show()

In [None]:
# Plot the silhouette score obtained for each number of clusters.
plt.plot(np.arange(2, K), sillhouette_list)
plt.xlabel('Number of clusters')
# The curve holds silhouette scores, so the axis is labelled accordingly
# (it was previously labelled 'Intracluster distance').
plt.ylabel('Silhouette score')
plt.xticks(np.arange(1, K, 5))
plt.grid()
plt.title('Silhouette score')
plt.show()

In [None]:
# Overlay the mean feature profile of every cluster on a single radar chart.
# NOTE(review): data_new['Cluster'] still holds the labels written by the last
# iteration of the sweep (K - 1 clusters); no model is refitted with a chosen
# number of clusters before this cell. Confirm this is the clustering to plot.
fig = go.Figure()
clusters = data_new['Cluster'].unique()

feature_frame = data_new.drop('Cluster', axis=1)
for cluster in clusters:
    # Mean of each feature over the rows assigned to this cluster.
    cluster_means = feature_frame[data_new['Cluster'] == cluster].mean()
    fig.add_trace(
        go.Scatterpolar(
            r=cluster_means.values,
            theta=cluster_means.index,
            fill='toself',
            name=f'Cluster {cluster}',
        )
    )

# Fixed radial range so every trace is drawn on the same 0-220 scale; legend hidden.
fig.update_layout(
    polar={'radialaxis': {'visible': True, 'range': [0, 220]}},
    showlegend=False,
)

fig.show()