### INSTALAÇÕES

In [1]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


> <hr>

### IMPORTAÇÕES

In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt  
import sklearn
from ucimlrepo import fetch_ucirepo 
from fcmeans import FCM
from pandas.api.types import is_numeric_dtype

> <hr>

### DATASET

In [2]:
# fetch dataset 
iris = fetch_ucirepo(id=53) 
  
# data (as pandas dataframes) 
X = iris.data.features 
y = iris.data.targets 
  
# metadata 
print(iris.metadata) 
# variable information 
print(iris.variables) 
print()
print(iris.variables)
print()
print(iris.feature_type)

{'uci_id': 53, 'name': 'Iris', 'repository_url': 'https://archive.ics.uci.edu/dataset/53/iris', 'data_url': 'https://archive.ics.uci.edu/static/public/53/data.csv', 'abstract': 'A small classic dataset from Fisher, 1936. One of the earliest known datasets used for evaluating classification methods.\n', 'area': 'Biology', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 150, 'num_features': 4, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1936, 'last_updated': 'Tue Sep 12 2023', 'dataset_doi': '10.24432/C56C76', 'creators': ['R. A. Fisher'], 'intro_paper': {'title': 'The Iris data set: In search of the source of virginica', 'authors': 'A. Unwin, K. Kleinman', 'published_in': 'Significance, 2021', 'year': 2021, 'url': 'https://www.semanticscholar.org/paper/4599862ea877863669a6a8e63a3c707a787d5d7e', 'doi': '1740-9713.01589'}, 'add

In [3]:
print(iris.metadata)

# Assuming metadata contains 'feature_names', use it
feature_names = iris.metadata.get('feature_names', [f'feature_{i}' for i in range(iris.data.features.shape[1])])

# Create a DataFrame with feature names
iris_df = pd.DataFrame(data=iris.data.features, columns=feature_names)
iris_df['target'] = iris.data.targets

# Display the DataFrame
print(iris_df.head())
print()

{'uci_id': 53, 'name': 'Iris', 'repository_url': 'https://archive.ics.uci.edu/dataset/53/iris', 'data_url': 'https://archive.ics.uci.edu/static/public/53/data.csv', 'abstract': 'A small classic dataset from Fisher, 1936. One of the earliest known datasets used for evaluating classification methods.\n', 'area': 'Biology', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 150, 'num_features': 4, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1936, 'last_updated': 'Tue Sep 12 2023', 'dataset_doi': '10.24432/C56C76', 'creators': ['R. A. Fisher'], 'intro_paper': {'title': 'The Iris data set: In search of the source of virginica', 'authors': 'A. Unwin, K. Kleinman', 'published_in': 'Significance, 2021', 'year': 2021, 'url': 'https://www.semanticscholar.org/paper/4599862ea877863669a6a8e63a3c707a787d5d7e', 'doi': '1740-9713.01589'}, 'add

In [4]:
# Substitua 'caminho/do/seu/iris.data' pelo caminho real do seu arquivo Iris.data e 'iris.csv' pelo nome desejado para o arquivo CSV de saída.
input_file = 'C:\\JupyterLab\\PAPL-UFPE\\ML-Fuzzy\\Testes\\iris.data'
output_file = 'iris.csv'

# Carregue o arquivo no pandas
data = pd.read_csv(input_file, header=None, names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class'])

# Salve o dataframe como CSV
data.to_csv(output_file, index=False)

In [5]:
data.shape #linhas, colunas

(150, 5)

In [6]:
data.head # 5 primeiras linhas

<bound method NDFrame.head of      sepal_length  sepal_width  petal_length  petal_width           class
0             5.1          3.5           1.4          0.2     Iris-setosa
1             4.9          3.0           1.4          0.2     Iris-setosa
2             4.7          3.2           1.3          0.2     Iris-setosa
3             4.6          3.1           1.5          0.2     Iris-setosa
4             5.0          3.6           1.4          0.2     Iris-setosa
..            ...          ...           ...          ...             ...
145           6.7          3.0           5.2          2.3  Iris-virginica
146           6.3          2.5           5.0          1.9  Iris-virginica
147           6.5          3.0           5.2          2.0  Iris-virginica
148           6.2          3.4           5.4          2.3  Iris-virginica
149           5.9          3.0           5.1          1.8  Iris-virginica

[150 rows x 5 columns]>

In [27]:
data.tail # 5 últimas linhas

<bound method NDFrame.tail of      sepal_length  sepal_width  petal_length  petal_width           class
0             5.1          3.5           1.4          0.2     Iris-setosa
1             4.9          3.0           1.4          0.2     Iris-setosa
2             4.7          3.2           1.3          0.2     Iris-setosa
3             4.6          3.1           1.5          0.2     Iris-setosa
4             5.0          3.6           1.4          0.2     Iris-setosa
..            ...          ...           ...          ...             ...
145           6.7          3.0           5.2          2.3  Iris-virginica
146           6.3          2.5           5.0          1.9  Iris-virginica
147           6.5          3.0           5.2          2.0  Iris-virginica
148           6.2          3.4           5.4          2.3  Iris-virginica
149           5.9          3.0           5.1          1.8  Iris-virginica

[150 rows x 5 columns]>

In [28]:
(data.dtypes).value_counts()

float64    4
object     1
Name: count, dtype: int64

In [29]:
# normalização
for index in data.columns: # em suma, para as colunas do DataFrame, vamos analisar se os dados são numéricos
    # se os dados forem numéricos, eles serão normalizados
    if is_numeric_dtype(data[index][0]): # retomando uma função do pandas
        data[index] = data[index]/max(data[index])

In [30]:
data.head

<bound method NDFrame.head of      sepal_length  sepal_width  petal_length  petal_width           class
0        0.645570     0.795455      0.202899         0.08     Iris-setosa
1        0.620253     0.681818      0.202899         0.08     Iris-setosa
2        0.594937     0.727273      0.188406         0.08     Iris-setosa
3        0.582278     0.704545      0.217391         0.08     Iris-setosa
4        0.632911     0.818182      0.202899         0.08     Iris-setosa
..            ...          ...           ...          ...             ...
145      0.848101     0.681818      0.753623         0.92  Iris-virginica
146      0.797468     0.568182      0.724638         0.76  Iris-virginica
147      0.822785     0.681818      0.753623         0.80  Iris-virginica
148      0.784810     0.772727      0.782609         0.92  Iris-virginica
149      0.746835     0.681818      0.739130         0.72  Iris-virginica

[150 rows x 5 columns]>

In [31]:
print(data['class'].value_counts()) # class = species

class
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64


> <hr>

### TREINO E TESTE

'''
    from sklearn.model_selection import train_test_split

    tamanho_teste = 0.3

    indices = iris_df.index
    indices_treino, indices_teste = train_test_split(
        indices, test_size= tamanho_teste
    )

    df_treino = iris_df.loc[indices_treino]
    df_teste = iris_df.loc[indices_teste]
'''

> <hr>

### FUZZY C-MEANS

### REFERÊNCIAS

'''

@software{dias2019fuzzy,
  <br>author       = {Madson Luiz Dantas Dias},
  <br>title        = {fuzzy-c-means: An implementation of Fuzzy $C$-means clustering algorithm.},
  <br>month        = may,
  <br>year         = 2019,
  <br>publisher    = {Zenodo},
  <br>doi          = {10.5281/zenodo.3066222},
  <br>url          = {https://git.io/fuzzy-c-means}
<br>}

'''

In [2]:
#testes
import tqdm
from typing import Optional, Dict, Union, Callable
from enum import Enum
from joblib import Parallel, delayed
from numpy.typing import NDArray
from pydantic import BaseModel, Extra, Field, validate_arguments

In [4]:
#--install-completion

In [3]:
class DistanceOptions(str, Enum):
    euclidean = 'euclidean' # se eu n me engano precisa ser gaussiano né
    #minkowski = 'minkowski'
    #cosine = 'cosine'

class FCM(BaseModel):
    r"""Fuzzy C-means Model

    Attributes:
        n_clusters (int): The number of clusters to form as well as the number
        of centroids to generate by the fuzzy C-means.
        max_iter (int): Maximum number of iterations of the fuzzy C-means
        algorithm for a single run.
        m (float): Degree of fuzziness: $m \in (1, \infty)$.
        error (float): Relative tolerance with regards to Frobenius norm of
        the difference
        in the cluster centers of two consecutive iterations to declare
        convergence.
        random_state (Optional[int]): Determines random number generation for
        centroid initialization.
        Use an int to make the randomness deterministic.
        trained (bool): Variable to store whether or not the model has been
        trained.

    Returns:
        FCM: A FCM model.

    Raises:
        ReferenceError: If called without the model being trained
    """

    class Config:
        extra = Extra.allow
        arbitrary_types_allowed = True

    n_clusters: int = Field(3, ge=1)
    max_iter: int = Field(150, ge=1, le=1000)
    m: float = Field(2.0, ge=1.0)
    error: float = Field(1e-5, ge=1e-9)
    random_state: Optional[int] = None
    trained: bool = Field(False, const=True)
    n_jobs: int = Field(1, ge=1)
    verbose: Optional[bool] = False
    distance: Optional[Union[DistanceOptions, Callable]] = DistanceOptions.euclidean
    distance_params: Optional[Dict] = {}

    @validate_arguments(config=dict(arbitrary_types_allowed=True))
    def fit(self, X: NDArray) -> None:
        """Train the fuzzy-c-means model

        Args:
            X (NDArray): Training instances to cluster.
        """
        self.rng = np.random.default_rng(self.random_state)
        n_samples = X.shape[0]
        self.u = self.rng.uniform(size=(n_samples, self.n_clusters))
        self.u = self.u / np.tile(self.u.sum(axis=1)[np.newaxis].T, self.n_clusters)
        for _ in tqdm.tqdm(
            range(self.max_iter), desc="Training", disable=not self.verbose
        ):
            u_old = self.u.copy()
            self._centers = FCM._next_centers(X, self.u, self.m)
            self.u = self.soft_predict(X)
            # Stopping rule
            if np.linalg.norm(self.u - u_old) < self.error:
                break
        self.trained = True
        
    @validate_arguments(config=dict(arbitrary_types_allowed=True))
    def soft_predict(self, X: NDArray) -> NDArray:
        """Soft predict of FCM

        Args:
            X (NDArray): New data to predict.

        Returns:
            NDArray: Fuzzy partition array, returned as an array with
            n_samples rows and n_clusters columns.
        """
        temp = FCM._dist(X, self._centers, self.distance, self.distance_params) ** (2 / (self.m - 1))
        u_dist = Parallel(n_jobs=self.n_jobs)(
            delayed(lambda data, col: (data[:, col] / data.T).sum(0))(temp, col)
            for col in range(temp.shape[1])
        )
        u_dist = np.vstack(u_dist).T
        return 1 / u_dist

    @validate_arguments(config=dict(arbitrary_types_allowed=True))
    def predict(self, X: NDArray) -> NDArray:
        """Predict the closest cluster each sample in X belongs to.

        Args:
            X (NDArray): New data to predict.

        Raises:
            ReferenceError: If it called without the model being trained.

        Returns:
            NDArray: Index of the cluster each sample belongs to.
        """
        if self._is_trained():
            X = np.expand_dims(X, axis=0) if len(X.shape) == 1 else X
            return self.soft_predict(X).argmax(axis=-1)
        raise ReferenceError(
            "You need to train the model. Run `.fit()` method to this."
        )

    def _is_trained(self) -> bool:
        if self.trained:
            return True
        return False

    @staticmethod
    def _dist(A: NDArray, B: NDArray, distance: str, distance_params: str) -> NDArray:
        """Compute the distance between two matrices"""
        if isinstance(distance, Callable):
            return distance(A, B, distance_params)
        elif distance == 'minkowski':
            return FCM._minkowski(A, B, distance_params.get("p", 1.0))
        elif distance == 'cosine':
            return FCM._cosine_similarity(A, B)
        else:
            return FCM._euclidean(A, B)
    
    @staticmethod
    def _euclidean(A: NDArray, B: NDArray) -> NDArray:
        """Compute the euclidean distance between two matrices"""
        return np.sqrt(np.einsum("ijk->ij", (A[:, None, :] - B) ** 2))

    @staticmethod
    def _minkowski(A: NDArray, B: NDArray, p: float) -> NDArray:
        """Compute the minkowski distance between two matrices"""
        return (np.einsum("ijk->ij", (A[:, None, :] - B) ** p)) ** (1/p)
    
    @staticmethod
    def _cosine_similarity(A: NDArray, B: NDArray) -> NDArray:
        """Compute the cosine similarity between two matrices"""
        p1 = np.sqrt(np.sum(A**2,axis=1))[:,np.newaxis]
        p2 = np.sqrt(np.sum(B**2,axis=1))[np.newaxis,:]
        return np.dot(A,B.T) / (p1*p2)

    @staticmethod
    def _next_centers(X: NDArray, u: NDArray, m: float):
        """Update cluster centers"""
        um = u**m
        return (X.T @ um / np.sum(um, axis=0)).T

    @property
    def centers(self) -> NDArray:
        if self._is_trained():
            return self._centers
        raise ReferenceError(
            "You need to train the model. Run `.fit()` method to this."
        )

    @property
    def partition_coefficient(self) -> float:
        """Partition coefficient

        Equation 12a of
        [this paper](https://doi.org/10.1016/0098-3004(84)90020-7).
        """
        if self._is_trained():
            return np.mean(self.u**2)
        raise ReferenceError(
            "You need to train the model. Run `.fit()` method to this."
        )

    @property
    def partition_entropy_coefficient(self):
        if self._is_trained():
            return -np.mean(self.u * np.log2(self.u))
        raise ReferenceError(
            "You need to train the model. Run `.fit()` method to this."
     

In [None]:
# o coeficiente de partição usado pela biblioteca não corresponde ao que tem no artigo do Prof., as equações são diferentes
        # tipo realmente bem diferentes mesmo


<hr>

### REFERÊNCIAS

@software{dias2019fuzzy,
  author       = {Madson Luiz Dantas Dias},
  title        = {fuzzy-c-means: An implementation of Fuzzy $C$-means clustering algorithm.},
  month        = may,
  year         = 2019,
  publisher    = {Zenodo},
  doi          = {10.5281/zenodo.3066222},
  url          = {https://git.io/fuzzy-c-means}
}