In [7]:
import functools
import os
import warnings

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
# Ignore warnings for the rest of the session; no category is passed, so the
# filter applies to every warning class.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# imported above -- confirm that a blanket filter is really wanted.
warnings.simplefilter(action='ignore')

In [8]:
# One-hot encoder configured with handle_unknown='ignore'.
# NOTE(review): `enc` is not referenced by any later visible cell -- confirm
# it is still needed.
enc = OneHotEncoder(handle_unknown='ignore')


In [9]:
# Location of the Superstore CSV export. Set the SUPERSTORE_CSV environment
# variable to point at the file on another machine; when it is not set, the
# original local path is used unchanged.
filename = os.environ.get("SUPERSTORE_CSV", r"C:\pythonjobs\excelfiles\data_superstore.csv")


In [10]:


# Load the raw CSV: fields are separated by ";" and the file is decoded as UTF-8.
dados = pd.read_csv(filename, sep=";", encoding="utf-8")



In [11]:

def _normalize_column(col):
    """Convert one column of the raw frame to its working dtype, chosen by column name.

    - ``OrderDate`` / ``ShipDate``: parsed as datetimes; unparseable values become NaT.
    - ``Sales`` / ``Profit`` / ``Discount``: string values have "," replaced by "."
      and are converted to float; non-string values pass through unchanged.
    - ``Quantity``: cast to int.
    - Any other column is returned untouched.
    """
    if col.name in ('OrderDate', 'ShipDate'):
        return pd.to_datetime(col, errors='coerce')
    if col.name in ('Sales', 'Profit', 'Discount'):
        return col.apply(lambda k: float(k.replace(",", ".")) if isinstance(k, str) else k)
    if col.name == 'Quantity':
        return col.astype(int)
    return col


# Column-wise pass over the frame (DataFrame.apply hands each column to the helper).
dados = dados.apply(_normalize_column)



def generate_customer_ids(df):
    """Attach an integer ``CustomerID`` column derived from ``CustomerName``.

    Every distinct customer name receives the position at which it first
    appears in the column (0, 1, 2, ...).

    Args:
        df: DataFrame with a ``CustomerName`` column.

    Returns:
        The same ``df`` object, with the ``CustomerID`` column written onto it.
    """
    names_in_order = df['CustomerName'].unique()
    id_by_name = dict(zip(names_in_order, range(len(names_in_order))))
    df['CustomerID'] = df['CustomerName'].map(id_by_name)
    return df


def check_and_convert_types(*column_names):
    """Build a decorator that post-processes selected columns of a returned DataFrame.

    For each name in ``column_names`` that exists in the frame returned by the
    decorated function:

    - an ``object``-dtype column (except ``CustomerName``) is replaced by its
      one-hot encoding, with the first category dropped;
    - a numeric column containing missing values has them filled with the
      column mean.

    Columns that are neither (for example ``CustomerName`` or datetimes) are
    left untouched. Return values that are not DataFrames pass through as-is.

    Args:
        *column_names: Names of the columns to inspect.

    Returns:
        A decorator that wraps a function and applies the conversions above to
        its result.
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            result = func(*args, **kwargs)
            if not isinstance(result, pd.DataFrame):
                return result

            for col_name in column_names:
                if col_name not in result.columns:
                    continue
                column = result[col_name]

                if column.dtype == 'object' and col_name != 'CustomerName':
                    encoder = OneHotEncoder(drop='first')
                    encoded = encoder.fit_transform(result[[col_name]])
                    # Densify without naming the `sparse` / `sparse_output`
                    # keyword, whose spelling depends on the scikit-learn version.
                    if hasattr(encoded, 'toarray'):
                        encoded = encoded.toarray()
                    encoded_df = pd.DataFrame(
                        encoded,
                        columns=encoder.get_feature_names_out([col_name]),
                        # Reuse the frame's index so concat lines rows up
                        # instead of aligning on a fresh 0..n-1 range.
                        index=result.index,
                    )
                    result = pd.concat([result.drop(columns=[col_name]), encoded_df], axis=1)
                    # The source column no longer exists; nothing left to fill.
                    continue

                # A mean only makes sense for numeric data.
                if pd.api.types.is_numeric_dtype(column) and column.isna().any():
                    # Assign back rather than fillna(inplace=True) on a
                    # selection, which may operate on a temporary copy.
                    result[col_name] = column.fillna(column.mean())
            return result
        return wrapper
    return decorator


def apply_scaling_and_clustering(func):
    """Decorator: standardize ``Sales``/``Quantity`` and add a K-Means ``Cluster`` column.

    The wrapped function must return a DataFrame with ``Sales`` and
    ``Quantity`` columns. On that frame the wrapper:

    1. replaces both columns with their ``StandardScaler`` output, so the raw
       values are overwritten;
    2. fits a 3-cluster K-Means on the scaled pair and stores each row's label
       in a new ``Cluster`` column.

    Args:
        func: Function returning the DataFrame to process.

    Returns:
        A wrapper with the same call signature that returns the processed
        DataFrame (the same object ``func`` returned, modified in place).
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        df = func(*args, **kwargs)

        features = ['Sales', 'Quantity']
        scaled_data = StandardScaler().fit_transform(df[features])
        df[features] = scaled_data

        # A fixed seed makes the cluster assignment repeatable across kernel
        # restarts; n_init is spelled out because its default differs between
        # scikit-learn releases.
        kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
        df['Cluster'] = kmeans.fit_predict(scaled_data)

        return df

    return wrapper



# generate_customer_ids is defined once, earlier in this cell, and is
# deliberately not redefined here.


def add_customer_ids(func):
    """Decorator: pass the DataFrame returned by ``func`` through ``generate_customer_ids``.

    Args:
        func: Function returning a DataFrame with a ``CustomerName`` column.

    Returns:
        A wrapper with the same call signature whose result carries the
        additional ``CustomerID`` column.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        return generate_customer_ids(func(*args, **kwargs))
    return wrapper


def convert_and_add_customer_ids(df):
    """Return a new DataFrame: ``df`` plus one-hot encoded indicator columns.

    Despite its name, this function does not create a ``CustomerID`` column.
    It one-hot encodes (dropping the first category of each) whichever of the
    columns listed in ``columns_to_encode`` are present in ``df`` and appends
    the indicator columns to the right of the original ones.

    Args:
        df: Input DataFrame; it is not modified.

    Returns:
        A new DataFrame. When none of the listed columns is present, this is
        simply a copy of ``df``.
    """
    columns_to_encode = ['City', 'Category', 'Quantity', 'Sales', 'CustomerCount', 'CustomerName']
    # NOTE(review): 'Quantity' and 'Sales' are treated as numeric elsewhere in
    # this notebook; one-hot encoding them yields one column per distinct
    # value -- confirm this is intended.

    # Only encode columns that exist, instead of failing with a KeyError.
    present = [name for name in columns_to_encode if name in df.columns]
    if not present:
        return df.copy()

    encoder = OneHotEncoder(drop='first')
    encoded = encoder.fit_transform(df[present])
    # Densify without naming the `sparse` / `sparse_output` keyword, whose
    # spelling depends on the scikit-learn version.
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()

    encoded_df = pd.DataFrame(
        encoded,
        columns=encoder.get_feature_names_out(present),
        # Reuse df's index so concat lines rows up on non-default indexes.
        index=df.index,
    )
    return pd.concat([df, encoded_df], axis=1)


def generate_aggregated_data(df):
    """Return ``df`` as-is; this function performs no aggregation."""
    return df



# generate_customer_ids and add_customer_ids are both defined earlier in this
# cell and are deliberately not redefined here.


@add_customer_ids
@apply_scaling_and_clustering
def generate_data_with_customer_ids(df):
    """Run the pipeline on ``df`` and return the resulting DataFrame.

    Decorators apply bottom-up, so the result of ``generate_aggregated_data``
    first goes through ``apply_scaling_and_clustering`` and then through
    ``add_customer_ids``.

    Args:
        df: Input DataFrame, forwarded to ``generate_aggregated_data``.

    Returns:
        The DataFrame produced by the decorated pipeline.
    """
    return generate_aggregated_data(df)


# Run the pipeline on a copy, so `dados` itself is not modified by it.
result_data = generate_data_with_customer_ids(dados.copy())

# Last expression of the cell: show only the identifier, the two feature
# columns and the cluster label.
result_data.loc[:, ["CustomerID","Sales","Quantity","Cluster"]]



Unnamed: 0,CustomerID,Sales,Quantity,Cluster
0,0,0.051510,-0.804303,0
1,0,0.805633,-0.354865,0
2,1,-0.345368,-0.804303,0
3,2,1.167688,0.544012,2
4,2,-0.332935,-0.804303,0
...,...,...,...,...
9989,483,-0.328314,-0.354865,0
9990,72,-0.221269,-0.804303,0
9991,72,0.046080,-0.804303,0
9992,72,-0.321331,0.094574,0


In [12]:

# Cluster labels are categories, not magnitudes: cast them to strings so that
# Plotly Express assigns one discrete color per cluster (with a legend entry
# each) rather than mapping the integer labels onto a continuous color scale.
plot_data = result_data.assign(Cluster=result_data['Cluster'].astype(str))

# One color per cluster.
colors = ['#FF69B4', '#00FF00', '#00FFFF']

fig = px.scatter(
    plot_data, x='Sales', y='Quantity', color='Cluster', title='Kmeans',
    labels={'Sales': 'Vendas', 'Quantity': 'Quantidade'},
    color_discrete_sequence=colors,
    # Fix the label order so the color-to-cluster pairing does not depend on
    # which cluster happens to appear first in the data.
    category_orders={'Cluster': sorted(plot_data['Cluster'].unique())},
)

# Dark theme: black canvas, lime text, cyan title. The color-bar styling is
# dropped because a discrete-color figure has no color bar.
fig.update_layout(
    paper_bgcolor='black',
    plot_bgcolor='black',
    font=dict(color='lime'),
    title=dict(font=dict(color='cyan')),
    legend=dict(font=dict(color='lime')),
)

fig.update_traces(marker=dict(size=12, opacity=0.8), selector=dict(mode='markers'))

fig.show()

