In [3]:
import pandas as pd
from scipy.io import arff

# Load the CSV dataset.
url = '/content/games.csv'
data = pd.read_csv(url)


def _arff_quote(text):
    """Return ``text`` as a single-quoted ARFF token with special characters backslash-escaped."""
    escaped = (
        str(text)
        .replace('\\', '\\\\')  # escape backslashes first so later escapes are not doubled
        .replace("'", "\\'")
        .replace('\n', '\\n')
        .replace('\r', '\\r')
        .replace('\t', '\\t')
    )
    return f"'{escaped}'"


def _arff_attribute_name(name):
    """Return an attribute name, quoted only when it contains characters unsafe for a bare ARFF token."""
    text = str(name)
    is_bare_safe = bool(text) and all(ch.isalnum() or ch in '_.-' for ch in text)
    return text if is_bare_safe else _arff_quote(text)


def _arff_cell(value, is_numeric):
    """Format one @DATA cell: '?' when missing, bare when numeric, quoted otherwise."""
    if pd.isna(value):
        return '?'  # ARFF's missing-value marker; str(nan) would write the literal text 'nan'
    return str(value) if is_numeric else _arff_quote(value)


# A column is declared NUMERIC only when pandas holds it as a non-boolean numeric dtype.
# Everything else (text, booleans) is declared STRING and written as a quoted token, so
# 'True'/'False' never end up under a NUMERIC attribute.
numeric_flags = [
    pd.api.types.is_numeric_dtype(data[column]) and not pd.api.types.is_bool_dtype(data[column])
    for column in data.columns
]

# Save as an ARFF file (explicit encoding so non-ASCII text does not depend on the platform default).
with open('dataset.arff', 'w', encoding='utf-8') as f:
    f.write('@RELATION dataset\n\n')

    # Write the attribute declarations.
    for column, is_numeric in zip(data.columns, numeric_flags):
        attribute_type = 'NUMERIC' if is_numeric else 'STRING'
        f.write(f'@ATTRIBUTE {_arff_attribute_name(column)} {attribute_type}\n')

    f.write('\n@DATA\n')

    # Write the data rows. itertuples keeps each column's own type, whereas iterrows
    # upcasts every row to a common dtype (e.g. ints become floats in numeric frames).
    for row in data.itertuples(index=False, name=None):
        cells = (_arff_cell(value, is_numeric) for value, is_numeric in zip(row, numeric_flags))
        f.write(','.join(cells) + '\n')


In [28]:
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler, OneHotEncoder

# Read the games CSV into the DataFrame that the preprocessing steps below work on.
url = '/content/games.csv'
df = pd.read_csv(filepath_or_buffer=url)

def convert_range_to_average(value):
    """Collapse a range string such as '0 - 20000' to the midpoint of its bounds.

    Args:
        value: Either a string holding two integer literals separated by
            ' - ', or any value that ``float()`` accepts.

    Returns:
        float: The mean of the two integer bounds when ``value`` is a range
        string; otherwise ``float(value)``.

    Raises:
        ValueError: If a range string does not split into exactly two integer
            literals, or if a non-range string cannot be parsed by ``float()``.
    """
    looks_like_range = isinstance(value, str) and ' - ' in value
    if not looks_like_range:
        # Already a plain number (or numeric string): just coerce to float.
        return float(value)

    lower_text, upper_text = value.split(' - ')
    lower_bound = int(lower_text)
    upper_bound = int(upper_text)
    return (lower_bound + upper_bound) / 2

# Replace each 'Estimated owners' range string (e.g. '0 - 20000') with its numeric midpoint.
df['Estimated owners'] = df['Estimated owners'].apply(convert_range_to_average)

# Source features and the derived columns produced for each of them, in matching order.
numeric_features = ['Price', 'Estimated owners', 'User score']
discretized_columns = ['Price_discretized', 'Estimated_owners_discretized', 'User_score_discretized']
normalized_columns = ['Price_normalized', 'Estimated_owners_normalized', 'User_score_normalized']

# Discretizer with 5 equal-width bins, emitting the ordinal bin code.
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')

# Discretize each feature on its own. The discretizer is refit per column, so after the
# loop it holds the bin edges of the last feature only ('User score').
for feature, discretized_column in zip(numeric_features, discretized_columns):
    df[discretized_column] = discretizer.fit_transform(df[[feature]])

# Min-max normalize the same features.
scaler = MinMaxScaler()
df[normalized_columns] = scaler.fit_transform(df[numeric_features])

# One-hot encode the price bin codes.
onehot_encoder = OneHotEncoder(sparse_output=False)
onehot_encoded = onehot_encoder.fit_transform(df[['Price_discretized']])

# Label each one-hot column with the bin code it actually represents. The encoder only
# emits columns for codes present in the data, so numbering columns by position would
# mislabel them whenever an intermediate bin is empty (e.g. codes 0, 1, 4 -> "0, 1, 2").
price_bin_columns = [f'Price_bin_{int(bin_code)}' for bin_code in onehot_encoder.categories_[0]]

# Reuse df's index so the column-wise concat aligns row by row even when df does not
# carry a default RangeIndex.
price_onehot_encoded_df = pd.DataFrame(onehot_encoded, columns=price_bin_columns, index=df.index)
df = pd.concat([df, price_onehot_encoded_df], axis=1)

# Show a slice of the DataFrame with every preprocessing result.
df[discretized_columns + normalized_columns + list(price_onehot_encoded_df.columns)].iloc[200:300]


Unnamed: 0,Price_discretized,Estimated_owners_discretized,User_score_discretized,Price_normalized,Estimated_owners_normalized,User_score_normalized,Price_bin_0,Price_bin_1,Price_bin_2
200,0.0,0.0,0.0,0.006997,0.000067,0.00,1.0,0.0,0.0
201,0.0,0.0,0.0,0.012603,0.000067,0.00,1.0,0.0,0.0
202,0.0,0.0,0.0,0.005996,0.000067,0.00,1.0,0.0,0.0
203,0.0,0.0,0.0,0.007998,0.000067,0.00,1.0,0.0,0.0
204,0.0,0.0,3.0,0.000991,0.000067,0.63,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
295,0.0,0.0,0.0,0.002993,0.002333,0.00,1.0,0.0,0.0
296,0.0,0.0,0.0,0.001992,0.000067,0.00,1.0,0.0,0.0
297,0.0,0.0,0.0,0.004995,0.000067,0.00,1.0,0.0,0.0
298,0.0,0.0,0.0,0.000000,0.000000,0.00,1.0,0.0,0.0
