In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Location of the raw dataset (served from GitHub's raw-content host).
url = "https://raw.githubusercontent.com/facstyle/DataScience_TP/refs/heads/main/dataset.csv"

# The file is read with ';' as the field separator.
df = pd.read_csv(filepath_or_buffer=url, sep=";")

# Preview the first rows to check how the columns were parsed.
df.head()

Unnamed: 0,company,sector,horizon (days),amount,date_BUY_fix,date_SELL_fix,price_BUY,price_SELL,Volatility_Buy,Volatility_sell,...,investment,ESG_ranking,PE_ratio,EPS_ratio,PS_ratio,PB_ratio,NetProfitMargin_ratio,current_ratio,roa_ratio,roe_ratio
0,BBY,RETAIL,2,100,25/5/2017,26/5/2017,5.555.180.358.886.710,5.348.391.342.163.080,0.383666,0.385748,...,BAD,12.0,12.58,3.73,0.38,3.19,3.01,1.49,8.69,26.69
1,BAC,BANK,330,15000,22/11/2016,18/10/2017,18.616.748.809.814.400,2.465.447.235.107.420,0.322809,0.23635,...,GOOD,26.3,11.39,1.26,1.71,0.54,15.7,0.92,0.67,5.54
2,AXP,BANK,7,3000,27/9/2016,4/10/2016,5.986.229.705.810.540,5.951.772.689.819.330,0.238642,0.235491,...,BAD,19.8,10.58,5.64,1.67,2.6,15.68,1.91,3.39,25.78
3,KSS,RETAIL,5,20000,11/10/2016,17/10/2016,3.821.672.439.575.190,35.985.328.674.316.400,0.428559,0.42934,...,BAD,12.9,11.09,3.27,0.36,1.25,3.17,1.6,4.41,11.35
4,JPM,BANK,360,15000,12/3/2015,7/3/2016,5.186.933.517.456.050,5.204.796.600.341.790,0.194612,0.254011,...,GOOD,27.9,9.38,5.46,1.87,0.81,19.91,0.99,0.81,8.91


In [None]:



# Columns that should be numeric. Some of them may have been read as text
# because of separator characters embedded in the values.
cols_to_clean_numeric = ['price_BUY', 'price_SELL', 'expected_return (yearly)', 'nominal_return',
                         'Volatility_Buy', 'Volatility_sell', 'ESG_ranking', 'PE_ratio', 'EPS_ratio',
                         'PS_ratio', 'PB_ratio', 'NetProfitMargin_ratio', 'current_ratio', 'roa_ratio', 'roe_ratio']


def _coerce_localized_numeric(values: pd.Series) -> pd.Series:
    """Convert a column to numeric without destroying genuine decimal points.

    Parameters
    ----------
    values : pd.Series
        Column to convert. It may already be numeric or may hold text.

    Returns
    -------
    pd.Series
        The input unchanged when it is already numeric; otherwise the parsed
        numbers, with unparseable entries turned into NaN.

    Notes
    -----
    * Columns that pandas already parsed as numbers are returned as-is.
      Unconditionally removing '.' from them turns 0.383666 into 383666 and
      puts 15.7 (-> 157) and 15.68 (-> 1568) on different scales.
    * For text columns, '.' is removed only when the column shows evidence
      that it is a grouping separator: some value has more than one dot, or
      has a dot together with a decimal comma. Otherwise '.' is kept as the
      decimal point.
    * ',' is always treated as a decimal comma.
    """
    if pd.api.types.is_numeric_dtype(values):
        # Already numeric: nothing to repair, and skipping keeps the cell idempotent.
        return values

    text = values.astype(str).str.strip()
    dot_count = text.str.count(r'\.')
    has_comma = text.str.contains(',', regex=False)
    dots_are_grouping = ((dot_count > 1) | ((dot_count > 0) & has_comma)).any()

    if dots_are_grouping:
        # NOTE(review): for values such as '5.555.180.358.886.710' the position of
        # the real decimal point is not recoverable from the text; removing the dots
        # keeps the digits but the magnitude should be verified against the source.
        text = text.str.replace('.', '', regex=False)
    text = text.str.replace(',', '.', regex=False)

    return pd.to_numeric(text, errors='coerce')


# Only touch the columns that actually exist, and report on those same columns
# so that a missing column cannot raise a KeyError in the summary below.
present_numeric_cols = [col for col in cols_to_clean_numeric if col in df.columns]

for col in present_numeric_cols:
    df[col] = _coerce_localized_numeric(df[col])

print("Tipos de datos después de la conversión:\n", df[present_numeric_cols].dtypes)

# Predictors: everything except the target ('sector'), the company identifier,
# the raw date strings and the 'investment' label.
X = df.drop(columns=['sector', 'company', 'date_BUY_fix', 'date_SELL_fix', 'investment'])
# 'Unnamed: 0' is the name pandas gives to a header-less CSV column; in the data
# preview it simply counts rows (0, 1, 2, ...), so it must not be a predictor.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')
# A column with no observed values cannot be mean-imputed; SimpleImputer drops
# such columns by default, which would make the column labels below mismatch.
X = X.dropna(axis=1, how='all')
y = df['sector']

# NOTE(review): the imputer and the selector below are fitted on every row,
# before the train/test split performed in the next cell, so the test rows
# influence both the imputation means and the choice of features. Fitting them
# on the training rows only (e.g. inside a Pipeline) would avoid that leakage.

# Replace missing values by the column mean.
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Back to a labelled frame; keep the original row index so rows stay aligned with y.
X_imputed_df = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)

# Keep the five features with the highest ANOVA F-score against the target
# (or all of them if fewer than five predictors are available).
selector = SelectKBest(score_func=f_classif, k=min(5, X_imputed_df.shape[1]))
X_new = selector.fit_transform(X_imputed_df, y)

# Names of the columns retained by the selector.
selected_features = X_imputed_df.columns[selector.get_support()]
print("Características seleccionadas:", selected_features)

Tipos de datos después de la conversión:
 price_BUY                     int64
price_SELL                    int64
expected_return (yearly)    float64
nominal_return              float64
Volatility_Buy                int64
Volatility_sell               int64
ESG_ranking                   int64
PE_ratio                      int64
EPS_ratio                     int64
PS_ratio                      int64
PB_ratio                      int64
NetProfitMargin_ratio         int64
current_ratio                 int64
roa_ratio                     int64
roe_ratio                     int64
dtype: object
Características seleccionadas: Index(['ESG_ranking', 'PS_ratio', 'NetProfitMargin_ratio', 'current_ratio',
       'roa_ratio'],
      dtype='object')


In [None]:

# Restrict the imputed data to the features chosen by the selector.
X_model = X_imputed_df[selected_features]

# Hold out 30% of the rows for testing, keeping the class proportions of y.
X_train, X_test, y_train, y_test = train_test_split(
    X_model,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Random forest with 100 trees and a fixed seed for repeatable results.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

y_pred = model.predict(X_test)

# NOTE(review): the recorded run reports 0.9975 accuracy. The split is made per
# row, so rows of the same company can fall in both train and test; if the
# selected features are constant per company this score may reflect memorised
# companies rather than generalisation to unseen ones -- confirm.
for heading, result in (
    ("🔹 Accuracy:", accuracy_score(y_test, y_pred)),
    ("\n🔹 Matriz de confusión:\n", confusion_matrix(y_test, y_pred)),
    ("\n🔹 Reporte de clasificación:\n", classification_report(y_test, y_pred)),
):
    print(heading, result)

🔹 Accuracy: 0.9975

🔹 Matriz de confusión:
 [[83  0  0  0  0]
 [ 0 93  0  0  0]
 [ 0  1 50  0  0]
 [ 0  0  0 90  0]
 [ 0  0  0  0 83]]

🔹 Reporte de clasificación:
               precision    recall  f1-score   support

        AUTO       1.00      1.00      1.00        83
        BANK       0.99      1.00      0.99        93
        FMCG       1.00      0.98      0.99        51
      RETAIL       1.00      1.00      1.00        90
        TECH       1.00      1.00      1.00        83

    accuracy                           1.00       400
   macro avg       1.00      1.00      1.00       400
weighted avg       1.00      1.00      1.00       400

