In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.ensemble import GradientBoostingRegressor



In [4]:
df = pd.read_csv("df_reg.csv")

In [5]:
df

Unnamed: 0,Valuation,Industry,Country,Year Founded,Funding,ROI
0,180000,1,9,2012,8000,172000
1,100000,13,44,2002,7000,93000
2,100000,6,9,2008,2000,98000
3,95000,8,44,2010,2000,93000
4,46000,8,38,2005,4000,42000
...,...,...,...,...,...,...
1057,1000,6,9,2012,379,621
1058,1000,6,9,2015,990,10
1059,1000,3,9,2018,80,920
1060,1000,8,43,2005,792,208


#### El problema que veo en estos datos es que tenemos muy pocas muestras y eso afecta al modelo futuro, podemos intentar solucionarlo creando categorias nuevas:


In [6]:
#años desde que se fundó
df['Years Since Founded'] = pd.Timestamp.now().year - df['Year Founded']

#interacción entre la industria y el país
df['Industry_Country'] = df['Industry'].astype(str) + "_" + df['Country'].astype(str)

# ratio de financiación y la edad de la empresa
df['Funding_Age_Ratio'] = df['Funding'] / df['Years Since Founded']

#interacción entre la industria y la financiación
df['Industry_Funding'] = df['Industry'].astype(str) + "_" + df['Funding'].astype(str)

In [7]:
df

Unnamed: 0,Valuation,Industry,Country,Year Founded,Funding,ROI,Years Since Founded,Industry_Country,Funding_Age_Ratio,Industry_Funding
0,180000,1,9,2012,8000,172000,11,1_9,727.272727,1_8000
1,100000,13,44,2002,7000,93000,21,13_44,333.333333,13_7000
2,100000,6,9,2008,2000,98000,15,6_9,133.333333,6_2000
3,95000,8,44,2010,2000,93000,13,8_44,153.846154,8_2000
4,46000,8,38,2005,4000,42000,18,8_38,222.222222,8_4000
...,...,...,...,...,...,...,...,...,...,...
1057,1000,6,9,2012,379,621,11,6_9,34.454545,6_379
1058,1000,6,9,2015,990,10,8,6_9,123.750000,6_990
1059,1000,3,9,2018,80,920,5,3_9,16.000000,3_80
1060,1000,8,43,2005,792,208,18,8_43,44.000000,8_792


In [8]:
num_rows_to_generate = 90000

min_values = df.min()
max_values = df.max()

new_rows = df.sample(n=num_rows_to_generate, replace=True)

for column in df.select_dtypes(include=[np.number]).columns:
    noise = np.random.normal(0, 1, new_rows[column].shape)
    new_rows[column] += noise
    new_rows[column] = np.clip(new_rows[column], min_values[column], max_values[column])

new_rows['Valuation'] = np.clip(new_rows['Valuation'], None, df['Valuation'].max())

df = pd.concat([df, new_rows], ignore_index=True)


In [9]:
df

Unnamed: 0,Valuation,Industry,Country,Year Founded,Funding,ROI,Years Since Founded,Industry_Country,Funding_Age_Ratio,Industry_Funding
0,180000.000000,1.000000,9.000000,2012.000000,8000.000000,172000.000000,11.000000,1_9,727.272727,1_8000
1,100000.000000,13.000000,44.000000,2002.000000,7000.000000,93000.000000,21.000000,13_44,333.333333,13_7000
2,100000.000000,6.000000,9.000000,2008.000000,2000.000000,98000.000000,15.000000,6_9,133.333333,6_2000
3,95000.000000,8.000000,44.000000,2010.000000,2000.000000,93000.000000,13.000000,8_44,153.846154,8_2000
4,46000.000000,8.000000,38.000000,2005.000000,4000.000000,42000.000000,18.000000,8_38,222.222222,8_4000
...,...,...,...,...,...,...,...,...,...,...
91057,1002.182614,8.460198,45.000000,2012.880596,161.766394,839.199318,12.608856,8_44,15.158421,8_161
91058,1000.291331,0.648173,43.640355,2016.072733,165.116176,833.091548,7.821003,1_44,22.486228,1_166
91059,3001.205771,13.170315,43.795953,2013.984388,330.557982,2669.516931,9.034992,11_44,35.653642,11_330
91060,4999.175066,5.549055,25.694317,2009.189789,562.591383,4437.991419,14.394341,6_25,38.962622,6_562


In [10]:
#Eliminamos outliers con ml

from sklearn.ensemble import IsolationForest


anomaly_detector = IsolationForest(contamination='auto', random_state=357)
anomaly_detector.fit(df.select_dtypes(include=[np.number]))

anomaly_mask = anomaly_detector.predict(df.select_dtypes(include=[np.number])) == -1
anomaly_indices = np.where(anomaly_mask)[0]
normal_indices = np.where(~anomaly_mask)[0]

df_clean = df.iloc[normal_indices, :]
df_anomaly = df.iloc[anomaly_indices, :]
df_anomaly["Is_anomaly"] = True

df_final = pd.concat([df_clean, df_anomaly], axis=0).sort_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_anomaly["Is_anomaly"] = True


### modelado

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

num_cols = ['Years Since Founded', 'Funding', 'Funding_Age_Ratio']
cat_cols = ['Industry', 'Country', 'Industry_Country', 'Industry_Funding']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)])

X = df_final.drop('Valuation', axis=1)
y = df_final['Valuation']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [30, 50,100],
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [3, 5, 7],
    #'min_samples_split': [2, 4, 6],
    #'min_samples_leaf': [1, 3, 5],
    #'subsample': [0.8, 0.9, 1.0]
}

model = GradientBoostingRegressor()

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1)

grid_search.fit(X_train_preprocessed, y_train)

print(grid_search.best_params_)

y_pred = grid_search.predict(X_test_preprocessed)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'MAE: {mae}')
print(f'R^2: {r2}')



In [None]:
#guardamos el modelo 
save_model( y_pred, 'ml_unicorns')