In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.ensemble import GradientBoostingRegressor



In [2]:
df = pd.read_csv("df_reg.csv")

In [3]:
df

Unnamed: 0,Valuation,Industry,Country,Year Founded,Funding
0,180000,Artificial intelligence,China,2012,8000
1,100000,Other,United States,2002,7000
2,100000,E-commerce & direct-to-consumer,China,2008,2000
3,95000,Fintech,United States,2010,2000
4,46000,Fintech,Sweden,2005,4000
...,...,...,...,...,...
1057,1000,E-commerce & direct-to-consumer,China,2012,379
1058,1000,E-commerce & direct-to-consumer,China,2015,990
1059,1000,Consumer & retail,China,2018,80
1060,1000,Fintech,United Kingdom,2005,792


#### El problema que veo en estos datos es que tenemos muy pocas muestras y eso afecta al modelo futuro, podemos intentar solucionarlo creando categorias nuevas:


In [4]:
#años desde que se fundó
df['Years Since Founded'] = pd.Timestamp.now().year - df['Year Founded']

#interacción entre la industria y el país
df['Industry_Country'] = df['Industry'].astype(str) + "_" + df['Country'].astype(str)

# ratio de financiación y la edad de la empresa
df['Funding_Age_Ratio'] = df['Funding'] / df['Years Since Founded']

#interacción entre la industria y la financiación
df['Industry_Funding'] = df['Industry'].astype(str) + "_" + df['Funding'].astype(str)

In [5]:
df

Unnamed: 0,Valuation,Industry,Country,Year Founded,Funding,Years Since Founded,Industry_Country,Funding_Age_Ratio,Industry_Funding
0,180000,Artificial intelligence,China,2012,8000,11,Artificial intelligence_China,727.272727,Artificial intelligence_8000
1,100000,Other,United States,2002,7000,21,Other_United States,333.333333,Other_7000
2,100000,E-commerce & direct-to-consumer,China,2008,2000,15,E-commerce & direct-to-consumer_China,133.333333,E-commerce & direct-to-consumer_2000
3,95000,Fintech,United States,2010,2000,13,Fintech_United States,153.846154,Fintech_2000
4,46000,Fintech,Sweden,2005,4000,18,Fintech_Sweden,222.222222,Fintech_4000
...,...,...,...,...,...,...,...,...,...
1057,1000,E-commerce & direct-to-consumer,China,2012,379,11,E-commerce & direct-to-consumer_China,34.454545,E-commerce & direct-to-consumer_379
1058,1000,E-commerce & direct-to-consumer,China,2015,990,8,E-commerce & direct-to-consumer_China,123.750000,E-commerce & direct-to-consumer_990
1059,1000,Consumer & retail,China,2018,80,5,Consumer & retail_China,16.000000,Consumer & retail_80
1060,1000,Fintech,United Kingdom,2005,792,18,Fintech_United Kingdom,44.000000,Fintech_792


In [6]:
num_rows_to_generate = 90000

min_values = df.min()
max_values = df.max()

new_rows = df.sample(n=num_rows_to_generate, replace=True)

for column in df.select_dtypes(include=[np.number]).columns:
    noise = np.random.normal(0, 1, new_rows[column].shape)
    new_rows[column] += noise
    new_rows[column] = np.clip(new_rows[column], min_values[column], max_values[column])

new_rows['Valuation'] = np.clip(new_rows['Valuation'], None, df['Valuation'].max())

df = pd.concat([df, new_rows], ignore_index=True)


In [7]:
df

Unnamed: 0,Valuation,Industry,Country,Year Founded,Funding,Years Since Founded,Industry_Country,Funding_Age_Ratio,Industry_Funding
0,180000.000000,Artificial intelligence,China,2012.000000,8000.000000,11.000000,Artificial intelligence_China,727.272727,Artificial intelligence_8000
1,100000.000000,Other,United States,2002.000000,7000.000000,21.000000,Other_United States,333.333333,Other_7000
2,100000.000000,E-commerce & direct-to-consumer,China,2008.000000,2000.000000,15.000000,E-commerce & direct-to-consumer_China,133.333333,E-commerce & direct-to-consumer_2000
3,95000.000000,Fintech,United States,2010.000000,2000.000000,13.000000,Fintech_United States,153.846154,Fintech_2000
4,46000.000000,Fintech,Sweden,2005.000000,4000.000000,18.000000,Fintech_Sweden,222.222222,Fintech_4000
...,...,...,...,...,...,...,...,...,...
91057,4999.073239,Internet software & services,India,2005.642756,869.947836,16.271375,Internet software & services_India,54.452253,Internet software & services_869
91058,2000.380945,Fintech,United States,2020.717753,101.061438,3.210535,Fintech_United States,48.079120,Fintech_100
91059,1000.000000,Fintech,Switzerland,1998.366200,104.697307,24.554261,Fintech_Switzerland,4.291826,Fintech_105
91060,3999.822232,Health,Germany,1921.819723,0.000000,103.411864,Health_Germany,0.256804,Health_0


In [8]:
#Eliminamos outliers con ml

from sklearn.ensemble import IsolationForest


anomaly_detector = IsolationForest(contamination='auto', random_state=357)
anomaly_detector.fit(df.select_dtypes(include=[np.number]))

anomaly_mask = anomaly_detector.predict(df.select_dtypes(include=[np.number])) == -1
anomaly_indices = np.where(anomaly_mask)[0]
normal_indices = np.where(~anomaly_mask)[0]

df_clean = df.iloc[normal_indices, :]
df_anomaly = df.iloc[anomaly_indices, :]
df_anomaly["Is_anomaly"] = True

df_final = pd.concat([df_clean, df_anomaly], axis=0).sort_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_anomaly["Is_anomaly"] = True


### modelado

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

num_cols = ['Years Since Founded', 'Funding', 'Funding_Age_Ratio']
cat_cols = ['Industry', 'Country', 'Industry_Country', 'Industry_Funding']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)])

X = df_final.drop('Valuation', axis=1)
y = df_final['Valuation']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [30, 50,100],
    'learning_rate': [0.01, 0.1, 1],
    'max_depth': [3, 5, 7],
    #'min_samples_split': [2, 4, 6],
    #'min_samples_leaf': [1, 3, 5],
    #'subsample': [0.8, 0.9, 1.0]
}

model = GradientBoostingRegressor()

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1)

grid_search.fit(X_train_preprocessed, y_train)

print(grid_search.best_params_)

y_pred = grid_search.predict(X_test_preprocessed)

mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse}')
print(f'MAE: {mae}')
print(f'R^2: {r2}')



{'learning_rate': 1, 'max_depth': 7, 'n_estimators': 100}
MSE: 163197.91899174277
MAE: 87.57261476615876
R^2: 0.9976065688887485
