In [59]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder


# Load the raw dataset from the CSV that sits next to the notebook.
data = pd.read_csv('datasets2.csv')

# Use one encoder per categorical column. A single shared LabelEncoder is
# refit by every fit_transform call, so after encoding 'field' it would no
# longer hold the 'industry' classes and those integer codes could not be
# mapped back to their labels (inverse_transform) or reused on new data.
industry_encoder = LabelEncoder()
field_encoder = LabelEncoder()

# Replace the label columns in place with their integer codes.
data['industry'] = industry_encoder.fit_transform(data['industry'])
data['field'] = field_encoder.fit_transform(data['field'])

# Backward-compatible alias: the original cell left `label_encoder` fitted on
# the 'field' column, so keep that name bound to the same fitted state.
label_encoder = field_encoder


In [60]:
# Columns used as model inputs, listed once so the selection is easy to read.
feature_columns = ['industry', 'field', 'year', 'quarter']
X = data[feature_columns]

# Column the regressor is trained to predict.
y = data['growth_rate']


In [61]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [62]:
from sklearn.ensemble import RandomForestRegressor


# Baseline forest: 300 trees, seeded so repeated runs build the same model.
model = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
)

# Train on the training split. Kept as the cell's last expression so the
# notebook still displays the fitted estimator.
model.fit(X_train, y_train)


In [63]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict the target for the held-out rows.
y_pred = model.predict(X_test)

# Score the predictions: MSE is the average squared error, R2 is the
# coefficient of determination (a goodness-of-fit score, not an accuracy).
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report both metrics, one per line.
print(f"Mean Squared Error: {mse}", f"R2 Score: {r2}", sep="\n")


Mean Squared Error: 26.141715020846206
R2 Score: 0.5730129571060693


In [64]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Hyperparameter values to search exhaustively (3 * 4 * 3 = 36 combinations).
param_grid = {
    'n_estimators': [200, 300, 500],       # Number of trees in the forest
    'max_depth': [10, 20, 30, None],       # Maximum depth of each tree (None = unlimited)
    'min_samples_split': [2, 5, 10]        # Minimum number of samples required to split a node
}

# Template estimator for the search. It gets its own name rather than
# rebinding `model`: GridSearchCV clones the estimator and never fits the
# object passed in, so reusing `model` would replace the fitted baseline with
# an unfitted forest and break any re-run of the baseline evaluation cell.
tuning_estimator = RandomForestRegressor(random_state=42)

# 5-fold cross-validated grid search scored by R2, using all available cores.
grid_search = GridSearchCV(
    estimator=tuning_estimator,
    param_grid=param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
)

# Run the search on the training split only; the test split stays unseen.
grid_search.fit(X_train, y_train)

# best_score_ is the mean cross-validated R2 of the winning combination on the
# training folds, so it is not directly comparable to the held-out R2 below.
print("Best parameters found:", grid_search.best_params_)
print("Best R2 score from tuning:", grid_search.best_score_)

# With the default refit=True, best_estimator_ is the winning configuration
# refit on the whole training split.
best_model = grid_search.best_estimator_

# Make predictions for the held-out rows with the tuned model.
y_pred_tuned = best_model.predict(X_test)

# Evaluate the tuned model with the same metrics as the baseline.
mse_tuned = mean_squared_error(y_test, y_pred_tuned)
r2_tuned = r2_score(y_test, y_pred_tuned)

print(f"Tuned Mean Squared Error: {mse_tuned}")
print(f"Tuned R2 Score: {r2_tuned}")


Best parameters found: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 200}
Best R2 score from tuning: 0.3364764806819035
Tuned Mean Squared Error: 25.543549899835178
Tuned R2 Score: 0.5827831177852412
