In [2]:
import pandas as pd
import numpy as np

### Cleaning the data

In [3]:
# Load the raw CSV; it carries `Company` and `Time` columns next to the
# target (`Financial Distress`) and the x1..x83 predictors.
df = pd.read_csv('Financial Distress.csv')

# Per-column count of missing values, printed before any transformation.
print(df.isnull().sum())

# Collapse to one row per company: average every numeric column over that
# company's rows, turn the group key back into a regular column, and discard
# the averaged `Time` column.
df = (
    df.groupby('Company')
    .mean(numeric_only=True)
    .reset_index()
    .drop(columns=['Time'])
)

# Quick look at the aggregated frame and its row count.
print(df.head(5))
print(len(df))

Company               0
Time                  0
Financial Distress    0
x1                    0
x2                    0
                     ..
x79                   0
x80                   0
x81                   0
x82                   0
x83                   0
Length: 86, dtype: int64
   Company  Financial Distress        x1        x2        x3        x4  \
0        1           -0.334323  1.179250 -0.011305  0.869128  0.940075   
1        2            1.966056  1.539892  0.204816  0.628511  0.931229   
2        3           -1.659900  0.874400 -0.034676  0.793500  0.609520   
3        4            0.839656  1.553275  0.138410  0.462178  0.759583   
4        5            1.969673  1.127500  0.107643  0.743549  0.449420   

         x5        x6        x7        x8  ...        x74      x75        x76  \
0  0.035843  0.126302  0.564090 -0.018738  ...  92.050750  33.5625  32.486500   
1  0.302304  0.251645  1.068073  0.218296  ...  86.854643  92.1600  89.237286   
2 -0.002632 -0.086847  

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold

# --- Feature matrix and target ----------------------------------------------
# Every column except the target becomes a predictor. `DataFrame.to_numpy()`
# already returns a 2-D (rows, columns) array, so no reshape is needed, and
# `columns=` alone selects the axis for `drop`.
# NOTE(review): the predictors still include `Company`, which the cleaning
# cell uses as its group-by key (an identifier rather than a measurement).
# It is kept so that the shape of X and the reported scores do not change,
# but it should probably be excluded from the features -- confirm and drop it
# deliberately if so.
features_frame = df.drop(columns=['Financial Distress'])
features_count = features_frame.shape[1]
X = features_frame.to_numpy()
y = df['Financial Distress'].to_numpy()

# Hold out 20% of the rows as a final test set; the remaining 80% is used for
# 5-fold cross-validation. Both splits are seeded so the run is repeatable.
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
folds = KFold(n_splits=5, shuffle=True, random_state=1)

# Per-fold metrics, plus arrays of predictions aligned with y_trainval.
rmse_train_scores = []
r2_train_scores = []
y_cv_train_pred = np.zeros_like(y_trainval)
rmse_val_scores = []
r2_val_scores = []
y_cv_val_pred = np.zeros_like(y_trainval)

# One estimator is refit on every fold and, after the loop, on the full
# train+validation set.
rf = RandomForestRegressor(
    n_estimators=200,
    max_features='log2',
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=1,
)

for train_index, val_index in folds.split(X_trainval):
    X_train, X_val = X_trainval[train_index], X_trainval[val_index]
    y_train, y_val = y_trainval[train_index], y_trainval[val_index]
    rf.fit(X_train, y_train)

    # Training-split performance for this fold.
    y_train_pred = rf.predict(X_train)
    # NOTE(review): with 5 folds each row is in the training split of four
    # folds, so these assignments overwrite one another and the array ends up
    # holding whichever fold wrote each row last. Only the validation array
    # below contains exactly one (out-of-fold) prediction per row.
    y_cv_train_pred[train_index] = y_train_pred
    rmse_train_scores.append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
    r2_train_scores.append(r2_score(y_train, y_train_pred))

    # Validation-split (out-of-fold) performance for this fold.
    y_val_pred = rf.predict(X_val)
    y_cv_val_pred[val_index] = y_val_pred
    rmse_val_scores.append(np.sqrt(mean_squared_error(y_val, y_val_pred)))
    r2_val_scores.append(r2_score(y_val, y_val_pred))

# Report the mean of the per-fold metrics. R2 is a goodness-of-fit score
# (higher is better); the "error" wording in the labels is left untouched so
# the printed output stays comparable with earlier runs.
print(f"Training error (RMSE): {np.mean(rmse_train_scores):.2f}")
print(f"Validation error (RMSE): {np.mean(rmse_val_scores):.2f}")
print(f"Training error (R2): {np.mean(r2_train_scores):.2f}")
print(f"Validation error (R2): {np.mean(r2_val_scores):.2f}")

# --- Final evaluation --------------------------------------------------------
# Refit on all train+validation rows, then score once on the held-out test set.
rf.fit(X_trainval, y_trainval)
y_test_pred = rf.predict(X_test)
rf_mse_score = mean_squared_error(y_test, y_test_pred)
rf_rmse_score = np.sqrt(rf_mse_score)  # reuse the MSE instead of recomputing it
rf_r2_score = r2_score(y_test, y_test_pred)

print(f"Test error (MSE): {rf_mse_score:.2f}")
print(f"Test error (RMSE): {rf_rmse_score:.2f}")
print(f"Test error (R2): {rf_r2_score:.2f}")

Training error (RMSE): 1.19
Validation error (RMSE): 1.38
Training error (R2): 0.70
Validation error (R2): 0.53
Test error (MSE): 0.40
Test error (RMSE): 0.63
Test error (R2): 0.68


### Visualize the data