In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Cleaning the data

In [None]:
# Read the CSV and report the number of missing values in each column.
df = pd.read_csv('Financial Distress.csv')
print(df.isna().sum())

# Collapse to one row per 'Company' value by averaging every numeric column,
# restore 'Company' as a regular column, and discard the averaged 'Time'.
df = (
    df.groupby('Company')
    .mean(numeric_only=True)
    .reset_index()
    .drop(columns=['Time'])
)

# Preview the first five rows and show how many rows remain.
print(df.head())
print(df.shape[0])

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold

# --- Feature matrix and target ---------------------------------------------
# 'Company' is the grouping key that reset_index() put back as a column; it is
# an identifier, not a predictor, so it is excluded from X together with the
# target. Leaving it in would let the forest split on arbitrary company IDs.
X = df.drop(columns=['Financial Distress', 'Company']).to_numpy()
y = df['Financial Distress'].to_numpy()
features_count = X.shape[1]

# Hold out 20% of the rows as the final test set; the remaining 80% is used
# for 5-fold cross-validation. Seeds are fixed so the splits are reproducible.
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
folds = KFold(n_splits=5, shuffle=True, random_state=1)

# Per-fold metrics.
rmse_train_scores = []
r2_train_scores = []
rmse_val_scores = []
r2_val_scores = []

# Per-sample predictions gathered across folds (float buffers so predictions
# are never truncated).
# NOTE: every sample is in the training part of several folds, so
# y_cv_train_pred ends up holding the prediction from the last fold in which
# the sample was trained on. y_cv_val_pred is a true out-of-fold prediction,
# because each sample is in the validation part of exactly one fold.
y_cv_train_pred = np.zeros_like(y_trainval, dtype=float)
y_cv_val_pred = np.zeros_like(y_trainval, dtype=float)

rf = RandomForestRegressor(n_estimators=200, max_features='log2', max_depth=5, min_samples_split=5, min_samples_leaf=2, random_state=1)
for train_index, val_index in folds.split(X_trainval):
    X_train, X_val = X_trainval[train_index], X_trainval[val_index]
    y_train, y_val = y_trainval[train_index], y_trainval[val_index]

    # fit() rebuilds the forest from scratch on each call, so reusing the
    # same estimator object across folds does not leak information.
    rf.fit(X_train, y_train)

    # Fit quality on the data the model was trained on.
    y_train_pred = rf.predict(X_train)
    y_cv_train_pred[train_index] = y_train_pred
    rmse_train_scores.append(np.sqrt(mean_squared_error(y_train, y_train_pred)))
    r2_train_scores.append(r2_score(y_train, y_train_pred))

    # Generalisation to the held-out fold.
    y_val_pred = rf.predict(X_val)
    y_cv_val_pred[val_index] = y_val_pred
    rmse_val_scores.append(np.sqrt(mean_squared_error(y_val, y_val_pred)))
    r2_val_scores.append(r2_score(y_val, y_val_pred))

# Cross-validation summary: mean of the per-fold metrics.
print(f"Training error (RMSE): {np.mean(rmse_train_scores):.2f}")
print(f"Validation error (RMSE): {np.mean(rmse_val_scores):.2f}")
print(f"Training error (R2): {np.mean(r2_train_scores):.2f}")
print(f"Validation error (R2): {np.mean(r2_val_scores):.2f}")

# Refit on the full train+validation set and score once on the untouched
# test set.
rf.fit(X_trainval, y_trainval)
y_test_pred = rf.predict(X_test)
rf_mse_score = mean_squared_error(y_test, y_test_pred)
rf_rmse_score = np.sqrt(rf_mse_score)
rf_r2_score = r2_score(y_test, y_test_pred)

print(f"Test error (MSE): {rf_mse_score:.2f}")
print(f"Test error (RMSE): {rf_rmse_score:.2f}")
print(f"Test error (R2): {rf_r2_score:.2f}")

### Visualizing the data

In [None]:
# Plot the distribution of Financial Distress as a histogram.
# Missing values (if any) are excluded so they cannot poison the bin edges.
distress = df['Financial Distress'].dropna()

# Compute the range once with the vectorised Series reductions instead of
# iterating the Series with the builtin min()/max() on every use.
distress_min = distress.min()
distress_max = distress.max()

# Bin edges in steps of 0.5; the stop value is padded by one step so the
# last edge is not below the maximum.
bin_width = 0.5
bins = np.arange(distress_min, distress_max + bin_width, bin_width)

# Show histogram
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(distress, bins=bins, color="darkred", edgecolor='black', alpha=0.7)
ax.set_title(f"Distribution of Financial Distress: x ∈ [{distress_min:.2f}, {distress_max:.2f}]")
ax.set_xlabel('Financial Distress')
ax.set_ylabel('Number of Companies')
plt.show()



