# **High Inflation**

In [None]:
import pandas as pd

excel_file_path = "/content/sample_data/CW4_High Inflation - Melvin & Dhita.xlsx"
df = pd.read_excel(excel_file_path)

print("Heading of the data:")
display(df.head())

Heading of the data:


Unnamed: 0,id,Profitability,Probability,New Profitability,debt,RnD,Firm Age,Firm Size,CEO Experience,foreign sales,CEO education,CEO Gender,Marketing,Product Sentimet,Politician,PoB
0,1,0.02,0.579913,0.011598,23.76,53.09,3.43,21.22,2,9.16,2,0,0.43408,0.443087,0.141194,0
1,2,3.99,0.122876,0.490274,41.85,45.3,3.22,21.83,4,41.42,1,0,0.74006,0.860709,0.109703,0
2,3,41.98,0.514616,21.60356,61.84,51.16,3.18,20.54,3,5.88,3,1,0.194094,0.306597,0.388709,0
3,4,8.48,0.004706,0.039907,25.61,115.49,3.26,20.59,3,11.08,2,0,0.004992,0.400907,0.221843,0
4,5,-1.94,0.461866,-0.896021,74.24,13.57,3.26,18.73,3,24.61,1,0,0.381754,0.797531,0.515557,0


# Training the Model (Decision Tree Regression)

In [None]:
print(df.columns)

Index(['id', 'Profitability', 'Probability', 'New Profitability', 'debt',
       'RnD', 'Firm Age', 'Firm Size', 'CEO Experience', 'foreign sales',
       'CEO education', 'CEO Gender', 'Marketing', 'Product Sentimet',
       'Politician', 'PoB'],
      dtype='object')


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

#Prepare the data
features = ['debt', 'RnD', 'Firm Age', 'Firm Size', 'CEO Experience', 'foreign sales', 'CEO education', 'CEO Gender', 'Marketing', 'Product Sentimet', 'PoB']
target = 'New Profitability'

X = df[features]
y = df[target]

print("Features (X) heading:")
display(X.head())

print("\nTarget (Y) heading:")
display(Y.head())

# Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

print("\nTraining data shapes:")
print("X_train shape:", X_train.shape)
print("Y_train shape:", y_train.shape)

print("\nTesting data shapes:")
print("X_test shape:", X_test.shape)
print("Y_test shape:", y_test.shape)

# Train the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

print("\nRandom Forest model trained successfully.")

Features (X) heading:


Unnamed: 0,debt,RnD,Firm Age,Firm Size,CEO Experience,foreign sales,CEO education,CEO Gender,Marketing,Product Sentimet,PoB
0,23.76,53.09,3.43,21.22,2,9.16,2,0,0.43408,0.443087,0
1,41.85,45.3,3.22,21.83,4,41.42,1,0,0.74006,0.860709,0
2,61.84,51.16,3.18,20.54,3,5.88,3,1,0.194094,0.306597,0
3,25.61,115.49,3.26,20.59,3,11.08,2,0,0.004992,0.400907,0
4,74.24,13.57,3.26,18.73,3,24.61,1,0,0.381754,0.797531,0



Target (Y) heading:


Unnamed: 0,New Profitability
0,0.011598
1,0.490274
2,21.60356
3,0.039907
4,-0.896021



Training data shapes:
X_train shape: (97, 11)
Y_train shape: (97,)

Testing data shapes:
X_test shape: (25, 11)
Y_test shape: (25,)

Random Forest model trained successfully.


# Calculating MSE & R^2

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict on the test data
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"R-squared (R2): {r2:.2f}")

Mean Squared Error (MSE): 25.13
R-squared (R2): -1.16


# Feature Importance

In [None]:
# Get feature importances from the trained model
feature_importances = model.feature_importances_

# Get the names of the features
feature_names = X.columns

# Create a pandas Series for better visualization
feature_importance_series = pd.Series(feature_importances, index=feature_names)

# Sort the features by importance in descending order
sorted_feature_importances = feature_importance_series.sort_values(ascending=False)

print("Feature Importance Scores:")
display(sorted_feature_importances)

Feature Importance Scores:


Unnamed: 0,0
Firm Size,0.240645
CEO education,0.164822
CEO Experience,0.106961
debt,0.089095
RnD,0.083169
Product Sentimet,0.07682
Firm Age,0.067976
foreign sales,0.065573
Marketing,0.053042
CEO Gender,0.026929


# LASSO Feature Importance (if 0, remove it)

In [None]:
from sklearn.linear_model import Lasso
import pandas as pd

# Initialize and train the LASSO model
# We use a relatively small alpha (regularization strength) here;
# you might want to experiment with different alpha values.
lasso = Lasso(alpha=0.1, random_state=42)
lasso.fit(X, Y)

# Get the coefficients
lasso_coefficients = lasso.coef_

# Get the names of the features
feature_names = X.columns

# Create a pandas Series for better visualization
lasso_feature_importance = pd.Series(lasso_coefficients, index=feature_names)

# Sort the features by the absolute value of their coefficients in descending order
sorted_lasso_importance = lasso_feature_importance.abs().sort_values(ascending=False)

print("LASSO Coefficients (sorted by absolute value):")
display(lasso_feature_importance[sorted_lasso_importance.index])

LASSO Coefficients (sorted by absolute value):


Unnamed: 0,0
CEO Gender,2.818236
CEO Experience,1.158033
CEO education,0.794095
Marketing,0.715789
Firm Size,0.452606
PoB,-0.221153
debt,-0.044131
foreign sales,-0.015469
RnD,-0.000349
Firm Age,0.0


# OLS Regression (if P>|z| under 0.1, keep it)

In [None]:
import statsmodels.api as sm

# Add a constant (intercept) to the features
X_with_intercept = sm.add_constant(X)

# Initialize and train the OLS model
# We use the 'hc1' option for robust standard errors (White's standard errors)
ols_model = sm.OLS(Y, X_with_intercept)
results = ols_model.fit(cov_type='hc1')

# Print the summary of the regression results
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:      New Profitability   R-squared:                       0.198
Model:                            OLS   Adj. R-squared:                  0.118
Method:                 Least Squares   F-statistic:                     1.736
Date:                Tue, 18 Nov 2025   Prob (F-statistic):             0.0745
Time:                        08:57:18   Log-Likelihood:                -367.02
No. Observations:                 122   AIC:                             758.0
Df Residuals:                     110   BIC:                             791.7
Df Model:                          11                                         
Covariance Type:                  hc1                                         
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const              -12.8220      8.637  

# PCA (If larger than 0.7, keep it)


In [None]:
from sklearn.decomposition import PCA
import pandas as pd

# Define the features (X)
X = df[features]

# Initialize and fit PCA
# You can specify the number of components, or leave it as None to keep all components
pca = PCA()
pca.fit(X)

# Get the loading factors (components)
# The loading factors are the eigenvectors of the covariance matrix
loading_factors = pca.components_

# Create a DataFrame for better visualization of loading factors
loading_factors_df = pd.DataFrame(loading_factors, columns=X.columns, index=[f'PC{i+1}' for i in range(loading_factors.shape[0])])

print("Principal Component Analysis Loading Factors:")
print(loading_factors_df)

Principal Component Analysis Loading Factors:
          debt           RnD  Firm Age  Firm Size  CEO Experience  \
PC1  -0.002216  9.931249e-01  0.000434   0.000475        0.001620   
PC2  -0.007652  1.169902e-01 -0.002204  -0.001811        0.002525   
PC3   0.999930  3.104558e-03 -0.001244  -0.004375       -0.003073   
PC4   0.004327 -4.719162e-04  0.056307   0.991646        0.067326   
PC5  -0.001447  1.748275e-05 -0.045801   0.008410       -0.617958   
PC6   0.002931 -2.048793e-03  0.565149  -0.097386        0.620285   
PC7   0.003997  3.748678e-04  0.316198   0.066229       -0.182277   
PC8  -0.003317  1.214357e-03  0.743437  -0.024448       -0.404124   
PC9   0.004456  4.950239e-04  0.070842   0.012392       -0.079751   
PC10 -0.000674  4.662870e-04 -0.087509   0.043784        0.037502   
PC11 -0.000893  7.996973e-07 -0.099880   0.002356        0.156576   

      foreign sales  CEO education  CEO Gender  Marketing  Product Sentimet  \
PC1       -0.117011       0.001569    0.000619

# **Good Economy**

In [None]:
import pandas as pd

excel_file_path = "/content/sample_data/CW4_Good Economy - Melvin & Dhita.xlsx"
df = pd.read_excel(excel_file_path)

print("Heading of the data:")
display(df.head())

Heading of the data:


Unnamed: 0,id,Profitability,Probability,New Profitability,debt,RnD,Firm Age,Firm Size,CEO Experience,foreign sales,CEO education,CEO Gender,Marketing,Product Sentimet,Politician,PoB
0,1,1.638487,0.579913,0.95018,23.76,53.09,3.43,21.22,2,9.16,2,0,0.43408,0.443087,0.141194,0
1,2,1.053044,0.122876,0.129394,41.85,45.3,3.22,21.83,4,41.42,1,0,0.74006,0.860709,0.109703,0
2,3,1.049475,0.514616,0.540076,61.84,51.16,3.18,20.54,3,5.88,3,1,0.194094,0.306597,0.388709,0
3,4,1.999807,0.004706,0.009411,25.61,115.49,3.26,20.59,3,11.08,2,0,0.004992,0.400907,0.221843,0
4,5,1.382063,0.461866,0.638328,74.24,13.57,3.26,18.73,3,24.61,1,0,0.381754,0.797531,0.515557,0


Training the Model (Decision Tree Regression)

In [None]:
print(df.columns)

Index(['id', 'Profitability', 'Probability', 'New Profitability', 'debt',
       'RnD', 'Firm Age', 'Firm Size', 'CEO Experience', 'foreign sales',
       'CEO education', 'CEO Gender', 'Marketing', 'Product Sentimet',
       'Politician', 'PoB'],
      dtype='object')


In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

#Prepare the data
features = ['debt', 'RnD', 'Firm Age', 'Firm Size', 'CEO Experience', 'foreign sales', 'CEO education', 'CEO Gender', 'Marketing', 'Product Sentimet', 'PoB']
target = 'New Profitability'

X = df[features]
y = df[target]

print("Features (X) heading:")
display(X.head())

print("\nTarget (y) heading:")
display(y.head())

# Split the data into training and testing sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nTraining data shapes:")
print("X_train shape:", X_train.shape)
print("Y_train shape:", y_train.shape)

print("\nTesting data shapes:")
print("X_test shape:", X_test.shape)
print("Y_test shape:", y_test.shape)

# Train the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

print("\nRandom Forest model trained successfully.")

Features (X) heading:


Unnamed: 0,debt,RnD,Firm Age,Firm Size,CEO Experience,foreign sales,CEO education,CEO Gender,Marketing,Product Sentimet,PoB
0,23.76,53.09,3.43,21.22,2,9.16,2,0,0.43408,0.443087,0
1,41.85,45.3,3.22,21.83,4,41.42,1,0,0.74006,0.860709,0
2,61.84,51.16,3.18,20.54,3,5.88,3,1,0.194094,0.306597,0
3,25.61,115.49,3.26,20.59,3,11.08,2,0,0.004992,0.400907,0
4,74.24,13.57,3.26,18.73,3,24.61,1,0,0.381754,0.797531,0



Target (y) heading:


Unnamed: 0,New Profitability
0,0.95018
1,0.129394
2,0.540076
3,0.009411
4,0.638328



Training data shapes:
X_train shape: (97, 11)
Y_train shape: (97,)

Testing data shapes:
X_test shape: (25, 11)
Y_test shape: (25,)

Random Forest model trained successfully.


# Calculating MSE & R^2

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Make predictions on the test set
Y_pred = model.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse:.2f}")

# Calculate R-squared score
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R2) score: {r2:.2f}")

Mean Squared Error (MSE): 10.47
R-squared (R2) score: -63.96


# Feature Importance

In [None]:
# Get feature importance from the trained model
feature_importances = model.feature_importances_

# Get the names of the features
feature_names = X.columns

# Create a pandas Series for better visualization
feature_importance_series = pd.Series(feature_importances, index=feature_names)

# Sort the features by importance in descending order
sorted_feature_importance = feature_importance_series.sort_values(ascending=False)

print("Feature Importance Scores:")
display(sorted_feature_importance)

Feature Importance Scores:


Unnamed: 0,0
debt,0.183677
Firm Size,0.154843
RnD,0.13113
Marketing,0.128193
foreign sales,0.115457
Product Sentimet,0.108818
Firm Age,0.106167
PoB,0.021193
CEO education,0.020279
CEO Experience,0.015153


# LASSO Feature Importance (if 0, remove it)

In [None]:
from sklearn.linear_model import Lasso
import pandas as pd

# Initialize and train the LASSO model
# We use a relatively small alpha (regularization strength) here;
# you might want to experiment with different alpha values.
lasso = Lasso(alpha=0.1, random_state=42)
lasso.fit(X, Y)

# Get the coefficients
lasso_coefficients = lasso.coef_

# Get the names of the features
feature_names = X.columns

# Create a pandas Series for better visualization
lasso_feature_importance = pd.Series(lasso_coefficients, index=feature_names)

# Sort the features by the absolute value of their coefficients in descending order
sorted_lasso_importance = lasso_feature_importance.abs().sort_values(ascending=False)

print("LASSO Coefficients (sorted by absolute value):")
display(lasso_feature_importance[sorted_lasso_importance.index])

LASSO Coefficients (sorted by absolute value):


Unnamed: 0,0
CEO Gender,2.818236
CEO Experience,1.158033
CEO education,0.794095
Marketing,0.715789
Firm Size,0.452606
PoB,-0.221153
debt,-0.044131
foreign sales,-0.015469
RnD,-0.000349
Firm Age,0.0


# OLS Regression (if P>|z| under 0.1, keep it)

In [None]:
import statsmodels.api as sm

# Add a constant (intercept) to the features
X_with_intercept = sm.add_constant(X)

# Initialize and train the OLS model
# We use the 'hc1' option for robust standard errors (White's standard errors)
ols_model = sm.OLS(Y, X_with_intercept)
results = ols_model.fit(cov_type='hc1')

# Print the summary of the regression results
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:      New Profitability   R-squared:                       0.198
Model:                            OLS   Adj. R-squared:                  0.118
Method:                 Least Squares   F-statistic:                     1.736
Date:                Tue, 18 Nov 2025   Prob (F-statistic):             0.0745
Time:                        08:57:19   Log-Likelihood:                -367.02
No. Observations:                 122   AIC:                             758.0
Df Residuals:                     110   BIC:                             791.7
Df Model:                          11                                         
Covariance Type:                  hc1                                         
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const              -12.8220      8.637  

# PCA (If larger than 0.7, keep it)



In [None]:
from sklearn.decomposition import PCA
import pandas as pd

# Define the features (X)
X = df[features]

# Initialize and fit PCA
# You can specify the number of components, or leave it as None to keep all components
pca = PCA()
pca.fit(X)

# Get the loading factors (components)
# The loading factors are the eigenvectors of the covariance matrix
loading_factors = pca.components_

# Create a DataFrame for better visualization of loading factors
loading_factors_df = pd.DataFrame(loading_factors, columns=X.columns, index=[f'PC{i+1}' for i in range(loading_factors.shape[0])])

print("Principal Component Analysis Loading Factors:")
print(loading_factors_df)

Principal Component Analysis Loading Factors:
          debt           RnD  Firm Age  Firm Size  CEO Experience  \
PC1  -0.002216  9.931249e-01  0.000434   0.000475        0.001620   
PC2  -0.007652  1.169902e-01 -0.002204  -0.001811        0.002525   
PC3   0.999930  3.104558e-03 -0.001244  -0.004375       -0.003073   
PC4   0.004327 -4.719162e-04  0.056307   0.991646        0.067326   
PC5  -0.001447  1.748275e-05 -0.045801   0.008410       -0.617958   
PC6   0.002931 -2.048793e-03  0.565149  -0.097386        0.620285   
PC7   0.003997  3.748678e-04  0.316198   0.066229       -0.182277   
PC8  -0.003317  1.214357e-03  0.743437  -0.024448       -0.404124   
PC9   0.004456  4.950239e-04  0.070842   0.012392       -0.079751   
PC10 -0.000674  4.662870e-04 -0.087509   0.043784        0.037502   
PC11 -0.000893  7.996973e-07 -0.099880   0.002356        0.156576   

      foreign sales  CEO education  CEO Gender  Marketing  Product Sentimet  \
PC1       -0.117011       0.001569    0.000619