In [1]:
#Importing our dependencies

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [2]:
# Read the prepared ML dataset from disk into a pandas DataFrame
# (the path is relative to the notebook's working directory).

data = pd.read_csv(filepath_or_buffer='../CSV_Output/ML_dataset.csv')
data

Unnamed: 0,industry,wr1,wr2,wr3,wr4,vol1,vol2,vol3,vol4,pe_ratio,...,profit_margin,free_cash_margin,volatility,cpi,interest_rate,unemployment_rate,sma,rsi,ema,label
0,Consumer Discretionary,-0.019375,0.042774,-0.009555,-0.007692,6230100.0,6996900.0,6377700.0,7026700.0,30.01,...,14.40,13.74,0.025251,241.428,0.40,5.0,56.240000,8.149406,56.300000,0
1,Utilities,0.025644,0.011924,-0.006826,0.031470,175800.0,195300.0,233900.0,221400.0,17.58,...,11.15,1.49,0.013118,238.132,0.36,5.0,73.803333,87.796610,73.714583,0
2,Real Estate,0.057182,-0.004482,-0.040843,0.036994,1298900.0,3493600.0,571100.0,832500.0,62.13,...,12.97,29.98,0.019104,237.838,0.13,5.0,23.196667,88.260870,23.240833,0
3,Industrials,-0.044857,0.062964,0.055392,-0.058078,1071300.0,880100.0,1080400.0,994800.0,6.68,...,27.43,-18.14,0.023650,305.691,5.08,3.5,235.736667,11.037986,235.240833,0
4,Consumer Staples,0.022806,-0.021995,0.021842,-0.028000,6563300.0,6745100.0,9474700.0,10172500.0,10.28,...,4.55,10.21,0.019597,259.101,0.10,10.2,63.633333,72.448980,63.837083,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1909,Consumer Discretionary,0.011118,0.019845,-0.060006,-0.000558,1252700.0,1125100.0,1055100.0,1414500.0,21.36,...,3.96,3.60,0.017376,243.603,0.66,4.6,34.243333,85.278276,34.327500,1
1910,Health Care,0.000624,0.003380,0.026196,0.020623,1086600.0,1062100.0,959900.0,700900.0,9.51,...,17.24,23.71,0.019483,267.054,0.07,6.1,127.980000,54.818523,128.192500,0
1911,Consumer Discretionary,-0.006360,0.040154,-0.038746,-0.026070,894900.0,935100.0,490500.0,490500.0,9.66,...,3.87,9.40,0.016327,251.712,2.40,4.0,66.236667,56.290439,66.580000,1
1912,Industrials,-0.005385,0.010020,0.101545,-0.066394,4106200.0,3906300.0,12259700.0,12259700.0,5.44,...,12.06,-4.79,0.027206,305.109,5.07,3.6,35.120000,43.270869,35.242500,0


In [3]:
# One-hot encode the categorical 'industry' column.
#
# Incomplete rows are removed *before* encoding. With its default
# dummy_na=False, get_dummies turns a missing 'industry' into a row of
# all-zero indicator columns; a dropna() issued afterwards can no longer
# see that the value was missing and would silently keep the row.

data_encoded = pd.get_dummies(data.dropna(), columns=['industry'])
data_encoded

Unnamed: 0,wr1,wr2,wr3,wr4,vol1,vol2,vol3,vol4,pe_ratio,debt_to_equity,...,industry_Consumer Staples,industry_Energy,industry_Finance,industry_Health Care,industry_Industrials,industry_Miscellaneous,industry_Real Estate,industry_Technology,industry_Telecommunications,industry_Utilities
0,-0.019375,0.042774,-0.009555,-0.007692,6230100.0,6996900.0,6377700.0,7026700.0,30.01,0.63,...,0,0,0,0,0,0,0,0,0,0
1,0.025644,0.011924,-0.006826,0.031470,175800.0,195300.0,233900.0,221400.0,17.58,0.85,...,0,0,0,0,0,0,0,0,0,1
2,0.057182,-0.004482,-0.040843,0.036994,1298900.0,3493600.0,571100.0,832500.0,62.13,0.60,...,0,0,0,0,0,0,1,0,0,0
3,-0.044857,0.062964,0.055392,-0.058078,1071300.0,880100.0,1080400.0,994800.0,6.68,0.36,...,0,0,0,0,1,0,0,0,0,0
4,0.022806,-0.021995,0.021842,-0.028000,6563300.0,6745100.0,9474700.0,10172500.0,10.28,1.35,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1909,0.011118,0.019845,-0.060006,-0.000558,1252700.0,1125100.0,1055100.0,1414500.0,21.36,2.75,...,0,0,0,0,0,0,0,0,0,0
1910,0.000624,0.003380,0.026196,0.020623,1086600.0,1062100.0,959900.0,700900.0,9.51,0.69,...,0,0,0,1,0,0,0,0,0,0
1911,-0.006360,0.040154,-0.038746,-0.026070,894900.0,935100.0,490500.0,490500.0,9.66,0.45,...,0,0,0,0,0,0,0,0,0,0
1912,-0.005385,0.010020,0.101545,-0.066394,4106200.0,3906300.0,12259700.0,12259700.0,5.44,0.36,...,0,0,0,0,1,0,0,0,0,0


In [4]:
# Separate the encoded frame into features (X) and target (y)

# Features: every column except the 'label' target
X = data_encoded.drop(columns='label')
# Target: the 'label' column
y = data_encoded["label"]
# NOTE(review): not used by the cells visible here; the order presumably maps
# label 0 -> "increase" and label 1 -> "decrease" — confirm before relying on it.
target_names = ["increase", "decrease"]

In [5]:
# Hold out part of the data for evaluation. No test_size is passed, so
# scikit-learn's default hold-out fraction applies; the fixed random_state
# makes the shuffle reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

In [6]:
# Build the scaler and fit it on the training split only, so that no
# information from the test split is used when learning the scaling.
scaler = StandardScaler()
scaler.fit(X_train)

# fit() returns the estimator itself, so X_scaler refers to the fitted scaler
X_scaler = scaler

In [7]:
# Apply the already-fitted scaler to both splits (training split first, then test)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [8]:
# Configure (but do not yet train) the gradient boosting classifier.
# random_state is fixed so repeated runs build the same ensemble.
GBT_model = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    random_state=42,
)


In [10]:
# Train the classifier on the scaled training features and their labels
GBT_model.fit(X=X_train_scaled, y=y_train)

In [11]:
# Predict a label for every row of the scaled, held-out test features
y_pred = GBT_model.predict(X=X_test_scaled)

In [13]:
# Score the predictions: fraction of test rows whose predicted label
# matches the true label, printed to two decimal places.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy_line = f"Accuracy: {accuracy:.2f}"
print(accuracy_line)

Accuracy: 0.48


In [15]:
# Print a heading, then the per-class precision / recall / f1 / support table
print("\nClassification Report:")
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)


Classification Report:
              precision    recall  f1-score   support

           0       0.45      0.40      0.42       226
           1       0.51      0.56      0.53       253

    accuracy                           0.48       479
   macro avg       0.48      0.48      0.48       479
weighted avg       0.48      0.48      0.48       479

