In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.preprocessing import StandardScaler

In [5]:
file = Path('../../data/cleandata/sentiment_cmc_track_merge.csv')
df = pd.read_csv(file)
df.head()

Unnamed: 0,rank_cnc,Name,Ticker,market_cap,price,volume24,pct1h,pct24h,pct7d,country,...,type,End,Start,Duration,pre_Duration,compound,negative,neutral,positive,no_of_posts
0,7,Binance Coin,BNB,3277372658,21.07,231846713,-0.46,-1.33,2.49,Japan,...,ICO,2017-07-21 07:00:00,2017-07-01 07:00:00,20 days 00:00:00.000000000,0 days 00:00:00.000000000,0.563773,0.09667,0.04141,0.86189,100
1,8,EOS,EOS,3214893394,3.42,1735988258,0.11,-1.49,-2.55,Cayman Islands,...,ICO,2018-06-12 00:00:00,2017-06-26 16:00:00,350 days 08:00:00.000000000,0 days 00:00:00.000000000,0.82847,0.0322,0.85855,0.10927,100
2,13,Cardano,ADA,1105888653,0.042654,53437224,-0.5,-1.72,-2.12,Switzerland,...,ICO,2017-01-01 00:00:00,2015-09-01 00:00:00,488 days 00:00:00.000000000,0 days 00:00:00.000000000,0.589474,0.098387,0.032946,0.868753,93
3,18,Tezos,XTZ,821746518,1.24,44390717,-0.25,10.13,2.47,United States of America,...,ICO,2017-07-14 12:00:00,2017-07-01 12:00:00,13 days 00:00:00.000000000,0 days 00:00:00.000000000,0.663939,0.021939,0.878857,0.099327,49
4,19,Cosmos,ATOM,760265147,3.99,147439544,-1.62,1.29,2.14,Canada,...,ICO,2017-04-06 13:00:00,2017-04-06 12:00:00,0 days 01:00:00.000000000,0 days 00:00:00.000000000,0.340496,0.060687,0.023224,0.916194,67


In [None]:
# Encoding the non-numerical columns
df = pd.get_dummies(df, columns=["col 1", "col 2", "col 3", "etc"])

In [None]:
# Save the Preprocessed File
encoded = Path('.csv')
df.to_csv(encoded, index=False)

In [None]:
# Create our features
X = df.drop(columns="y")

# Create our target
y = df["y"].values.reshape(-1, 1)

In [None]:
# Split the X and y into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [None]:
# Scale the Features Data
scaler = StandardScaler()

X_scaler = scaler.fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Balanced Random Forest Classifier
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

brf_model = brf.fit(X_train_scaled, y_train)

In [None]:
y_pred_brf = brf.predict(X_test_scaled)

In [None]:
# Calculated the balanced accuracy score
bac_brf = balanced_accuracy_score(y_test, y_pred_brf)

print(f"The Balanced Accuracy Score for the Balanced Random Forest Classifier is: {bac_brf}")

In [None]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred_brf)

In [None]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred_brf))

In [None]:
# List the features sorted in descending order by feature importance
importances_sorted = sorted(zip(brf.feature_importances_, X.columns), reverse=True)
importances_sorted