Cell 1: Install dependencies (CatBoost, pandas, scikit-learn, joblib)

In [None]:
!pip install catboost pandas scikit-learn joblib

Cell 2: Load the dataset

In [None]:
import pandas as pd

# Load the merged sentiment/price table and order it by date so that the
# shift(), pct_change() and rolling() calls below operate on consecutive rows.
df = pd.read_csv('/content/merged_sentiment_price.csv')
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values('date')

# Create label: 1 if the next row's close is strictly higher than this row's
# close, else 0. The last row has no successor (next_close is NaN, so its
# label is a meaningless 0); it is removed by the dropna() further down.
df['next_close'] = df['close'].shift(-1)
df['target'] = (df['next_close'] > df['close']).astype(int)

# Add a placeholder categorical feature only if the data has none, so that a
# real 'sector' column coming from the CSV is never overwritten.
if 'sector' not in df.columns:
    df['sector'] = 'Tech'  # Replace with real sector if available
df['day_of_week'] = df['date'].dt.dayofweek  # 0 = Monday ... 6 = Sunday

# Engineered numeric features: row-over-row relative change of the close and
# the mean sentiment over a 3-row window.
df['price_change'] = df['close'].pct_change()
df['sentiment_rolling_mean'] = df['sentiment'].rolling(3).mean()

# Drop NaNs: removes the warm-up rows of pct_change()/rolling(), the final
# unlabeled row, and any row with a missing value in any other column.
df = df.dropna().reset_index(drop=True)

df.head()


Cell 3: Prepare features and labels

In [None]:
# Columns fed to the model, in the order they appear in the feature matrix.
feature_cols = [
    'sentiment',
    'sentiment_rolling_mean',
    'price_change',
    'day_of_week',
    'sector',
]
# Subset of feature_cols that should be treated as categorical, not numeric.
categorical_features = ['day_of_week', 'sector']

# Feature matrix and label vector, both row-aligned with df.
X = df.loc[:, feature_cols]
y = df.loc[:, 'target']


Cell 4: Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Ordered hold-out: with shuffling disabled the rows keep their existing
# order and the final 20% of them become the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=False,
)

print("Train size:", len(X_train), "Test size:", len(X_test))


Cell 5: Train CatBoost Model

In [None]:
from catboost import CatBoostClassifier

# Gradient-boosted classifier for the binary up/down target. The columns named
# in `categorical_features` are passed as cat_features so CatBoost encodes
# them itself instead of treating them as numbers.
model = CatBoostClassifier(
    iterations=200,
    depth=6,
    learning_rate=0.05,
    loss_function='Logloss',
    cat_features=categorical_features,
    verbose=50  # integer value = print metrics every 50 iterations
)

# The test split is passed as eval_set only to monitor the loss while
# training. When an eval_set is supplied CatBoost defaults to
# use_best_model=True, which would keep only the trees up to the iteration
# that scores best on the test data; that selection leaks test information
# into the model and makes the later evaluation optimistic, so it is
# switched off explicitly.
model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=False)


Cell 6: Evaluate

In [None]:
from sklearn.metrics import classification_report

# Predict labels for the held-out rows, then print the per-class
# precision / recall / F1 summary.
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)


Cell 7: Save the model

In [None]:
import joblib

# Persist the fitted model to the working directory with joblib.
model_path = 'catboost_model.pkl'
joblib.dump(model, model_path)
print("✅ Model saved as catboost_model.pkl")


Cell 8: Download for Local Use

In [None]:
from google.colab import files
# Hand the saved model file to Colab's download helper.
download_name = "catboost_model.pkl"
files.download(download_name)
