In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
import xgboost as xgb
from sklearn.decomposition import TruncatedSVD


In [2]:
news_df = pd.read_csv('../data/Combined_News_DJIA.csv',encoding='ISO-8859-1')
dow_df = pd.read_csv('../data/upload_DJIA_table.csv')

In [3]:
news_df['Date'] = pd.to_datetime(news_df['Date'])
dow_df['Date'] = pd.to_datetime(dow_df['Date'])

In [4]:
dow_df = dow_df.sort_values(by='Date')

dow_df['Percent_Change'] = dow_df['Close'].pct_change()
dow_df['Percent_Change'] = dow_df['Percent_Change'].fillna(0)
print(dow_df[['Date', 'Close', 'Percent_Change']].head())

           Date         Close  Percent_Change
1988 2008-08-08  11734.320312        0.000000
1987 2008-08-11  11782.349609        0.004093
1986 2008-08-12  11642.469727       -0.011872
1985 2008-08-13  11532.959961       -0.009406
1984 2008-08-14  11615.929688        0.007194


In [5]:
merged_df = pd.merge(news_df, dow_df[['Date', 'Percent_Change']], on='Date')

merged_df.head()

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,...,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25,Percent_Change
0,2008-08-08,0,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",...,b'Al-Qaeda Faces Islamist Backlash',"b'Condoleezza Rice: ""The US would not act to p...",b'This is a busy day: The European Union has ...,"b""Georgia will withdraw 1,000 soldiers from Ir...",b'Why the Pentagon Thinks Attacking Iran is a ...,b'Caucasus in crisis: Georgia invades South Os...,b'Indian shoe manufactory - And again in a se...,b'Visitors Suffering from Mental Illnesses Ban...,"b""No Help for Mexico's Kidnapping Surge""",0.0
1,2008-08-11,1,b'Why wont America and Nato help us? If they w...,b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli ...",b'Georgian army flees in disarray as Russians ...,"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zea...,b'Russia angered by Israeli military sale to G...,b'An American citizen living in S.Ossetia blam...,...,"b'""Do not believe TV, neither Russian nor Geor...",b'Riots are still going on in Montreal (Canada...,b'China to overtake US as largest manufacturer',b'War in South Ossetia [PICS]',b'Israeli Physicians Group Condemns State Tort...,b' Russia has just beaten the United States ov...,b'Perhaps *the* question about the Georgia - R...,b'Russia is so much better at war',"b""So this is what it's come to: trading sex fo...",0.004093
2,2008-08-12,0,b'Remember that adorable 9-year-old who sang a...,"b""Russia 'ends Georgia operation'""","b'""If we had no sexual harassment we would hav...","b""Al-Qa'eda is losing support in Iraq because ...",b'Ceasefire in Georgia: Putin Outmaneuvers the...,b'Why Microsoft and Intel tried to kill the XO...,b'Stratfor: The Russo-Georgian War and the Bal...,"b""I'm Trying to Get a Sense of This Whole Geor...",...,b'Why Russias response to Georgia was right',"b'Gorbachev accuses U.S. of making a ""serious ...","b'Russia, Georgia, and NATO: Cold War Two'",b'Remember that adorable 62-year-old who led y...,b'War in Georgia: The Israeli connection',b'All signs point to the US encouraging Georgi...,b'Christopher King argues that the US and NATO...,b'America: The New Mexico?',"b""BBC NEWS | Asia-Pacific | Extinction 'by man...",-0.011872
3,2008-08-13,0,b' U.S. refuses Israel weapons to attack Iran:...,"b""When the president ordered to attack Tskhinv...",b' Israel clears troops who killed Reuters cam...,b'Britain\'s policy of being tough on drugs is...,b'Body of 14 year old found in trunk; Latest (...,b'China has moved 10 *million* quake survivors...,"b""Bush announces Operation Get All Up In Russi...",b'Russian forces sink Georgian ships ',...,b'US humanitarian missions soon in Georgia - i...,"b""Georgia's DDOS came from US sources""","b'Russian convoy heads into Georgia, violating...",b'Israeli defence minister: US against strike ...,b'Gorbachev: We Had No Choice',b'Witness: Russian forces head towards Tbilisi...,b' Quarter of Russians blame U.S. for conflict...,b'Georgian president says US military will ta...,b'2006: Nobel laureate Aleksander Solzhenitsyn...,-0.009406
4,2008-08-14,1,b'All the experts admit that we should legalis...,b'War in South Osetia - 89 pictures made by a ...,b'Swedish wrestler Ara Abrahamian throws away ...,b'Russia exaggerated the death toll in South O...,b'Missile That Killed 9 Inside Pakistan May Ha...,"b""Rushdie Condemns Random House's Refusal to P...",b'Poland and US agree to missle defense deal. ...,"b'Will the Russians conquer Tblisi? Bet on it,...",...,"b""Georgia confict could set back Russia's US r...",b'War in the Caucasus is as much the product o...,"b'""Non-media"" photos of South Ossetia/Georgia ...",b'Georgian TV reporter shot by Russian sniper ...,b'Saudi Arabia: Mother moves to block child ma...,b'Taliban wages war on humanitarian aid workers',"b'Russia: World ""can forget about"" Georgia\'s...",b'Darfur rebels accuse Sudan of mounting major...,b'Philippines : Peace Advocate say Muslims nee...,0.007194


In [6]:
def clean_headline(text):
    if isinstance(text, str):
        text = text.replace('b"', '').replace("b'", '').replace('"', '').replace("'", '')
        return text
    return str(text)

# Apply cleaning
headlines = merged_df.iloc[:, 2:27].apply(
    lambda row: ' '.join(clean_headline(x) for x in row.values), axis=1)
merged_df['Combined_News'] = headlines

In [7]:
merged_df = merged_df[['Date', 'Label', 'Combined_News', 'Percent_Change']]
merged_df.dropna(inplace=True)

In [8]:
merged_df.head()

Unnamed: 0,Date,Label,Combined_News,Percent_Change
0,2008-08-08,0,Georgia downs two Russian warplanes as countri...,0.0
1,2008-08-11,1,Why wont America and Nato help us? If they won...,0.004093
2,2008-08-12,0,Remember that adorable 9-year-old who sang at ...,-0.011872
3,2008-08-13,0,U.S. refuses Israel weapons to attack Iran: r...,-0.009406
4,2008-08-14,1,All the experts admit that we should legalise ...,0.007194


In [9]:
# train = merged_df[(merged_df['Date'] >= '2008-08-08') & (merged_df['Date'] <= '2014-12-31')]
# test = merged_df[(merged_df['Date'] >= '2015-01-02') & (merged_df['Date'] <= '2016-07-01')]

In [10]:
analyzer = SentimentIntensityAnalyzer()
merged_df['Sentiment'] = merged_df['Combined_News'].apply(
    lambda x: analyzer.polarity_scores(x)['compound'])


In [11]:
# 3. Time-Based Split (No Future Leakage)
train_size = int(0.8 * len(merged_df))
train = merged_df.iloc[:train_size]
test = merged_df.iloc[train_size:]


In [12]:
# 4. Feature Engineering Pipeline
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=10000,
    ngram_range=(1, 3),
    min_df=3,
    max_df=0.9
)

In [13]:
# Dimensionality reduction
svd = TruncatedSVD(n_components=100)

In [14]:
# 5. Model Training with Cross-Validation
models = {
    "Logistic Regression": make_pipeline(
        tfidf,
        svd,
        StandardScaler(with_mean=False),
        LogisticRegression(max_iter=1000, class_weight='balanced')
    ),
    "Random Forest": make_pipeline(
        tfidf,
        RandomForestClassifier(n_estimators=200, class_weight='balanced')
    ),
    "XGBoost": make_pipeline(
        tfidf,
        xgb.XGBClassifier(
            n_estimators=200,
            scale_pos_weight=len(train[train['Label']==0])/len(train[train['Label']==1]),
            use_label_encoder=False,
            eval_metric='logloss'
        )
    )
}

In [15]:
# Time-series cross-validation
tscv = TimeSeriesSplit(n_splits=5)
results = {}

for name, model in models.items():
    print(f"\n=== Training {name} ===")
    cv_scores = []
    for train_idx, val_idx in tscv.split(train):
        X_train, X_val = train.iloc[train_idx]['Combined_News'], train.iloc[val_idx]['Combined_News']
        y_train, y_val = train.iloc[train_idx]['Label'], train.iloc[val_idx]['Label']
        
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        acc = accuracy_score(y_val, preds)
        cv_scores.append(acc)
        print(f"Fold accuracy: {acc:.3f}")
    
    avg_acc = np.mean(cv_scores)
    results[name] = avg_acc
    print(f"{name} Average CV Accuracy: {avg_acc:.3f}")


=== Training Logistic Regression ===
Fold accuracy: 0.536
Fold accuracy: 0.521
Fold accuracy: 0.562
Fold accuracy: 0.453
Fold accuracy: 0.528
Logistic Regression Average CV Accuracy: 0.520

=== Training Random Forest ===
Fold accuracy: 0.502
Fold accuracy: 0.506
Fold accuracy: 0.483
Fold accuracy: 0.513
Fold accuracy: 0.532
Random Forest Average CV Accuracy: 0.507

=== Training XGBoost ===


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold accuracy: 0.521


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold accuracy: 0.517


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold accuracy: 0.491


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold accuracy: 0.525


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold accuracy: 0.543
XGBoost Average CV Accuracy: 0.519


In [16]:
# 6. Final Evaluation on Test Set
print("\n=== Final Test Evaluation ===")
best_model_name = max(results, key=results.get)
best_model = models[best_model_name]
best_model.fit(train['Combined_News'], train['Label'])
test_preds = best_model.predict(test['Combined_News'])

print(f"\nBest Model: {best_model_name}")
print(f"Test Accuracy: {accuracy_score(test['Label'], test_preds):.3f}")
print(classification_report(test['Label'], test_preds))


=== Final Test Evaluation ===

Best Model: Logistic Regression
Test Accuracy: 0.503
              precision    recall  f1-score   support

           0       0.49      0.35      0.41       196
           1       0.51      0.65      0.57       202

    accuracy                           0.50       398
   macro avg       0.50      0.50      0.49       398
weighted avg       0.50      0.50      0.49       398



In [17]:
# 7. Feature Importance Analysis (if using tree-based model)
if hasattr(best_model.steps[-1][1], 'feature_importances_'):
    feature_names = tfidf.get_feature_names_out()
    importances = best_model.steps[-1][1].feature_importances_
    top_features = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False).head(20)
    
    print("\nTop 20 Important Features:")
    print(top_features)

In [18]:
# TF-IDF
tfidf = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 2))
X_train_tfidf = tfidf.fit_transform(train['Combined_News'])
X_test_tfidf = tfidf.transform(test['Combined_News'])

# Combine with sentiment
X_train = hstack([X_train_tfidf, np.array(train['Sentiment']).reshape(-1, 1)])
X_test = hstack([X_test_tfidf, np.array(test['Sentiment']).reshape(-1, 1)])
y_train = train['Label']
y_test = test['Label']

NameError: name 'hstack' is not defined

In [None]:
# ==================== MODELS ====================

# 1. Logistic Regression
print("\nðŸ”¹ Logistic Regression")
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, lr_preds):.2f}")
print(classification_report(y_test, lr_preds))

# 2. Random Forest
print("\nðŸ”¹ Random Forest Classifier")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_preds = rf_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, rf_preds):.2f}")
print(classification_report(y_test, rf_preds))

# 3. XGBoost
print("\nðŸ”¹ XGBoost Classifier")
xgb_model = xgb.XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)
xgb_preds = xgb_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, xgb_preds):.2f}")
print(classification_report(y_test, xgb_preds))

# Optional: See performance summary
summary = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest', 'XGBoost'],
    'Accuracy': [
        accuracy_score(y_test, lr_preds),
        accuracy_score(y_test, rf_preds),
        accuracy_score(y_test, xgb_preds)
    ]
})
print("\nðŸ”Ž Model Performance Summary:")
print(summary)



ðŸ”¹ Logistic Regression
Accuracy: 0.47
              precision    recall  f1-score   support

           0       0.41      0.17      0.24       196
           1       0.49      0.77      0.60       202

    accuracy                           0.47       398
   macro avg       0.45      0.47      0.42       398
weighted avg       0.45      0.47      0.42       398


ðŸ”¹ Random Forest Classifier
Accuracy: 0.47
              precision    recall  f1-score   support

           0       0.44      0.23      0.31       196
           1       0.49      0.71      0.58       202

    accuracy                           0.47       398
   macro avg       0.46      0.47      0.44       398
weighted avg       0.46      0.47      0.44       398


ðŸ”¹ XGBoost Classifier


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.46
              precision    recall  f1-score   support

           0       0.43      0.31      0.36       196
           1       0.47      0.61      0.53       202

    accuracy                           0.46       398
   macro avg       0.45      0.46      0.45       398
weighted avg       0.45      0.46      0.45       398


ðŸ”Ž Model Performance Summary:
                 Model  Accuracy
0  Logistic Regression  0.472362
1        Random Forest  0.474874
2              XGBoost  0.459799


In [None]:
# === Regression ===
print("\n=== Percentage Change Prediction (Regression) ===")
reg = RandomForestRegressor(n_estimators=100, random_state=42)
reg.fit(X_train, y_reg_train)
reg_preds = reg.predict(X_test)


=== Percentage Change Prediction (Regression) ===


ValueError: Found input variables with inconsistent numbers of samples: [1591, 1611]

In [None]:
print(f"MSE: {mean_squared_error(y_reg_test, reg_preds):.4f}")
print(f"RÂ² Score: {r2_score(y_reg_test, reg_preds):.2f}")

MSE: 0.0001
RÂ² Score: -0.08


In [None]:
# Show sample predictions
sample_results = pd.DataFrame({
    'Date': test['Date'].values,
    'Combined News': test['Combined_News'].values,
    'Actual Change': y_reg_test.values,
    'Predicted Change': reg_preds,
    'Actual Label': y_class_test.values,
    'Predicted Label': class_preds
}).sample(5, random_state=42)

print("\nSample Predictions:")
print(sample_results)