In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

file_path = 'Combined_News_DJIA.csv'
df = pd.read_csv(file_path)

# Combine all news headlines for each day into a single string
df['Combined_News'] = df.iloc[:, 2:27].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

def preprocess_text(text):
    text = text.lower()

    text = re.sub(r'[^a-z\s]', '', text)

    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join tokens back into a string
    return ' '.join(tokens)

# Apply preprocessing to the combined news headlines
df['Processed_News'] = df['Combined_News'].apply(preprocess_text)

# Features (TF-IDF Vectorization of the processed news) and Target (Label)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to 5000 features for optimization
X = tfidf_vectorizer.fit_transform(df['Processed_News']).toarray()
y = df['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use Logistic Regression for classification
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Output results
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(classification_rep)

# Test case: Predicting stock movement based on new input text
test_headlines = """
Apple announces new iPhone; Stock surges.
Tesla shares rise after strong earnings report.
Market reacts positively to government fiscal policy.
"""
# Preprocess the test input
test_input_processed = preprocess_text(test_headlines)
test_input_tfidf = tfidf_vectorizer.transform([test_input_processed])

# Predict using the model
test_prediction = model.predict(test_input_tfidf)

if test_prediction == 1:
    print("The stock is predicted to increase.")
else:
    print("The stock is predicted to decrease.")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


Accuracy: 0.5175879396984925
Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.33      0.37       171
           1       0.57      0.66      0.61       227

    accuracy                           0.52       398
   macro avg       0.49      0.49      0.49       398
weighted avg       0.50      0.52      0.51       398

The stock is predicted to decrease.


In [3]:
# prompt: For the above code use XG boost and try to increase the accuracy and write the complete code again

import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier  # Import XGBoost Classifier
from sklearn.metrics import accuracy_score, classification_report
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

file_path = 'Combined_News_DJIA.csv'
df = pd.read_csv(file_path)

# Combine all news headlines for each day into a single string
df['Combined_News'] = df.iloc[:, 2:27].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

def preprocess_text(text):
    text = text.lower()

    text = re.sub(r'[^a-z\s]', '', text)

    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join tokens back into a string
    return ' '.join(tokens)

# Apply preprocessing to the combined news headlines
df['Processed_News'] = df['Combined_News'].apply(preprocess_text)

# Features (TF-IDF Vectorization of the processed news) and Target (Label)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to 5000 features for optimization
X = tfidf_vectorizer.fit_transform(df['Processed_News']).toarray()
y = df['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use XGBoost Classifier for classification
model = XGBClassifier(random_state=42)  # Initialize XGBoost Classifier
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Output results
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(classification_rep)

# Test case: Predicting stock movement based on new input text
test_headlines = """
Apple announces new iPhone; Stock surges.
Tesla shares rise after strong earnings report.
Market reacts positively to government fiscal policy.
"""
# Preprocess the test input
test_input_processed = preprocess_text(test_headlines)
test_input_tfidf = tfidf_vectorizer.transform([test_input_processed])

# Predict using the model
test_prediction = model.predict(test_input_tfidf)

if test_prediction == 1:
    print("The stock is predicted to increase.")
else:
    print("The stock is predicted to decrease.")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Accuracy: 0.507537688442211
Classification Report:
              precision    recall  f1-score   support

           0       0.43      0.49      0.46       171
           1       0.57      0.52      0.55       227

    accuracy                           0.51       398
   macro avg       0.50      0.50      0.50       398
weighted avg       0.51      0.51      0.51       398

The stock is predicted to increase.


In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

file_path = 'Combined_News_DJIA.csv'
df = pd.read_csv(file_path)

# Combine all news headlines for each day into a single string
df['Combined_News'] = df.iloc[:, 2:27].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

def preprocess_text(text):

    text = text.lower()

    text = re.sub(r'[^a-z\s]', '', text)

    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join tokens back into a string
    return ' '.join(tokens)

# Apply preprocessing to the combined news headlines
df['Processed_News'] = df['Combined_News'].apply(preprocess_text)

# Features (TF-IDF Vectorization of the processed news) and Target (Label)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to 5000 features for optimization
X = tfidf_vectorizer.fit_transform(df['Processed_News']).toarray()
y = df['Label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_y_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_classification_rep = classification_report(y_test, rf_y_pred)

print("Random Forest Results:")
print(f'Accuracy: {rf_accuracy}')
print('Classification Report:')
print(rf_classification_rep)

# Gradient Boosting Classifier
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
gb_model.fit(X_train, y_train)
gb_y_pred = gb_model.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_y_pred)
gb_classification_rep = classification_report(y_test, gb_y_pred)

print("\nGradient Boosting Results:")
print(f'Accuracy: {gb_accuracy}')
print('Classification Report:')
print(gb_classification_rep)

# Test case: Predicting stock movement based on new input text (using Random Forest)
test_headlines = """
Apple announces new iPhone; Stock surges.
Tesla shares rise after strong earnings report.
Market reacts positively to government fiscal policy.
"""
# Preprocess the test input
test_input_processed = preprocess_text(test_headlines)
test_input_tfidf = tfidf_vectorizer.transform([test_input_processed])

# Predict using the Random Forest model
test_prediction = rf_model.predict(test_input_tfidf)

if test_prediction == 1:
    print("\nThe stock is predicted to increase (Random Forest).")
else:
    print("\nThe stock is predicted to decrease (Random Forest).")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Random Forest Results:
Accuracy: 0.49748743718592964
Classification Report:
              precision    recall  f1-score   support

           0       0.41      0.36      0.38       171
           1       0.56      0.60      0.58       227

    accuracy                           0.50       398
   macro avg       0.48      0.48      0.48       398
weighted avg       0.49      0.50      0.49       398


Gradient Boosting Results:
Accuracy: 0.507537688442211
Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.38      0.40       171
           1       0.56      0.60      0.58       227

    accuracy                           0.51       398
   macro avg       0.49      0.49      0.49       398
weighted avg       0.50      0.51      0.50       398


The stock is predicted to decrease (Random Forest).
