In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
#darren's data
amd_hist = pd.read_csv('data/AMD_data_retrieval.csv')
nvda_hist = pd.read_csv('data/NVDA_data_retrieval.csv')
intc_hist = pd.read_csv('data/INTC_data_retrieval.csv')

#tech indicators
amd_tech = pd.read_csv('data/AMD_technical_indicators.csv')
nvda_tech = pd.read_csv('data/NVDA_technical_indicators.csv')
intc_tech = pd.read_csv('data/INTC_technical_indicators.csv')

#sentiments
amd_sent = pd.read_csv('data/AMD_sentiments.csv')
nvda_sent = pd.read_csv('data/NVDA_sentiments.csv')
intc_sent = pd.read_csv('data/INTC_sentiments.csv')

In [3]:
amd_hist.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock Splits', 'Year', 'P/E Ratio', 'Dividend Yield', 'Market Cap',
       'Ticker'],
      dtype='object')

In [5]:
amd_sent.columns

Index(['Unnamed: 0', 'title', 'description', 'source', 'url',
       'sentiment_score'],
      dtype='object')

In [15]:
amd_tech.columns

Index(['Date', 'RSI', 'ATR', 'ADX', 'MACD'], dtype='object')

In [3]:
amd_sent['Ticker'] = 'AMD' #allows us to merge later
nvda_sent['Ticker'] = 'NVDA'
intc_sent['Ticker'] = 'INTC'

In [4]:
amd_merged = amd_hist.merge(amd_tech, on='Date', how='left')
nvda_merged = nvda_hist.merge(nvda_tech, on='Date', how='left')
intc_merged = intc_hist.merge(intc_tech, on='Date', how='left')



In [10]:
amd_merged.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock Splits', 'Year', 'P/E Ratio', 'Dividend Yield', 'Market Cap',
       'Ticker', 'RSI', 'ATR', 'ADX', 'MACD'],
      dtype='object')

In [5]:
amd_sent.columns

Index(['Unnamed: 0', 'title', 'description', 'source', 'url',
       'sentiment_score', 'Ticker'],
      dtype='object')

In [6]:
#merging historical data and technical indicators with sentiments
amd_merged = amd_merged.merge(amd_sent, on='Ticker', how='left')
nvda_merged = nvda_merged.merge(nvda_sent, on='Ticker', how='left')
intc_merged = intc_merged.merge(intc_sent, on='Ticker', how='left')

In [8]:
combined_data = pd.concat([amd_merged, nvda_merged, intc_merged])


In [9]:
combined_data.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock Splits', 'Year', 'P/E Ratio', 'Dividend Yield', 'Market Cap',
       'Ticker', 'RSI', 'ATR', 'ADX', 'MACD', 'Unnamed: 0', 'title',
       'description', 'source', 'url', 'sentiment_score'],
      dtype='object')

In [10]:
combined_data.shape

(2804065, 23)

In [19]:
combined_data.head()

Unnamed: 0.1,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Year,P/E Ratio,...,RSI,ATR,ADX,MACD,Unnamed: 0,title,description,source,url,sentiment_score
0,1980-03-17 00:00:00-05:00,0.0,3.302083,3.125,3.145833,219600,0.0,0.0,1980,2.783923,...,,,,,0,Advanced Micro Devices (AMD) Accelerates AI De...,We recently published a list of 15 AI News Upd...,Yahoo Entertainment,https://finance.yahoo.com/news/advanced-micro-...,0.2
1,1980-03-17 00:00:00-05:00,0.0,3.302083,3.125,3.145833,219600,0.0,0.0,1980,2.783923,...,,,,,1,Is AMD Stock Finally Better Than NVDA Stock Af...,Advanced Micro Devices (NASDAQ:AMD) stock has ...,Yahoo Entertainment,https://finance.yahoo.com/news/amd-stock-final...,0.28
2,1980-03-17 00:00:00-05:00,0.0,3.302083,3.125,3.145833,219600,0.0,0.0,1980,2.783923,...,,,,,2,Bezos Backs AI Chipmaker Vying With Nvidia at ...,(Bloomberg) -- Amazon.com Inc. founder Jeff Be...,Yahoo Entertainment,https://finance.yahoo.com/news/bezos-backs-ai-...,0.0
3,1980-03-17 00:00:00-05:00,0.0,3.302083,3.125,3.145833,219600,0.0,0.0,1980,2.783923,...,,,,,3,Advanced Micro Devices (AMD) Shifts to Unified...,We recently published a list of 15 AI News You...,Yahoo Entertainment,https://finance.yahoo.com/news/advanced-micro-...,0.2
4,1980-03-17 00:00:00-05:00,0.0,3.302083,3.125,3.145833,219600,0.0,0.0,1980,2.783923,...,,,,,4,"Advanced Micro Devices, Inc. (AMD): Eyeing $50...",We recently compiled a list of the 15 AI Stock...,Yahoo Entertainment,https://finance.yahoo.com/news/advanced-micro-...,0.2


In [26]:
combined_data.shape

(2804065, 23)

In [None]:
print(combined_data['RSI'].sum(skipna=True))
print(combined_data['ATR'].sum(skipna=True))
print(combined_data['ADX'].sum(skipna=True))
print(combined_data['MACD'].sum(skipna=True)) #bad cols


0.0
0.0
0.0
0.0


In [12]:
bad=['ATR','ADX','MACD','RSI']
#drop bad columns
combined_data = combined_data.drop(columns=bad)

KeyError: "['ATR', 'ADX', 'MACD', 'RSI'] not found in axis"

In [13]:
combined_data.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock Splits', 'Year', 'P/E Ratio', 'Dividend Yield', 'Market Cap',
       'Ticker', 'title', 'description', 'source', 'url', 'sentiment_score'],
      dtype='object')

In [14]:
combined_data['Unnamed: 0'].value_counts() #ngl i got no idea what this column is

KeyError: 'Unnamed: 0'

In [15]:
combined_data = combined_data.drop(columns='Unnamed: 0') #drop it

KeyError: "['Unnamed: 0'] not found in axis"

In [16]:
combined_data.columns #everything here looks good and can be used.

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock Splits', 'Year', 'P/E Ratio', 'Dividend Yield', 'Market Cap',
       'Ticker', 'title', 'description', 'source', 'url', 'sentiment_score'],
      dtype='object')

In [17]:
combined_data.dropna(inplace=True)  #drop rows with missing values

In [45]:
combined_data.shape

(2804065, 18)

In [46]:
combined_data

Unnamed: 0,Date,Open,High,Low,Close,Volume,Dividends,Stock Splits,Year,P/E Ratio,Dividend Yield,Market Cap,Ticker,title,description,source,url,sentiment_score
0,1980-03-17 00:00:00-05:00,0.000000,3.302083,3.125,3.145833,219600,0.0,0.0,1980,2.783923,0.0,5.105089e+09,AMD,Advanced Micro Devices (AMD) Accelerates AI De...,We recently published a list of 15 AI News Upd...,Yahoo Entertainment,https://finance.yahoo.com/news/advanced-micro-...,0.200000
1,1980-03-17 00:00:00-05:00,0.000000,3.302083,3.125,3.145833,219600,0.0,0.0,1980,2.783923,0.0,5.105089e+09,AMD,Is AMD Stock Finally Better Than NVDA Stock Af...,Advanced Micro Devices (NASDAQ:AMD) stock has ...,Yahoo Entertainment,https://finance.yahoo.com/news/amd-stock-final...,0.280000
2,1980-03-17 00:00:00-05:00,0.000000,3.302083,3.125,3.145833,219600,0.0,0.0,1980,2.783923,0.0,5.105089e+09,AMD,Bezos Backs AI Chipmaker Vying With Nvidia at ...,(Bloomberg) -- Amazon.com Inc. founder Jeff Be...,Yahoo Entertainment,https://finance.yahoo.com/news/bezos-backs-ai-...,0.000000
3,1980-03-17 00:00:00-05:00,0.000000,3.302083,3.125,3.145833,219600,0.0,0.0,1980,2.783923,0.0,5.105089e+09,AMD,Advanced Micro Devices (AMD) Shifts to Unified...,We recently published a list of 15 AI News You...,Yahoo Entertainment,https://finance.yahoo.com/news/advanced-micro-...,0.200000
4,1980-03-17 00:00:00-05:00,0.000000,3.302083,3.125,3.145833,219600,0.0,0.0,1980,2.783923,0.0,5.105089e+09,AMD,"Advanced Micro Devices, Inc. (AMD): Eyeing $50...",We recently compiled a list of the 15 AI Stock...,Yahoo Entertainment,https://finance.yahoo.com/news/advanced-micro-...,0.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115032,2024-11-15 00:00:00-05:00,24.809999,24.820000,24.230,24.350000,54931400,0.0,0.0,2024,-6.510695,0.0,1.050216e+11,INTC,Lenovo Launches ThinkShield Firmware Assurance...,Lenovo Launches ThinkShield Firmware Assurance...,Investing.com,https://www.investing.com/news/press-releases/...,0.000000
1115033,2024-11-15 00:00:00-05:00,24.809999,24.820000,24.230,24.350000,54931400,0.0,0.0,2024,-6.510695,0.0,1.050216e+11,INTC,ath9k (wi-fi) driver doesn't work properly wit...,i have an issue with my WIFI\nit works fine af...,Askubuntu.com,https://askubuntu.com/questions/1531967/ath9k-...,0.054167
1115034,2024-11-15 00:00:00-05:00,24.809999,24.820000,24.230,24.350000,54931400,0.0,0.0,2024,-6.510695,0.0,1.050216e+11,INTC,Wi-Fi Chipsets Global Strategic Business Repor...,"Dublin, Nov. 19, 2024 (GLOBE NEWSWIRE) -- The ...",GlobeNewswire,https://www.globenewswire.com/news-release/202...,0.000000
1115035,2024-11-15 00:00:00-05:00,24.809999,24.820000,24.230,24.350000,54931400,0.0,0.0,2024,-6.510695,0.0,1.050216e+11,INTC,"Vietnam Data Center Networking Market Trends, ...","Dublin, Nov. 25, 2024 (GLOBE NEWSWIRE) -- The ...",GlobeNewswire,https://www.globenewswire.com/news-release/202...,-0.100000


In [14]:
combined_data.to_csv('data/combined_data.csv')

## YOU DONT NEED TO RUN PAST THIS STEP FOR VISUALIZATION PEOPLE

In [19]:
combined_data = combined_data.iloc[:-1]


In [21]:
#combined_data['Price_Change'] = combined_data['Close'].pct_change()
combined_data['Price_Increase'] = (combined_data['Close'].shift(-1) > combined_data['Close']).astype(int)


In [24]:
combined_data['Price_Increase'].value_counts()

Price_Increase
0    2789841
1      14222
Name: count, dtype: int64

In [28]:
combined_data.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Dividends',
       'Stock Splits', 'Year', 'P/E Ratio', 'Dividend Yield', 'Market Cap',
       'Ticker', 'title', 'description', 'source', 'url', 'sentiment_score',
       'Price_Increase'],
      dtype='object')

In [43]:
combined_data['Price_Change'] = combined_data['Close'].pct_change()
combined_data['SMA_5'] = combined_data['Close'].rolling(window=5).mean()
combined_data['SMA_10'] = combined_data['Close'].rolling(window=10).mean()
combined_data['SMA_20'] = combined_data['Close'].rolling(window=20).mean()

combined_data = combined_data.dropna()

#select key columns needed for the model
key_columns = ['Open', 'High', 'Low', 'Close', 'Volume', 'Price_Change', 'SMA_5', 'SMA_10', 'SMA_20', 'Price_Increase']
filtered_data = combined_data[key_columns]


In [44]:
from sklearn.utils import resample #to balance the data
df_majority = filtered_data[filtered_data.Price_Increase == 0]
df_minority = filtered_data[filtered_data.Price_Increase == 1]

df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=len(df_majority),    # to match majority class
                                 random_state=42) # reproducible results

#combine majority class with upsampled minority class
filtered_data_upsampled = pd.concat([df_majority, df_minority_upsampled])

In [49]:
#x and y for train test split
X = filtered_data.drop(columns=['Price_Increase'])
y = filtered_data['Price_Increase']

In [50]:
from sklearn.model_selection import train_test_split

#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression

#initialize the logistic regression model
model = LogisticRegression(max_iter=1000) #setting an upper bound so i dont run the model for like 5 hours

#train the model on the training data
model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report
#predict the labels of the test set
y_pred = model.predict(X_test) #pretty accurate!
print(max(y_pred)) #0 therefore the model is predicting the price will not increase
# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

#print the classification report
print(classification_report(y_test, y_pred, zero_division=0))


0
Accuracy: 0.994928754709714
              precision    recall  f1-score   support

           0       0.99      1.00      1.00    557965
           1       0.00      0.00      0.00      2844

    accuracy                           0.99    560809
   macro avg       0.50      0.50      0.50    560809
weighted avg       0.99      0.99      0.99    560809

