In [13]:
import pandas as pd
import numpy as np 
from datetime import datetime
import yfinance as yf
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [14]:
#this project invloves data clean and analysis of the stock markets using python.
# It includes stock prices for Apple, Microsoft,Netflix and Google stock prices
# Identifying trends and patterns in stock price movements, calculating moving averages and volatility for each company, and conduct correlation analysis to examine the relationships between different stock prices.

In [15]:
                       #reading the csv file
stock_data = pd.read_csv("stocks.csv")


In [16]:
                        #data cleaning
describe = stock_data.describe()

#checking for null data for the data set
null_data = stock_data.isnull()
has_null_data = null_data.values.any()

#checking for duplicates in dates for respective companies 
tickers = ["AAPL", "MSFT","GOOG","NFLX"]
data_filtered = stock_data[stock_data["Ticker"].isin(tickers)][["Ticker","Date"]]
duplicates = data_filtered.duplicated(subset=['Ticker', 'Date'])
duplicate_rows = data_filtered[duplicates]


In [28]:
                             #drawing graphs to understand the stock proce visiually 
import plotly.express as ply
line_graph_volume = ply.line(stock_data, x='Date', y='Volume', color='Ticker',
              labels={'Date':'Date', 'Close':'Closing Price', 'Ticker':'Company'},
              title='Stock Prices volume over time')
line_graph_close = ply.line(stock_data, x = "Date", y = "Close", color = "Ticker",
                           labels = {"Date" : "Date", "Close": "Closing Price", "Ticker" : "Company"},
                           title = "Stock price closing price over time")
line_graph_open = ply.line(stock_data, x = "Date", y = "Open", color = "Ticker",
                           labels = {"Date" : "Date", "Open": "Opening Price", "Ticker" : "Company"},
                           title = "Stock price opening price over time")
line_graph_high = ply.line(stock_data, x = "Date", y = "High", color = "Ticker",
                           labels = {"Date" : "Date", "High": "Highest Price", "Ticker" : "Company"},
                           title = "Stock price peak for each day")
line_graph_low = ply.line(stock_data, x = "Date", y = "Low", color = "Ticker",
                           labels = {"Date" : "Date", "Low": "Lowest Price", "Ticker" : "Company"},
                           title = "Lowest stock price for each day")

line_graph_volume.show()
line_graph_open.show()
line_graph_close.show()
line_graph_high.show()
line_graph_low.show()

None


None


None


None


None


In [27]:
#calculating moving averages for each company 
stock_data["MA10"] = stock_data.groupby("Ticker")["Close"].rolling(window = 10).mean().reset_index(0,drop = True)
stock_data["MA20"] = stock_data.groupby("Ticker")["Close"].rolling(window = 20).mean().reset_index(0,drop = True)

for ticker,group in stock_data.groupby("Ticker"):
    line_MA = ply.line(group, x = "Date", y = ["Open", "MA10", "MA20"], title = f"{ticker}'s Moving Averages")
    line_MA.show()



None


None


None


None


In [25]:
# calculating volatility for the 4 companies respectively
stock_data["Volatility"] = stock_data.groupby("Ticker")["Close"].pct_change().rolling(window = 10).std().reset_index(0, drop = True)
line_volatility = ply.line(stock_data, x = "Date", y = "Volatility", color = "Ticker", title = "Volatility")
line_volatility.show()

None


In [26]:
# Correlation between apple and all other stocks and we want to invest in apple
apple = stock_data.loc[stock_data["Ticker"] == "AAPL", ["Date", "Close"]].rename(columns = {"Close": "AAPL"})
microsoft = stock_data.loc[stock_data["Ticker"] == "MSFT", ["Date", "Close"]].rename(columns = {"Close": "MSFT"})
netflix = stock_data.loc[stock_data["Ticker"] == "NFLX", ["Date", "Close"]].rename(columns = {"Close": "NFLX"})
google = stock_data.loc[stock_data["Ticker"] == "GOOG", ["Date", "Close"]].rename(columns = {"Close": "GOOG"})

#merging companies' data with apple's data to make graphs to better understand the correlation between them
aapl_msft= pd.merge(apple, microsoft, on = "Date")
aapl_nflx= pd.merge(apple, netflix, on = "Date")
aapl_goog= pd.merge(apple, google, on = "Date")

#scatter plots
a_m_correlation = ply.scatter(aapl_msft, x='AAPL', y='MSFT', 
                 trendline='ols', 
                 title='Correlation between Apple and Microsoft')

a_n_correlation = ply.scatter(aapl_nflx, x='AAPL', y='NFLX', 
                 trendline='ols', 
                 title='Correlation between Apple and Netflix')

a_g_correlation = ply.scatter(aapl_goog, x='AAPL', y='GOOG', 
                 trendline='ols', 
                 title='Correlation between Apple and Google')
a_m_correlation.show()
a_n_correlation.show()
a_g_correlation.show()

None


None


None


In [24]:
                                                            # RSI index for each company 
# Calculate price differences
stock_data["PriceDiff"] = stock_data["Close"].diff()

# Separate gains and losses
stock_data["Gain"] = stock_data["PriceDiff"].where(stock_data["PriceDiff"] > 0, 0)
stock_data["Loss"] = -stock_data["PriceDiff"].where(stock_data["PriceDiff"] < 0, 0)

# Calculate average gains and losses over a 20 day period
period = 20
stock_data["AvgGain"] = stock_data["Gain"].rolling(window=period).mean()
stock_data["AvgLoss"] = stock_data["Loss"].rolling(window=period).mean()

# Calculate relative strength (RS)
stock_data["RS"] = stock_data["AvgGain"] / stock_data["AvgLoss"]

# Calculate the RSI
stock_data["RSI"] = 100 - (100 / (1 + stock_data["RS"]))

tickers = ["AAPL", "MSFT", "GOOG", "NFLX"]
filtered_data = stock_data[stock_data["Ticker"].isin(tickers)]

# Plot the RSI for each company
fig = ply.line(filtered_data, x="Date", y="RSI", 
              color="Ticker",
              title="RSI for Different Companies")
fig.show()


None


In [22]:
# Machine Learning Model

# List of tickers (companies) in the CSV file
tickers = ["AAPL", "MSFT", "GOOG", "NFLX"]

# Create an empty dictionary to store the predictions
predictions = {}

# Iterate over each ticker and make predictions
for ticker in tickers:
    # Extract the relevant features and target variable for the current ticker
    features = stock_data.loc[stock_data['Ticker'] == ticker, ['Open', 'High', 'Low', 'Volume']].tail(1)
    target = stock_data.loc[stock_data['Ticker'] == ticker, 'Close'].tail(1)
    
    model = LinearRegression()
    
    # Train the model on the data
    model.fit(features, target)
    
    # Make a prediction using the same data
    prediction = model.predict(features)
    
    # Get the latest date for the prediction
    latest_date = stock_data.loc[stock_data['Ticker'] == ticker, 'Date'].tail(1).values[0]
    
    # Store the prediction in the dictionary
    predictions[ticker] = {"Date": latest_date, "Prediction": prediction[0]}

# Print the predicted stock prices with timeline for each company
for ticker, data in predictions.items():
    print(f"Predicted Stock Price for {ticker} on {data['Date']}: {data['Prediction']}")

Predicted Stock Price for AAPL on 2023-05-05: 173.57000732421875
Predicted Stock Price for MSFT on 2023-05-05: 310.6499938964844
Predicted Stock Price for GOOG on 2023-05-05: 106.21499633789062
Predicted Stock Price for NFLX on 2023-05-05: 322.760009765625
