In [2]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Step 1: Download MSFT stock data from Yahoo Finance
ticker = "MSFT"
data = yf.download(ticker, start="2010-01-01", end="2025-01-01")

# Step 2: Inspect the dataset
print(data.head())
print(data.info())

# Step 3: Handle missing data (if any)
# Check for missing values
print("Missing values per column:")
print(data.isna().sum())

# Fill missing values using forward fill method (you can also use other techniques)
data.fillna(method='ffill', inplace=True)

# Step 4: Feature Engineering - Add additional features
# Calculate daily percentage change (percentage change in closing price)
data['Daily Change'] = data['Adj Close'].pct_change()

# Calculate moving averages (e.g., 50-day and 200-day moving averages)
data['50-Day MA'] = data['Adj Close'].rolling(window=50).mean()
data['200-Day MA'] = data['Adj Close'].rolling(window=200).mean()

# Step 5: Drop rows with NaN values (because of moving averages)
data.dropna(inplace=True)

# Step 6: Feature Scaling - Min-Max scaling manually for features
def min_max_scale(df):
    return (df - df.min()) / (df.max() - df.min())

# Scaling the relevant features: "Adj Close", "Daily Change", "50-Day MA", "200-Day MA"
scaled_data = min_max_scale(data[['Adj Close', 'Daily Change', '50-Day MA', '200-Day MA']])

# Step 7: Prepare Training Data
# We are predicting 'Adj Close', so let's use the other columns to predict it.
X = scaled_data[['Daily Change', '50-Day MA', '200-Day MA']].values
y = scaled_data['Adj Close'].values

# Split into training and test sets (90% training, 10% testing)
train_size = int(len(X) * 0.9)
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Step 8: Linear Regression without sklearn (Manually)
def linear_regression(X, y):
    # Adding intercept column (ones) to X
    X = np.c_[np.ones(X.shape[0]), X]
    # Normal Equation: theta = (X.T * X)^-1 * X.T * y
    theta = np.linalg.inv(X.T @ X) @ X.T @ y
    return theta

# Train the model
theta = linear_regression(X_train, y_train)

# Step 9: Make Predictions
# Predict on the test set
X_test_with_intercept = np.c_[np.ones(X_test.shape[0]), X_test]
y_pred = X_test_with_intercept @ theta

# Step 10: Visualize the Predictions
plt.figure(figsize=(10, 6))
plt.plot(data.index[train_size:], y_test, label="True Stock Price", color='blue')
plt.plot(data.index[train_size:], y_pred, label="Predicted Stock Price", color='red')
plt.title("Microsoft Stock Price Prediction (1-Year Duration)")
plt.xlabel("Date")
plt.ylabel("Scaled Price")
plt.legend()
plt.show()

# Step 11: Evaluate the Model
# You can compute RMSE (Root Mean Squared Error) as a simple evaluation metric
rmse = np.sqrt(np.mean((y_pred - y_test) ** 2))
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Step 12: Predict for the next year (extend the prediction)
# Use the last 200 days as input to predict the next year
last_200_days = scaled_data[['Daily Change', '50-Day MA', '200-Day MA']].iloc[-200:].values
last_200_days_with_intercept = np.c_[np.ones(last_200_days.shape[0]), last_200_days]
future_predictions = last_200_days_with_intercept @ theta

# Plot Future Predictions
plt.figure(figsize=(10, 6))
plt.plot(data.index, data['Adj Close'], label="Historical Stock Price")
plt.plot(pd.date_range(data.index[-1], periods=252, freq='B'), future_predictions, label="Future Stock Predictions (Next Year)", color='orange')
plt.title("Microsoft Stock Price - Historical & Future Predictions")
plt.xlabel("Date")
plt.ylabel("Price")
plt.legend()
plt.show()


[*********************100%***********************]  1 of 1 completed
  data.fillna(method='ffill', inplace=True)


Price           Close       High        Low       Open    Volume
Ticker           MSFT       MSFT       MSFT       MSFT      MSFT
Date                                                            
2010-01-04  23.300676  23.413603  23.029650  23.052236  38409100
2010-01-05  23.308208  23.413608  23.067297  23.225396  49749600
2010-01-06  23.165167  23.398550  22.976955  23.247979  58182400
2010-01-07  22.924259  23.112471  22.728518  23.059771  50559700
2010-01-08  23.082350  23.247976  22.766153  22.796268  51197400
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3774 entries, 2010-01-04 to 2024-12-31
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   (Close, MSFT)   3774 non-null   float64
 1   (High, MSFT)    3774 non-null   float64
 2   (Low, MSFT)     3774 non-null   float64
 3   (Open, MSFT)    3774 non-null   float64
 4   (Volume, MSFT)  3774 non-null   int64  
dtypes: float64(4), int64(1)
memory usage

KeyError: 'Adj Close'