# Import Required Libraries
Import the necessary libraries, including pandas, numpy, seaborn, matplotlib, and sklearn.

In [1]:
# Importing required libraries

# pandas for data manipulation
import pandas as pd

# numpy for numerical computations
import numpy as np

# seaborn for statistical data visualization
import seaborn as sns

# matplotlib for creating static, animated, and interactive visualizations in Python
import matplotlib.pyplot as plt

# sklearn for machine learning and data processing
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

# Load and Inspect the Dataset
Load the dataset using pandas and perform initial data inspection.

In [None]:
# Load and Inspect the Dataset

# Read the raw CSV into the working dataframe used by all later cells
df = pd.read_csv('stock_data.csv')

# Preview the first 5 rows
print(df.head(5))

# Report the (rows, columns) shape and the column labels
print('Shape of the dataframe:', df.shape)
print('Columns in the dataframe:', df.columns)

# Each report below is printed on the line after its heading

# Per-column data types
print('Data types of the columns:', df.dtypes, sep='\n')

# Per-column count of missing values
print('Missing values in the dataframe:', df.isna().sum(), sep='\n')

# Summary statistics as computed by describe()
print('Summary statistics of the dataframe:', df.describe(), sep='\n')

# Data Preprocessing
Handle missing values, convert data types if necessary, and perform any other necessary preprocessing steps.

In [None]:
# Data Preprocessing

# Handling missing values
# If any column has more than 50% data missing, we drop the column.
# `thresh` is the minimum number of non-missing values a column needs in order to be kept;
# it is documented as an integer, so round half the row count up
half_count = (len(df) + 1) // 2
df = df.dropna(thresh=half_count, axis=1)

# For the remaining missing values, we fill them with the mean of the respective column.
# numeric_only=True restricts the mean to numeric columns: without it, recent pandas
# versions raise a TypeError as soon as the dataframe contains a text column
df = df.fillna(df.mean(numeric_only=True))

# Convert data types if necessary
# Here we ensure that all numerical columns are of float type
numerical_cols = df.select_dtypes(include=[np.number]).columns
df[numerical_cols] = df[numerical_cols].astype(float)

# Check if there are any categorical variables. If yes, convert them to dummy variables.
# The 'object' selector is used because the `np.object` alias no longer exists in
# current NumPy releases
# NOTE(review): every object-typed column is one-hot encoded here. If the CSV has a date
# column that was read as text, it will expand into one column per distinct date --
# confirm that is intended, or parse it with pd.to_datetime before this step
categorical_cols = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_cols)

# Display the first 5 rows of the preprocessed dataframe
print(df.head())

# Display the shape of the preprocessed dataframe
print('Shape of the preprocessed dataframe:', df.shape)

# Display the column names of the preprocessed dataframe
print('Columns in the preprocessed dataframe:', df.columns)

# Display the data types of each column in the preprocessed dataframe
print('Data types of the columns in the preprocessed dataframe:')
print(df.dtypes)

# Check for missing values in the preprocessed dataframe
print('Missing values in the preprocessed dataframe:')
print(df.isnull().sum())

# Display the summary statistics of the preprocessed dataframe
print('Summary statistics of the preprocessed dataframe:')
print(df.describe())

# Feature Engineering
Create new features that might be useful for the prediction task.

In [None]:
# Feature Engineering

# All features below are computed row-by-row relative to the preceding rows
# NOTE(review): this assumes the rows are in chronological order -- confirm against the CSV

# Create a new feature 'price_change' which is the difference between the closing price of the current day and the closing price of the previous day
df['price_change'] = df['Close'].diff()

# Create a new feature 'price_change_percentage' which is the percentage change in closing price from the previous day
df['price_change_percentage'] = df['Close'].pct_change() * 100

# Create a new feature 'price_volatility' which is the standard deviation of the closing price over the past 5 days
df['price_volatility'] = df['Close'].rolling(window=5).std()

# Create a new feature 'volume_change' which is the difference between the volume of the current day and the volume of the previous day
df['volume_change'] = df['Volume'].diff()

# Create a new feature 'volume_change_percentage' which is the percentage change in volume from the previous day
df['volume_change_percentage'] = df['Volume'].pct_change() * 100

# Create a new feature 'volume_volatility' which is the standard deviation of the volume over the past 5 days
df['volume_volatility'] = df['Volume'].rolling(window=5).std()

# Create a new feature 'high_low_spread' which is the difference between the high price and the low price of the day
df['high_low_spread'] = df['High'] - df['Low']

# Create a new feature 'close_open_spread' which is the difference between the closing price and the opening price of the day
df['close_open_spread'] = df['Close'] - df['Open']

# A percentage change divides by the previous row's value, so a previous value of 0
# (e.g. a day with zero volume) produces +/-inf. dropna() does not remove infinities and
# they would break later numeric steps such as standardization, so mark them as missing
pct_cols = ['price_change_percentage', 'volume_change_percentage']
df[pct_cols] = df[pct_cols].replace([np.inf, -np.inf], np.nan)

# Drop the rows with missing values that were created due to feature engineering
# (the first rows of the diff / pct_change / rolling columns, plus any row marked above)
df = df.dropna()

# Display the first 5 rows of the dataframe after feature engineering
print(df.head())

# Display the shape of the dataframe after feature engineering
print('Shape of the dataframe after feature engineering:', df.shape)

# Display the column names of the dataframe after feature engineering
print('Columns in the dataframe after feature engineering:', df.columns)

# Display the data types of each column in the dataframe after feature engineering
print('Data types of the columns in the dataframe after feature engineering:')
print(df.dtypes)

# Check for missing values in the dataframe after feature engineering
print('Missing values in the dataframe after feature engineering:')
print(df.isnull().sum())

# Display the summary statistics of the dataframe after feature engineering
print('Summary statistics of the dataframe after feature engineering:')
print(df.describe())

# Data Standardization
Standardize the features to have zero mean and unit variance using sklearn's StandardScaler.

In [None]:
# Importing the StandardScaler from sklearn
from sklearn.preprocessing import StandardScaler

# Instantiate the StandardScaler
scaler = StandardScaler()

# Fit the scaler on every column of df and transform it in one step.
# The scaled values are wrapped back into a dataframe with the original column labels
# AND the original row index: df's index has gaps after the earlier dropna(), and
# without index=df.index the scaled frame would be renumbered from 0 and no longer
# line up row-for-row with df
df_scaled = pd.DataFrame(
    scaler.fit_transform(df),
    columns=df.columns,
    index=df.index,
)

# Display the first 5 rows of the standardized dataframe
print(df_scaled.head())

# Display the shape of the standardized dataframe
print('Shape of the standardized dataframe:', df_scaled.shape)

# Display the column names of the standardized dataframe
print('Columns in the standardized dataframe:', df_scaled.columns)

# Display the data types of each column in the standardized dataframe
print('Data types of the columns in the standardized dataframe:')
print(df_scaled.dtypes)

# Check for missing values in the standardized dataframe
print('Missing values in the standardized dataframe:')
print(df_scaled.isnull().sum())

# Display the summary statistics of the standardized dataframe
print('Summary statistics of the standardized dataframe:')
print(df_scaled.describe())

# Correlation Analysis
Perform correlation analysis to identify which features are highly correlated with the stock's 'High' or 'Close' price.

In [None]:
# Correlation Analysis

# Pairwise correlation coefficients between all columns of the standardized dataframe
corr_matrix = df_scaled.corr()

# Print the matrix on the line after its heading
print('Correlation Matrix:', corr_matrix, sep='\n')

# Draw the matrix as an annotated heatmap on a 12x8 inch figure
plt.figure(figsize=(12, 8))
sns.heatmap(data=corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

# Rank all columns by their correlation with 'High' and with 'Close', largest first
high_corr = corr_matrix['High'].sort_values(ascending=False)
close_corr = corr_matrix['Close'].sort_values(ascending=False)

# For each target, list the columns whose correlation coefficient exceeds 0.5
for heading, ranked in (
    ('Features highly correlated with High:', high_corr),
    ('Features highly correlated with Close:', close_corr),
):
    print(heading)
    print(ranked[ranked > 0.5])