# Import Required Libraries
Import the libraries used in this notebook: pandas, numpy, scikit-learn (sklearn), and matplotlib.

In [21]:
# Importing the required libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations
from sklearn.model_selection import TimeSeriesSplit  # For time series cross-validation
from sklearn.linear_model import LinearRegression  # For the linear regression model
from sklearn.metrics import mean_squared_error  # For calculating the mean squared error
from sklearn.preprocessing import StandardScaler  # For standardizing the features
from matplotlib import pyplot as plt  # For data visualization

# Load and Explore the Dataset
Load the ACN_data.csv file into a pandas DataFrame and perform initial exploration.

In [22]:
# Read the ACN price history into a DataFrame
df = pd.read_csv('individual_stocks_5yr/individual_stocks_5yr/ACN_data.csv')

# Preview both ends of the data: the first 5 rows, then the last 5 rows
for preview in (df.head(), df.tail()):
    print(preview)

# Report the shape, column names, dtypes, and per-column missing-value counts,
# each printed after its descriptive label
summary_items = (
    ('The shape of the DataFrame is:', df.shape),
    ('The column names are:', df.columns),
    ('The data types of each column are:', df.dtypes),
    ('The number of missing values in each column are:', df.isnull().sum()),
)
for label, value in summary_items:
    print(label, value)

# Finish with the summary statistics table
print(df.describe())

         date   open    high    low  close   volume Name
0  2013-02-08  73.01  73.710  72.82  73.31  2000477  ACN
1  2013-02-11  73.09  73.270  72.10  73.07  1880055  ACN
2  2013-02-12  72.89  73.495  72.58  73.37  1710274  ACN
3  2013-02-13  73.32  73.710  73.20  73.56  1884631  ACN
4  2013-02-14  73.21  73.500  72.66  73.13  2096346  ACN
            date     open    high     low   close   volume Name
1254  2018-02-01  160.155  161.13  159.54  160.46  1692576  ACN
1255  2018-02-02  159.980  160.27  156.63  156.90  2183611  ACN
1256  2018-02-05  156.120  159.27  150.73  151.83  3733711  ACN
1257  2018-02-06  150.290  155.18  149.43  154.69  4259634  ACN
1258  2018-02-07  154.220  158.93  153.07  155.15  2918659  ACN
The shape of the DataFrame is: (1259, 7)
The column names are: Index(['date', 'open', 'high', 'low', 'close', 'volume', 'Name'], dtype='object')
The data types of each column are: date       object
open      float64
high      float64
low       float64
close     float64
volu

In [23]:
# Engineer five shifted copies of the target column ('close') as extra features.
# NOTE: despite the 't+k' names, Series.shift(k) with a positive k moves values DOWN
# the frame, so column 't+k' on a given row holds the close from k rows EARLIER
# (a lagged / past value), not a value from the future. The first k rows of 't+k'
# are therefore NaN.
# The column names are kept as 't+1' ... 't+5' so the exported CSV headers do not change.
for lag in range(1, 6):
    df['t+{}'.format(lag)] = df['close'].shift(lag)

# Display the first 5 rows of the DataFrame
print(df.head())

         date   open    high    low  close   volume Name    t+1    t+2    t+3  \
0  2013-02-08  73.01  73.710  72.82  73.31  2000477  ACN    NaN    NaN    NaN   
1  2013-02-11  73.09  73.270  72.10  73.07  1880055  ACN  73.31    NaN    NaN   
2  2013-02-12  72.89  73.495  72.58  73.37  1710274  ACN  73.07  73.31    NaN   
3  2013-02-13  73.32  73.710  73.20  73.56  1884631  ACN  73.37  73.07  73.31   
4  2013-02-14  73.21  73.500  72.66  73.13  2096346  ACN  73.56  73.37  73.07   

     t+4  t+5  
0    NaN  NaN  
1    NaN  NaN  
2    NaN  NaN  
3    NaN  NaN  
4  73.31  NaN  


In [24]:
# Pre-processing: discard the two non-numeric columns that are not used as
# model inputs -- the stock name ('Name') and the trading date ('date')
df = df.drop(columns=['Name', 'date'])

In [25]:
# Save the pre-processed frame to CSV, omitting the row index.
# Rows with NaN in the shifted columns are still present at this point.
preprocessed_csv = 'ACN_data_timeseries_validate_preprocess.csv'
df.to_csv(preprocessed_csv, index=False)

In [26]:
# NOTE: No feature standardization is applied. The columns are NOT on a common scale
# (volume is in the millions while the prices are in the tens to hundreds), but the
# predictions of an ordinary least squares fit with an intercept do not change when
# features are rescaled, so skipping the scaler does not affect the errors reported below.

# Drop every row that contains a NaN; this includes the first five rows, where the
# shifted 't+k' columns are not populated yet
df = df.dropna()

# Split the data into features and target variable: every remaining column except
# 'close' is a feature.
# NOTE(review): the features include the same-day open/high/low/volume, which are only
# fully known once that day's close is known too -- confirm this is intended and not
# target leakage.
X = df.drop(columns='close')
y = df['close']

# Initialize the TimeSeriesSplit object; each split trains on earlier rows and tests on
# the rows that follow them, so the training set is always before the test set
tscv = TimeSeriesSplit(n_splits=5)

# Initialize the Linear Regression model (the same instance is refit on every split, so
# after the loop it holds the fit from the final split's training rows)
model = LinearRegression()

# Per-split mean squared errors and root mean squared errors
mse = []
rmse = []

# Loop over the time series cross-validation splits
for train_index, test_index in tscv.split(X):
    # Split the data into training and testing sets (positional indices)
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Train the model on the earlier rows and predict the later ones
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Compute the fold's MSE once and derive the RMSE from it
    fold_mse = mean_squared_error(y_test, y_pred)
    mse.append(fold_mse)
    rmse.append(np.sqrt(fold_mse))

# Print the mean of the mean squared errors
print('The mean of the mean squared errors is:', np.mean(mse))

# Print the mean of the root mean squared errors
print('The mean of the root mean squared errors is:', np.mean(rmse))

The mean of the mean squared errors is: 0.19721058591497273
The mean of the root mean squared errors is: 0.4370585106025078


In [27]:
# Score every row with the fitted model and save the result alongside the inputs.
# NOTE(review): `model` was last fit inside the cross-validation loop, on the final
# split's training rows only, so most of the rows scored here were seen during fitting.
y_pred = model.predict(X)

# Attach the predictions as a new 'predictions' column
df = df.assign(predictions=y_pred)

# Write the frame, including the predictions, to a new CSV file without the row index
df.to_csv('ACN_data_timeseries_validate_predictions.csv', index=False)
