In [4]:
import pandas as pd
from sklearn import linear_model
import matplotlib.pyplot as plt

# Load NAS.csv into a DataFrame and preview its first five rows
norw_data = pd.read_csv(filepath_or_buffer='NAS.csv')
norw_data.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2003-12-18,19.482599,19.596901,19.025499,19.139799,19.139799,4978496.0
1,2003-12-19,19.368299,19.425501,18.282801,18.454201,18.454201,1410901.0
2,2003-12-22,18.739901,18.739901,17.997101,18.0543,18.0543,137047.0
3,2003-12-23,17.997101,17.997101,17.3687,17.4258,17.4258,229418.0
4,2003-12-24,,,,,,


In [7]:
# The head() preview shows a row (index 4) that is NaN in every column except Date.
# info() prints the non-null count and dtype of each column, which shows how
# widespread the missing values are.
norw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4253 entries, 0 to 4252
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Date       4253 non-null   object 
 1   Open       4218 non-null   float64
 2   High       4218 non-null   float64
 3   Low        4218 non-null   float64
 4   Close      4218 non-null   float64
 5   Adj Close  4218 non-null   float64
 6   Volume     4218 non-null   float64
dtypes: float64(6), object(1)
memory usage: 232.7+ KB


In [9]:
# Percentage of missing values per column.
# isna() yields a boolean frame; the column-wise mean of those booleans is the
# fraction of missing entries, which is scaled to percent here.
norw_data.isna().mean() * 100

# The output shows that every column except Date has a small share of
# missing values (< 1%).
# That looks insignificant, so no rows are removed from norw_data here.

Date         0.000000
Open         0.822949
High         0.822949
Low          0.822949
Close        0.822949
Adj Close    0.822949
Volume       0.822949
dtype: float64

In [17]:
# Extract only the relevant columns: we work with the dates and the close values.
#
# errors='ignore' keeps this cell safe to re-run: norw_data is overwritten with
# its own reduced version, so on a second execution the listed columns are
# already gone and a plain drop() would raise KeyError.
unused_columns = ['Open', 'High', 'Low', 'Adj Close', 'Volume']
norw_data = norw_data.drop(columns=unused_columns, errors='ignore')

In [18]:
# Preview the reduced frame to confirm that only Date and Close remain
norw_data.head()

Unnamed: 0,Date,Close
0,2003-12-18,19.139799
1,2003-12-19,18.454201
2,2003-12-22,18.0543
3,2003-12-23,17.4258
4,2003-12-24,


In [20]:
# Split the data into an independent and a dependent variable.
# Date is the independent variable (the input);
# Close, the value we want to model, is the dependent variable (the output).

# Rows without a Close value (under 1% of the data) are excluded from the
# arrays: a NaN target makes scikit-learn's linear models raise on fit.
# norw_data itself is left untouched, so all rows are still available there.
has_close = norw_data.iloc[:, -1].notna()
rows_with_close = norw_data[has_close]

# Save dates to X and close values to Y.
# NOTE(review): Date has dtype object (strings), so X will need a numeric
# encoding before it is passed to an estimator -- confirm this happens later.
X = rows_with_close.iloc[:, :-1].values  # every column but the last (date), 2-D
Y = rows_with_close.iloc[:, -1].values   # last column (close), 1-D

In [53]:
from sklearn.model_selection import train_test_split

# Randomly partition X and Y into training and test subsets.
# 30% of the rows are held out for testing; random_state pins the shuffle so
# the same split is produced on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=0,
    test_size=0.3,
)

In [55]:
# Turn each target array into a 2-D column of shape (n_samples, 1);
# the -1 lets NumPy infer the number of rows from the array's length.
Y_train, Y_test = (target.reshape(-1, 1) for target in (Y_train, Y_test))