## Importing Libraries

In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import yfinance as yf




### NIFTY 50 INDEX

##### Importing symbols (tickers) from Wikipedia using `pd.read_html`
##### Using these tickers to import data from Yahoo Finance for the NIFTY 50 constituents


In [2]:
# Read every HTML table on the NIFTY 50 Wikipedia page and keep the 'Symbol'
# column of the table at position 2.
# NOTE(review): the table is picked by position, so re-check the index if the page layout changes.
nifty50_tables = pd.read_html('https://en.wikipedia.org/wiki/NIFTY_50')
tickers = nifty50_tables[2]['Symbol']
print(tickers)

0       ADANIENT
1     ADANIPORTS
2     APOLLOHOSP
3     ASIANPAINT
4       AXISBANK
5     BAJAJ-AUTO
6     BAJFINANCE
7     BAJAJFINSV
8           BPCL
9     BHARTIARTL
10     BRITANNIA
11         CIPLA
12     COALINDIA
13      DIVISLAB
14       DRREDDY
15     EICHERMOT
16        GRASIM
17       HCLTECH
18      HDFCBANK
19      HDFCLIFE
20    HEROMOTOCO
21      HINDALCO
22    HINDUNILVR
23     ICICIBANK
24    INDUSINDBK
25          INFY
26           ITC
27      JSWSTEEL
28     KOTAKBANK
29            LT
30          LTIM
31           M&M
32        MARUTI
33     NESTLEIND
34          NTPC
35          ONGC
36     POWERGRID
37      RELIANCE
38       SBILIFE
39          SBIN
40     SUNPHARMA
41    TATAMOTORS
42     TATASTEEL
43           TCS
44    TATACONSUM
45         TECHM
46         TITAN
47    ULTRACEMCO
48           UPL
49         WIPRO
Name: Symbol, dtype: object


## Data Preparation (Iterating with a for loop over the tickers; the loop below currently runs on the first ticker only, `tickers[0:1]`)
#### Importing data from Yahoo Finance using the yfinance (`yf`) library
#### Setting the end date to the present (latest) and the start date to 15 years (15 × 365 days) before it, using `timedelta` from the datetime library
#### Setting the interval to 1 day, with pre-market/post-market data disabled (`prepost=False`)
#### Prev_Close (new column) = Close shifted down by 1 row, i.e. the previous row's Close
#### Prev_Volume (new column) = Volume shifted down by 1 row, i.e. the previous row's Volume

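Why is the same-day Volume not used as a feature?
Fundamentally, the volume that goes with a prediction is the one associated with the previous day's close, so Prev_Volume is used instead of Volume.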



In [3]:
# Download daily price history and add the previous row's Close and Volume as columns.
# `datetime` and `timedelta` come from the notebook's import cell.
# The date window does not depend on the ticker, so it is computed once, before the loop.
end_date = datetime.now()
start_date = end_date - timedelta(days=15*365)

# NOTE: tickers[0:1] restricts the loop to the first ticker; widen the slice to process more.
for ticker in tickers[0:1]:
  # The Yahoo Finance symbol is built by appending ".NS" to the ticker.
  raw_history = yf.download(ticker+".NS", start=start_date, end=end_date, interval='1d', prepost=False)

  # Build a new frame with .assign instead of adding columns to a slice of the
  # download, which avoids chained-assignment (SettingWithCopy) warnings.
  history = raw_history.loc[:, ['Open', 'Close', 'Volume']].assign(
      Prev_Close=lambda frame: frame.loc[:, 'Close'].shift(1),
      Prev_Volume=lambda frame: frame.loc[:, 'Volume'].shift(1),
  )

  display(history)

[*********************100%***********************]  1 of 1 completed


Unnamed: 0_level_0,Open,Close,Volume,Prev_Close,Prev_Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2008-07-31,49.048019,48.837265,293407,,
2008-08-01,48.523052,50.017483,581190,48.837265,293407.0
2008-08-04,49.814396,49.661118,150187,50.017483,581190.0
2008-08-05,49.124657,50.542450,142802,49.661118,150187.0
2008-08-06,45.446053,49.538498,379853,50.542450,142802.0
...,...,...,...,...,...
2023-07-21,2410.000000,2416.300049,2280563,2419.750000,1564275.0
2023-07-24,2430.000000,2418.199951,1113193,2416.300049,2280563.0
2023-07-25,2428.000000,2466.649902,3017555,2418.199951,1113193.0
2023-07-26,2472.449951,2470.750000,4021773,2466.649902,3017555.0


## Changing Date time
### Deriving the weekday of each row's date as an integer (0 for Monday, 1 for Tuesday, ..., 6 for Sunday). The resulting weekdays are stored in the "weekdays" list and added to the data as the new "Weekday" column.

In [4]:
# Weekday of every row's date as an integer: 0 = Monday, ..., 6 = Sunday.
# DatetimeIndex.weekday is used instead of parsing str(numpy.datetime64) with a
# fixed format, which only works when the string ends in exactly ".000000000".
weekdays = history.index.weekday.tolist()

# Adding new column Weekday for assigning week days number
history['Weekday'] = weekdays

display(history)

Unnamed: 0_level_0,Open,Close,Volume,Prev_Close,Prev_Volume,Weekday
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2008-07-31,49.048019,48.837265,293407,,,3
2008-08-01,48.523052,50.017483,581190,48.837265,293407.0,4
2008-08-04,49.814396,49.661118,150187,50.017483,581190.0,0
2008-08-05,49.124657,50.542450,142802,49.661118,150187.0,1
2008-08-06,45.446053,49.538498,379853,50.542450,142802.0,2
...,...,...,...,...,...,...
2023-07-21,2410.000000,2416.300049,2280563,2419.750000,1564275.0,4
2023-07-24,2430.000000,2418.199951,1113193,2416.300049,2280563.0,0
2023-07-25,2428.000000,2466.649902,3017555,2418.199951,1113193.0,1
2023-07-26,2472.449951,2470.750000,4021773,2466.649902,3017555.0,2


## Dropping rows NaN values
### Converting infinite values (e.g. a number divided by 0) to NaN by using `replace(..., np.nan)`
### Dropping the rows that contain these NaN values by using `dropna()`


In [10]:
# Treat both +inf and -inf as missing, then drop every row that contains a NaN
# (this also removes the first row, whose shifted Prev_Close/Prev_Volume are NaN).
history = history.replace([np.inf, -np.inf], np.nan).dropna()

## Machine Learning
### Using Linear Regression to predict Close price
### Data: The train/test split should be sequential (not random) because this is time-series data

In [85]:
from sklearn.linear_model import LinearRegression

# Target is the Close column; the features are every remaining column
# except Close itself and the same-day Volume.
y = history.loc[:, 'Close']
x = history.drop(columns=['Close', 'Volume'])

# Number of most recent rows held out for testing the model's performance.
num_test = 365

# Sequential split: the last `num_test` rows form the test set and all
# earlier rows form the training set.
split_at = -num_test
x_train, x_test = x.iloc[:split_at], x.iloc[split_at:]
y_train, y_test = y.iloc[:split_at], y.iloc[split_at:]

# Shapes are (number of samples, number of features) for x and (number of samples,) for y.
print(x_train.shape , y_train.shape , x_test.shape, y_test.shape)

(3328, 4) (3328,) (365, 4) (365,)


In [86]:
# Fit a linear regression on the training rows, then predict Close for the test rows.
model = LinearRegression().fit(x_train , y_train)
preds = model.predict(x_test)

# Show the container type of the predictions and of the actual values.
for obj in (preds, y_test):
  print(type(obj))

# `ticker` is the loop variable left over from the download cell.
print(ticker)

# Actual values first, predictions second.
print(y_test, preds, sep='\n')

<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>
ADANIENT
Date
2022-02-07    1731.800049
2022-02-08    1716.000000
2022-02-09    1783.150024
2022-02-10    1786.099976
2022-02-11    1764.300049
                 ...     
2023-07-21    2416.300049
2023-07-24    2418.199951
2023-07-25    2466.649902
2023-07-26    2470.750000
2023-07-27    2428.399902
Name: Close, Length: 365, dtype: float64
[1763.46710756 1737.75070648 1722.16429785 1792.11670242 1780.01604079
 1756.43946172 1655.80341129 1747.05564329 1745.44879592 1736.65096262
 1740.39062378 1668.78551274 1676.0273081  1666.67090613 1565.8369166
 1615.74028705 1644.58520772 1655.42416384 1637.55803662 1608.09363177
 1571.01526156 1607.36329832 1669.10857644 1748.18278865 1736.36896836
 1740.60808381 1713.05485591 1767.62815886 1829.28693802 1806.01224012
 1841.00141512 1815.35391485 1836.56852727 1874.1787801  1916.6499894
 1924.42961427 2002.08954891 2018.93518978 2049.69482673 2074.0649893
 2148.62589936 2165.61238791 2108.

In [87]:
# Only the last expression of a cell is displayed, so both lengths are returned
# together as a tuple; they should be equal (one prediction per test row).
len(y_test), len(preds)

AttributeError: ignored

In [88]:
# Show the held-out actual Close values (the test target) for reference.
print(y_test)

Date
2022-02-07    1731.800049
2022-02-08    1716.000000
2022-02-09    1783.150024
2022-02-10    1786.099976
2022-02-11    1764.300049
                 ...     
2023-07-21    2416.300049
2023-07-24    2418.199951
2023-07-25    2466.649902
2023-07-26    2470.750000
2023-07-27    2428.399902
Name: Close, Length: 365, dtype: float64


In [93]:
# Report the container type of the predictions, then their shape.
for summary in (type(preds), preds.shape):
  print(summary)

<class 'numpy.ndarray'>
(365,)


AttributeError: ignored