In [1]:
import numpy as np 
import pandas as pd

In [2]:
BASE_PATH = r"D:\DATA_ANALYST\FULL_STACK_FROJECT\Stock Market Prediction\stock_market_unclean_dataset"
df = pd.read_csv(BASE_PATH + r"\raw_data\global_indices.csv")

In [3]:
df.head()

Unnamed: 0,date,index,value
0,2015-01-01,SENSEX,21722.15
1,2015-01-02,SENSEX,21692.84
2,2015-01-05,SENSEX,21923.69
3,2015-01-06,SENSEX,22013.51
4,2015-01-07,SENSEX,22076.64


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11568 entries, 0 to 11567
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    11568 non-null  object 
 1   index   11568 non-null  object 
 2   value   11568 non-null  float64
dtypes: float64(1), object(2)
memory usage: 271.3+ KB


In [5]:
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df.dropna(subset=['date'], inplace=True)

In [6]:
df['index'] = df['index'].str.strip().str.upper()

In [7]:
df.sort_values(by=['index', 'date'], inplace=True)

In [8]:
df.drop_duplicates(subset=['index', 'date'], inplace=True)

In [9]:
df = df[df['value'] > 0]

In [10]:
df = df[df['value'] < df['value'].quantile(0.999)]

In [11]:
df.reset_index(drop=True, inplace=True)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11556 entries, 0 to 11555
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    11556 non-null  datetime64[ns]
 1   index   11556 non-null  object        
 2   value   11556 non-null  float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 271.0+ KB


In [13]:
df.head()

Unnamed: 0,date,index,value
0,2015-01-01,DOWJONES,16398.7
1,2015-01-02,DOWJONES,16288.48
2,2015-01-05,DOWJONES,16113.54
3,2015-01-06,DOWJONES,16168.26
4,2015-01-07,DOWJONES,16260.5


In [14]:
df['index_return'] = df.groupby('index')['value'].pct_change()

In [15]:
df['volatility_7d'] = (df.groupby('index')['index_return'].rolling(7).std().reset_index(level=0, drop=True))

In [16]:
df.head(10)

Unnamed: 0,date,index,value,index_return,volatility_7d
0,2015-01-01,DOWJONES,16398.7,,
1,2015-01-02,DOWJONES,16288.48,-0.006721,
2,2015-01-05,DOWJONES,16113.54,-0.01074,
3,2015-01-06,DOWJONES,16168.26,0.003396,
4,2015-01-07,DOWJONES,16260.5,0.005705,
5,2015-01-08,DOWJONES,16159.23,-0.006228,
6,2015-01-09,DOWJONES,15906.91,-0.015615,
7,2015-01-12,DOWJONES,15938.43,0.001982,0.00793
8,2015-01-13,DOWJONES,15809.36,-0.008098,0.008025
9,2015-01-14,DOWJONES,15884.0,0.004721,0.008062


In [17]:
df.dropna(inplace=True)

In [18]:
df.head(10)

Unnamed: 0,date,index,value,index_return,volatility_7d
7,2015-01-12,DOWJONES,15938.43,0.001982,0.00793
8,2015-01-13,DOWJONES,15809.36,-0.008098,0.008025
9,2015-01-14,DOWJONES,15884.0,0.004721,0.008062
10,2015-01-15,DOWJONES,15680.52,-0.01281,0.008559
11,2015-01-16,DOWJONES,15824.33,0.009171,0.009304
12,2015-01-19,DOWJONES,16066.16,0.015282,0.011642
13,2015-01-20,DOWJONES,16136.31,0.004366,0.009679
14,2015-01-21,DOWJONES,16235.32,0.006136,0.009798
15,2015-01-22,DOWJONES,16386.74,0.009327,0.008762
16,2015-01-23,DOWJONES,16390.03,0.000201,0.008965


In [19]:
df['index'].value_counts()

index
DOWJONES    2885
NASDAQ      2885
NIFTY50     2885
SENSEX      2873
Name: count, dtype: int64

FEATURE ENGINEERING

In [20]:
df = df.sort_values(['index', 'date'])

In [21]:
df['daily_return'] = df.groupby('index')['value'].pct_change()

In [22]:
df['return_7d']  = df.groupby('index')['daily_return'].rolling(7).mean().reset_index(level=0, drop=True)
df['return_21d'] = df.groupby('index')['daily_return'].rolling(21).mean().reset_index(level=0, drop=True)

In [23]:
df['volatility_7d'] = df.groupby('index')['daily_return'].rolling(7).std().reset_index(level=0, drop=True)
df['volatility_21d'] = df.groupby('index')['daily_return'].rolling(21).std().reset_index(level=0, drop=True)

In [24]:
df['ma_7']  = df.groupby('index')['value'].rolling(7).mean().reset_index(level=0, drop=True)
df['ma_21'] = df.groupby('index')['value'].rolling(21).mean().reset_index(level=0, drop=True)

In [25]:
df['trend_flag'] = (df['ma_7'] > df['ma_21']).astype(int)

In [26]:
df['momentum'] = df['return_7d'] - df['return_21d']

In [27]:
df['rolling_max'] = df.groupby('index')['value'].cummax()
df['drawdown'] = (df['value'] - df['rolling_max']) / df['rolling_max']

In [28]:
df['volatility_21d'].describe()


count    11444.000000
mean         0.009275
std          0.006451
min          0.001528
25%          0.004693
50%          0.006762
75%          0.011868
max          0.046392
Name: volatility_21d, dtype: float64

In [29]:
low = df['volatility_21d'].quantile(0.20)
high = df['volatility_21d'].quantile(0.70) 
df['Market_Volatility_Level'] = pd.cut(
    df['volatility_21d'],
    bins=[0, low, high, df['volatility_21d'].max()],
    labels=['Low Volatility', 'Medium Volatility', 'High Volatility']
)

In [30]:
df['Market_Volatility_Level'].value_counts()

Market_Volatility_Level
Medium Volatility    5722
High Volatility      3433
Low Volatility       2289
Name: count, dtype: int64

In [31]:
df.head()

Unnamed: 0,date,index,value,index_return,volatility_7d,daily_return,return_7d,return_21d,volatility_21d,ma_7,ma_21,trend_flag,momentum,rolling_max,drawdown,Market_Volatility_Level
7,2015-01-12,DOWJONES,15938.43,0.001982,,,,,,,,0,,15938.43,0.0,
8,2015-01-13,DOWJONES,15809.36,-0.008098,,-0.008098,,,,,,0,,15938.43,-0.008098,
9,2015-01-14,DOWJONES,15884.0,0.004721,,0.004721,,,,,,0,,15938.43,-0.003415,
10,2015-01-15,DOWJONES,15680.52,-0.01281,,-0.01281,,,,,,0,,15938.43,-0.016182,
11,2015-01-16,DOWJONES,15824.33,0.009171,,0.009171,,,,,,0,,15938.43,-0.007159,


In [32]:
df = df.dropna(subset=['return_21d', 'volatility_21d', 'ma_21'])

In [33]:
df.head()

Unnamed: 0,date,index,value,index_return,volatility_7d,daily_return,return_7d,return_21d,volatility_21d,ma_7,ma_21,trend_flag,momentum,rolling_max,drawdown,Market_Volatility_Level
28,2015-02-10,DOWJONES,16528.48,0.000923,0.004951,0.000923,0.002816,0.001757,0.007144,16461.085714,16240.683333,1,0.001059,16528.48,0.0,Medium Volatility
29,2015-02-11,DOWJONES,16518.65,-0.000595,0.003511,-0.000595,0.00117,0.002114,0.006806,16480.237143,16274.459048,1,-0.000944,16528.48,-0.000595,Medium Volatility
30,2015-02-12,DOWJONES,16295.62,-0.013502,0.006551,-0.013502,-0.000957,0.001246,0.007576,16464.272857,16294.06,1,-0.002204,16528.48,-0.014088,Medium Volatility
31,2015-02-13,DOWJONES,16291.05,-0.00028,0.00646,-0.00028,-0.001239,0.001843,0.006874,16443.687143,16323.132857,1,-0.003082,16528.48,-0.014365,Medium Volatility
32,2015-02-16,DOWJONES,16262.38,-0.00176,0.005781,-0.00176,-0.00225,0.001322,0.006703,16406.52,16343.992381,1,-0.003573,16528.48,-0.016099,Medium Volatility


In [34]:
df = df.rename(columns={'value': 'index_price',
                        'index_return': 'daily_return_pct', 
                        'daily_return': 'smoothed_daily_return_pct', 
                        'return_7d': 'avg_return_7d', 
                        'return_21d': 'avg_return_21d', 
                        'volatility_7d': 'volatility_7d_pct', 
                        'volatility_21d': 'volatility_21d_pct', 
                        'ma_7': 'moving_avg_7d', 
                        'ma_21': 'moving_avg_21d', 
                        'rolling_max': 'all_time_high_till_date', 
                        'drawdown': 'drawdown_pct'})


In [35]:
df.head()

Unnamed: 0,date,index,index_price,daily_return_pct,volatility_7d_pct,smoothed_daily_return_pct,avg_return_7d,avg_return_21d,volatility_21d_pct,moving_avg_7d,moving_avg_21d,trend_flag,momentum,all_time_high_till_date,drawdown_pct,Market_Volatility_Level
28,2015-02-10,DOWJONES,16528.48,0.000923,0.004951,0.000923,0.002816,0.001757,0.007144,16461.085714,16240.683333,1,0.001059,16528.48,0.0,Medium Volatility
29,2015-02-11,DOWJONES,16518.65,-0.000595,0.003511,-0.000595,0.00117,0.002114,0.006806,16480.237143,16274.459048,1,-0.000944,16528.48,-0.000595,Medium Volatility
30,2015-02-12,DOWJONES,16295.62,-0.013502,0.006551,-0.013502,-0.000957,0.001246,0.007576,16464.272857,16294.06,1,-0.002204,16528.48,-0.014088,Medium Volatility
31,2015-02-13,DOWJONES,16291.05,-0.00028,0.00646,-0.00028,-0.001239,0.001843,0.006874,16443.687143,16323.132857,1,-0.003082,16528.48,-0.014365,Medium Volatility
32,2015-02-16,DOWJONES,16262.38,-0.00176,0.005781,-0.00176,-0.00225,0.001322,0.006703,16406.52,16343.992381,1,-0.003573,16528.48,-0.016099,Medium Volatility


In [36]:
df = df.rename(columns={
    'date': 'trade_date',
    'index': 'index_name',
    'daily_return_pct': 'index_daily_return_pct',
    'momentum': 'return_momentum',})

df = df[['index_name','trade_date','index_price','index_daily_return_pct','avg_return_7d',
         'avg_return_21d','volatility_7d_pct','volatility_21d_pct','moving_avg_7d','moving_avg_21d',
         'trend_flag','return_momentum','all_time_high_till_date','drawdown_pct','Market_Volatility_Level']]

In [37]:
df.to_csv("global_indices.csv", index=False)

In [38]:
df.shape

(11444, 15)