In [1]:
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
# Location of the raw (uncleaned) price file.
# NOTE(review): this is an absolute, machine-specific path; point it at your own copy to re-run.
raw_csv_path = r"C:\Users\chira\Downloads\8.nifty_alpha_factors_unclean.csv"

# Load the raw data and preview the first rows.
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,Date,Stock,Open,High,Low,Close,Volume
0,2023-01-02,RELIANCE,100.706957,101.009809,99.623462,100.496714,1449590.0
1,2023-01-03,RELIANCE,100.1728,101.521987,99.63118,100.35845,3604438.0
2,2023-01-04,RELIANCE,100.977306,101.684174,100.432942,101.006138,
3,2023-01-05,RELIANCE,102.850805,103.990239,101.42572,102.529168,3145529.0
4,2023-01-06,RELIANCE,102.25428,102.943678,101.126802,102.295015,2325541.0


Inspecting the Raw data

In [3]:
# Inspect the raw frame: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6500 entries, 0 to 6499
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Date    6500 non-null   object 
 1   Stock   6500 non-null   object 
 2   Open    6500 non-null   float64
 3   High    6500 non-null   float64
 4   Low     6500 non-null   float64
 5   Close   6466 non-null   float64
 6   Volume  6466 non-null   float64
dtypes: float64(5), object(2)
memory usage: 355.6+ KB


In [4]:
 # Number of missing values in each column
df.isna().sum()

Date       0
Stock      0
Open       0
High       0
Low        0
Close     34
Volume    34
dtype: int64

In [5]:
# Count rows that are exact duplicates of an earlier row (all columns equal)

df.duplicated().sum()

7

In [6]:
# Issues found during inspection that this cell addresses:
#   * Date is stored as object (string) and must become a datetime
#   * Close and Volume each have 34 missing values
#   * 7 rows are exact duplicates
df = (
    df.assign(Date=lambda frame: pd.to_datetime(frame["Date"]))
      .dropna(subset=["Close", "Volume"])  # drop rows missing either field
      .drop_duplicates()                   # keep the first of each identical row
)

print("After Cleaning")
df.info()

After Cleaning
<class 'pandas.core.frame.DataFrame'>
Index: 6425 entries, 0 to 6499
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   Date    6425 non-null   datetime64[ns]
 1   Stock   6425 non-null   object        
 2   Open    6425 non-null   float64       
 3   High    6425 non-null   float64       
 4   Low     6425 non-null   float64       
 5   Close   6425 non-null   float64       
 6   Volume  6425 non-null   float64       
dtypes: datetime64[ns](1), float64(5), object(1)
memory usage: 401.6+ KB


Momentum Factor (Past Return): 20 days

In [7]:
# Order rows by stock and then by date so that, within each stock, a row offset
# of N means "N observations earlier" for that same stock.
df = df.sort_values(by=["Stock", "Date"])

# Look-back length, in rows, used for the momentum factor.
momentum_lookback = 20

# Momentum = percentage change of Close versus the value `momentum_lookback` rows
# earlier, computed separately for each stock. The built-in grouped pct_change is
# used directly (same approach as the daily-return calculation) instead of
# wrapping Series.pct_change in transform + lambda. The first `momentum_lookback`
# rows of every stock have no earlier reference value and are therefore NaN.
df["Momentum_20d"] = df.groupby("Stock")["Close"].pct_change(periods=momentum_lookback)

In [8]:
# Preview the last five rows together with the new momentum column
df.loc[:, ["Date", "Stock", "Close", "Momentum_20d"]].tail(5)

Unnamed: 0,Date,Stock,Close,Momentum_20d
1948,2025-06-24,TCS,96.050586,0.071867
1949,2025-06-25,TCS,97.431301,0.070576
1950,2025-06-26,TCS,98.277708,0.065232
1951,2025-06-27,TCS,98.786849,0.069065
1952,2025-06-30,TCS,98.918544,0.054544


Volatility Factor (Risk Signal): measures how much a stock's price fluctuates. It can be used on its own or to normalize other factors.

In [9]:
# Daily simple return: Close relative to the previous row of the same stock
df["Daily_Return"] = df.groupby("Stock")["Close"].pct_change()


def _rolling_std_20(returns):
    """Return the rolling standard deviation of `returns` over a 20-row window."""
    return returns.rolling(window=20).std()


# Volatility = 20-row rolling standard deviation of daily returns, evaluated per
# stock; leading rows of each stock stay NaN until a full window is available.
df["Volatility_20d"] = df.groupby("Stock")["Daily_Return"].transform(_rolling_std_20)


In [10]:
# Preview the last five rows together with the new volatility column
df.loc[:, ["Date", "Stock", "Close", "Volatility_20d"]].tail(5)

Unnamed: 0,Date,Stock,Close,Volatility_20d
1948,2025-06-24,TCS,96.050586,0.01242
1949,2025-06-25,TCS,97.431301,0.01236
1950,2025-06-26,TCS,98.277708,0.01219
1951,2025-06-27,TCS,98.786849,0.01219
1952,2025-06-30,TCS,98.918544,0.011879


We now have Momentum_20d (trend signal) and Volatility_20d (risk measure). These are the two foundational alpha factors.


Rank Stocks by Momentum

In [11]:
# Pick the most recent date with full data, i.e. the latest date on which every
# stock in the frame has a non-missing Momentum_20d. Rows with a missing
# Close/Volume were dropped during cleaning, so the last date in the frame is
# not guaranteed to contain every stock; using it blindly could silently leave
# a stock out of the ranking.
universe_size = df["Stock"].nunique()
stocks_per_date = (
    df.dropna(subset=["Momentum_20d"])
      .groupby("Date")["Stock"]
      .nunique()
)
complete_dates = stocks_per_date[stocks_per_date == universe_size].index

# Fall back to the last available date if no single date covers every stock.
latest_date = complete_dates.max() if len(complete_dates) else df["Date"].max()

# Cross-section of the stocks on that date (copy so later edits do not touch df)
snapshot = df[df["Date"] == latest_date].copy()

# Drop stocks with a missing Momentum_20d (e.g. not enough history for the
# 20-row look-back yet).
snapshot = snapshot.dropna(subset=["Momentum_20d"])

# Rank by momentum, strongest first
snapshot = snapshot.sort_values("Momentum_20d", ascending=False)

# View the top 5 stocks
snapshot[["Stock", "Momentum_20d", "Volatility_20d"]].head()
                           

Unnamed: 0,Stock,Momentum_20d,Volatility_20d
6499,KOTAKBANK,0.064552,0.010734
1952,TCS,0.054544,0.011879
5853,ASIANPAINT,0.047227,0.019436
3254,ITC,0.034306,0.008481
4551,SBIN,0.026799,0.006618


In [12]:
# Write the current df (cleaned rows plus the computed factor columns) to CSV, omitting the index
df.to_csv("8.nifty_alpha_factors_cleaned.csv", index=False)