In [1]:
#importing necessary libararies

import numpy as np
from scipy.stats import ks_2samp
from statsmodels.stats.proportion import proportions_ztest
import pandas as pd


Part 1 :A/B Testing using Ad Click Prediction

In [2]:

# printing first few rows of dataset
df = pd.read_csv("ad_click_dataset.csv")
print(df.head())

     id full_name   age      gender device_type ad_position browsing_history  \
0   670   User670  22.0         NaN     Desktop         Top         Shopping   
1  3044  User3044   NaN        Male     Desktop         Top              NaN   
2  5912  User5912  41.0  Non-Binary         NaN        Side        Education   
3  5418  User5418  34.0        Male         NaN         NaN    Entertainment   
4  9452  User9452  39.0  Non-Binary         NaN         NaN     Social Media   

  time_of_day  click  
0   Afternoon      1  
1         NaN      1  
2       Night      1  
3     Evening      1  
4     Morning      0  


In [3]:
print(df.info())   #getting information of the data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                10000 non-null  int64  
 1   full_name         10000 non-null  object 
 2   age               5234 non-null   float64
 3   gender            5307 non-null   object 
 4   device_type       8000 non-null   object 
 5   ad_position       8000 non-null   object 
 6   browsing_history  5218 non-null   object 
 7   time_of_day       8000 non-null   object 
 8   click             10000 non-null  int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 703.3+ KB
None


In [None]:

# doing data cleaning
print("\nMissing values:\n", df.isnull().sum())

df = df.dropna(subset=['click'])
df = df.dropna(subset=['ad_position'])


Missing values:
 id                     0
full_name              0
age                 4766
gender              4693
device_type         2000
ad_position         2000
browsing_history    4782
time_of_day         2000
click                  0
dtype: int64


In [6]:
print(df.info()) 

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                8000 non-null   int64  
 1   full_name         8000 non-null   object 
 2   age               4186 non-null   float64
 3   gender            4221 non-null   object 
 4   device_type       6433 non-null   object 
 5   ad_position       8000 non-null   object 
 6   browsing_history  4227 non-null   object 
 7   time_of_day       6408 non-null   object 
 8   click             8000 non-null   int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 625.0+ KB
None


In [8]:
df


Unnamed: 0,id,full_name,age,gender,device_type,ad_position,browsing_history,time_of_day,click
17,188,User188,56.0,1.0,Tablet,1.0,News,Morning,1
25,4890,User4890,43.0,0.0,Tablet,1.0,Education,Afternoon,1
33,4985,User4985,37.0,0.0,Mobile,0.0,News,Evening,0
52,9888,User9888,49.0,0.0,Mobile,0.0,News,Morning,1
102,8201,User8201,59.0,1.0,Desktop,1.0,Social Media,Morning,0
...,...,...,...,...,...,...,...,...,...
9951,7268,User7268,28.0,1.0,Desktop,1.0,News,Evening,1
9952,5912,User5912,41.0,,Mobile,,Education,Night,1
9960,9638,User9638,64.0,,Desktop,0.0,Entertainment,Morning,0
9986,5574,User5574,52.0,1.0,Desktop,1.0,Shopping,Afternoon,1


In [7]:
print(df['gender'].unique())
print(df['ad_position'].unique())


[nan 'Male' 'Non-Binary' 'Female']
['Top' 'Side' 'Bottom']


In [8]:
# The categorical columns are gender,ad_position
df['gender'] = df['gender'].astype('category')
df['ad_position'] = df['ad_position'].astype('category')

In [None]:
# converting to categorical
df['gender'] = df['gender'].astype('category')
df['ad_position'] = df['ad_position'].astype('category')

# map gender categories
df['gender'] = df['gender'].map({'Male': 0, 'Female': 1, 'Non-Binary': 2})

# convert to category again and add -1 to handle NaN values
df['gender'] = df['gender'].astype('category')
df['gender'] = df['gender'].cat.add_categories([-1])
df['gender'] = df['gender'].fillna(-1)

# mapping ad_position categories
df['ad_position'] = df['ad_position'].map({'Top':0,'Bottom':1,'Side':2})


In [10]:
df.head()

Unnamed: 0,id,full_name,age,gender,device_type,ad_position,browsing_history,time_of_day,click
0,670,User670,22.0,-1,Desktop,0,Shopping,Afternoon,1
1,3044,User3044,,0,Desktop,0,,,1
2,5912,User5912,41.0,2,,2,Education,Night,1
5,5942,User5942,,2,,1,Social Media,Evening,1
6,7808,User7808,26.0,1,Desktop,0,,,1


In [11]:
# splitting into group A (Top) and group B (bottom)
group_a = df[df['ad_position'] == 0]
group_b = df[df['ad_position'] == 1]

In [15]:
group_a.head()


Unnamed: 0,id,full_name,age,gender,device_type,ad_position,browsing_history,time_of_day,click
0,670,User670,22.0,-1,Desktop,0,Shopping,Afternoon,1
1,3044,User3044,,0,Desktop,0,,,1
6,7808,User7808,26.0,1,Desktop,0,,,1
15,7529,User7529,,-1,,0,Entertainment,Afternoon,0
18,2124,User2124,,0,Desktop,0,,Evening,1


In [16]:
group_b.head()

Unnamed: 0,id,full_name,age,gender,device_type,ad_position,browsing_history,time_of_day,click
5,5942,User5942,,2,,1,Social Media,Evening,1
8,7993,User7993,,2,Mobile,1,Social Media,,1
9,4509,User4509,,-1,,1,Education,Afternoon,1
10,2595,User2595,,-1,,1,,Morning,1
11,7466,User7466,47.0,-1,Mobile,1,,Afternoon,1


In [17]:
# clicks and impressions
clicks_A = group_a['click'].sum()
clicks_B = group_b['click'].sum()
n_A = len(group_a)
n_B = len(group_b)

In [18]:
# CTRs :Click through rate
ctr_A = clicks_A / n_A
ctr_B = clicks_B / n_B

print(f"CTR (Top ads): {ctr_A:.4f}")
print(f"CTR (Bottom ads): {ctr_B:.4f}")

CTR (Top ads): 0.6350
CTR (Bottom ads): 0.6873


In [19]:
# performing z-test
z_score, p_value = proportions_ztest([clicks_A, clicks_B], [n_A, n_B])
print(f"Z_score: {z_score:.4f}")
print(f"P_value: {p_value:.4f}")

Z_score: -4.0642
P_value: 0.0000


In [20]:
alpha = 0.05
if p_value < alpha:
    print("Result: Statistically significant difference in CTR between top and bottom positions.")
else:
    print("Result: No statistically significant difference in CTR.")

Result: Statistically significant difference in CTR between top and bottom positions.


Part2 :  Covariate Shift Detection Using Air Quality Data

In [21]:
# loading air quality datasets
train = pd.read_csv("train.csv")
test1 = pd.read_csv("test1.csv")
test2 = pd.read_csv("test2.csv")

In [22]:
train.head()

Unnamed: 0.1,Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,1849,26/05/2004,19.00.00,-200,1130.0,-200.0,227,1368.0,-200.0,933.0,-200.0,1709.0,1269.0,267,195,6754,,
1,2533,24/06/2004,07.00.00,12,1030.0,-200.0,69,851.0,102.0,824.0,68.0,1700.0,983.0,219,570,14742,,
2,3047,15/07/2004,17.00.00,32,1164.0,-200.0,203,1306.0,259.0,648.0,198.0,1886.0,1218.0,355,191,10888,,
3,805,13/04/2004,07.00.00,39,1496.0,524.0,191,1272.0,328.0,667.0,130.0,2011.0,1399.0,110,642,8398,,
4,2962,12/07/2004,04.00.00,-200,780.0,-200.0,18,568.0,24.0,1200.0,34.0,1331.0,501.0,199,513,11803,,


In [23]:
test1.head()

Unnamed: 0.1,Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,3123,18/07/2004,21.00.00,12,1067.0,-200.0,90,938.0,102.0,825.0,99.0,1520.0,912.0,297,248,10160,,
1,877,16/04/2004,07.00.00,45,1657.0,523.0,232,1384.0,352.0,579.0,109.0,2176.0,1600.0,128,710,10428,,
2,3457,01/08/2004,19.00.00,14,1037.0,-200.0,80,900.0,75.0,817.0,95.0,1584.0,619.0,331,327,16200,,
3,1494,12/05/2004,00.00.00,17,1122.0,-200.0,87,926.0,105.0,805.0,88.0,1619.0,1174.0,169,588,11250,,
4,713,09/04/2004,11.00.00,26,-200.0,262.0,-2000,-200.0,219.0,-200.0,121.0,-200.0,-200.0,-200,-200,-200,,


In [24]:
test2.head()

Unnamed: 0.1,Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,8500,27/02/2005,22.00.00,10,875.0,-200.0,21,594.0,128.0,1079.0,105.0,793.0,451.0,45,480,4085,,
1,8501,27/02/2005,23.00.00,13,943.0,-200.0,39,703.0,169.0,950.0,119.0,870.0,581.0,43,486,4069,,
2,8502,28/02/2005,00.00.00,16,947.0,-200.0,38,697.0,215.0,913.0,150.0,878.0,698.0,40,500,4115,,
3,8503,28/02/2005,01.00.00,10,865.0,-200.0,18,566.0,111.0,1119.0,94.0,797.0,423.0,40,529,4338,,
4,8504,28/02/2005,02.00.00,6,823.0,-200.0,10,503.0,60.0,1268.0,56.0,755.0,332.0,40,510,4200,,


In [25]:
train_no2 = train['NO2(GT)'].dropna()
test1_no2 = test1['NO2(GT)'].dropna()
test2_no2 = test2['NO2(GT)'].dropna()

In [26]:
# just preview
print("\nTrain NO2(GT):", train_no2.describe())
print("Test1 NO2(GT):", test1_no2.describe())
print("Test2 NO2(GT):", test2_no2.describe())


Train NO2(GT): count    3200.000000
mean       45.605625
std       114.663990
min      -200.000000
25%        47.750000
50%        84.000000
75%       114.000000
max       233.000000
Name: NO2(GT), dtype: float64
Test1 NO2(GT): count    800.000000
mean      42.621250
std      117.115831
min     -200.000000
25%       46.750000
50%       84.000000
75%      114.000000
max      223.000000
Name: NO2(GT), dtype: float64
Test2 NO2(GT): count    800.000000
mean     129.682500
std       61.071957
min     -200.000000
25%      100.000000
50%      133.000000
75%      163.250000
max      248.000000
Name: NO2(GT), dtype: float64


In [27]:
# comparing train vs test1
ks_stat1, p_val1 = ks_2samp(train_no2, test1_no2)
print(f"Test1 vs. Train: KS Stat = {ks_stat1:.4f}, P-Value = {p_val1:.4f}")


# comparing train vs test2
ks_stat2, p_val2 = ks_2samp(train_no2, test2_no2)
print(f"Test2 vs. Train: KS Stat = {ks_stat2:.4f}, P-Value = {p_val2:.4f}")



Test1 vs. Train: KS Stat = 0.0191, P-Value = 0.9722
Test2 vs. Train: KS Stat = 0.4075, P-Value = 0.0000


In [28]:
alpha = 0.05
shift_test1 = p_val1 < alpha  # True if shift detected
shift_test2 = p_val2 < alpha

print(f"Covariate Shift in Test1: {shift_test1}")
print(f"Covariate Shift in Test2: {shift_test2}")

Covariate Shift in Test1: False
Covariate Shift in Test2: True


In [29]:
# Covariance shift

if ks_stat1 > ks_stat2:
    print("Test1 shows a larger covariate shift from training data.")
else:
    print("Test2 shows a larger covariate shift from training data.")


Test2 shows a larger covariate shift from training data.
