In [2]:
#importing necessary libararies

import numpy as np
from scipy.stats import ks_2samp
from statsmodels.stats.proportion import proportions_ztest
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd


Part 1

In [5]:

# printing first few rows of dataset
df = pd.read_csv("ad_click_dataset.csv")
print(df.head())

     id full_name   age      gender device_type ad_position browsing_history  \
0   670   User670  22.0         NaN     Desktop         Top         Shopping   
1  3044  User3044   NaN        Male     Desktop         Top              NaN   
2  5912  User5912  41.0  Non-Binary         NaN        Side        Education   
3  5418  User5418  34.0        Male         NaN         NaN    Entertainment   
4  9452  User9452  39.0  Non-Binary         NaN         NaN     Social Media   

  time_of_day  click  
0   Afternoon      1  
1         NaN      1  
2       Night      1  
3     Evening      1  
4     Morning      0  


In [6]:
print(df.info())   #getting information of the data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                10000 non-null  int64  
 1   full_name         10000 non-null  object 
 2   age               5234 non-null   float64
 3   gender            5307 non-null   object 
 4   device_type       8000 non-null   object 
 5   ad_position       8000 non-null   object 
 6   browsing_history  5218 non-null   object 
 7   time_of_day       8000 non-null   object 
 8   click             10000 non-null  int64  
dtypes: float64(1), int64(2), object(6)
memory usage: 703.3+ KB
None


In [7]:

# doing data cleaning
print("\nMissing values:\n", df.isnull().sum())
df = df.dropna()  # Drop missing rows

# converting into categorical columns
df['gender'] = df['gender'].map({'Male': 0, 'Female': 1})
df['ad_position'] = df['ad_position'].map({'Top': 0, 'Bottom': 1})

 



Missing values:
 id                     0
full_name              0
age                 4766
gender              4693
device_type         2000
ad_position         2000
browsing_history    4782
time_of_day         2000
click                  0
dtype: int64


In [8]:
df


Unnamed: 0,id,full_name,age,gender,device_type,ad_position,browsing_history,time_of_day,click
17,188,User188,56.0,1.0,Tablet,1.0,News,Morning,1
25,4890,User4890,43.0,0.0,Tablet,1.0,Education,Afternoon,1
33,4985,User4985,37.0,0.0,Mobile,0.0,News,Evening,0
52,9888,User9888,49.0,0.0,Mobile,0.0,News,Morning,1
102,8201,User8201,59.0,1.0,Desktop,1.0,Social Media,Morning,0
...,...,...,...,...,...,...,...,...,...
9951,7268,User7268,28.0,1.0,Desktop,1.0,News,Evening,1
9952,5912,User5912,41.0,,Mobile,,Education,Night,1
9960,9638,User9638,64.0,,Desktop,0.0,Entertainment,Morning,0
9986,5574,User5574,52.0,1.0,Desktop,1.0,Shopping,Afternoon,1


In [9]:
# splitting into group A (Top) and group B (bottom)
group_a = df[df['ad_position'] == 0]
group_b = df[df['ad_position'] == 1]

In [10]:
# clicks and impressions
clicks_A = group_a['click'].sum()
clicks_B = group_b['click'].sum()
n_A = len(group_a)
n_B = len(group_b)

In [11]:
# CTRs :Click through rate
ctr_A = clicks_A / n_A
ctr_B = clicks_B / n_B

print(f"CTR (Top ads): {ctr_A:.4f}")
print(f"CTR (Bottom ads): {ctr_B:.4f}")

CTR (Top ads): 0.6327
CTR (Bottom ads): 0.6784


In [12]:
# performing z-test
z_score, p_value = proportions_ztest([clicks_A, clicks_B], [n_A, n_B])
print(f"Z_score: {z_score:.4f}")
print(f"P_value: {p_value:.4f}")

Z_score: -1.1365
P_value: 0.2557


In [13]:
alpha = 0.05
if p_value < alpha:
    print("Result: Statistically significant difference in CTR between top and bottom positions.")
else:
    print("Result: No statistically significant difference in CTR.")

Result: No statistically significant difference in CTR.


Part2 

In [14]:
# loading air quality datasets
train = pd.read_csv("train.csv")
test1 = pd.read_csv("test1.csv")
test2 = pd.read_csv("test2.csv")

In [15]:
train_no2 = train['NO2(GT)'].dropna()
test1_no2 = test1['NO2(GT)'].dropna()
test2_no2 = test2['NO2(GT)'].dropna()

In [16]:
# just preview
print("\nTrain NO2(GT):", train_no2.describe())
print("Test1 NO2(GT):", test1_no2.describe())
print("Test2 NO2(GT):", test2_no2.describe())


Train NO2(GT): count    3200.000000
mean       45.605625
std       114.663990
min      -200.000000
25%        47.750000
50%        84.000000
75%       114.000000
max       233.000000
Name: NO2(GT), dtype: float64
Test1 NO2(GT): count    800.000000
mean      42.621250
std      117.115831
min     -200.000000
25%       46.750000
50%       84.000000
75%      114.000000
max      223.000000
Name: NO2(GT), dtype: float64
Test2 NO2(GT): count    800.000000
mean     129.682500
std       61.071957
min     -200.000000
25%      100.000000
50%      133.000000
75%      163.250000
max      248.000000
Name: NO2(GT), dtype: float64


In [17]:
# comparing train vs test1
ks_stat1, p_val1 = ks_2samp(train_no2, test1_no2)
print(f"Test1 vs. Train: KS Stat = {ks_stat1:.4f}, P-Value = {p_val1:.4f}")


# comparing train vs test2
ks_stat2, p_val2 = ks_2samp(train_no2, test2_no2)
print(f"Test2 vs. Train: KS Stat = {ks_stat2:.4f}, P-Value = {p_val2:.4f}")



Test1 vs. Train: KS Stat = 0.0191, P-Value = 0.9722
Test2 vs. Train: KS Stat = 0.4075, P-Value = 0.0000


In [18]:
alpha = 0.05
shift_test1 = p_val1 < alpha  # True if shift detected
shift_test2 = p_val2 < alpha

print(f"Covariate Shift in Test1: {shift_test1}")
print(f"Covariate Shift in Test2: {shift_test2}")

Covariate Shift in Test1: False
Covariate Shift in Test2: True


In [19]:
# Covariance shift

if ks_stat1 > ks_stat2:
    print("Test1 shows a larger covariate shift from training data.")
else:
    print("Test2 shows a larger covariate shift from training data.")


Test2 shows a larger covariate shift from training data.
