In [2]:
import pandas as pd

# Load the raw export; `df` is kept unmodified so the filter below can be re-run safely.
df = pd.read_csv("../data/website_wata.csv")

# Keep only rows with at least one page view; .copy() detaches the result from `df`.
df_clean = df.loc[df["Page Views"].gt(0)].copy()
df_clean.shape


(1986, 7)

In [3]:
# Rows per traffic source, most frequent first (value_counts sorts descending by default).
(
    df_clean["Traffic Source"]
    .value_counts()
)


Traffic Source
Organic     783
Paid        423
Referral    298
Social      267
Direct      215
Name: count, dtype: int64

* Ver si la duración de la sesión cambia según la fuente
y distinguir entre lo que parece “mejor tráfico” y lo que solo es “más tráfico”.

In [4]:
# Session Duration summarised per traffic source: group size, mean and median,
# so that volume ("count") can be read separately from typical duration.
(
    df_clean
    .groupby("Traffic Source")["Session Duration"]
    .agg(["count", "mean", "median"])
)


Unnamed: 0_level_0,count,mean,median
Traffic Source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Direct,215,2.696773,1.887698
Organic,783,3.106891,2.089677
Paid,423,2.949216,1.768707
Referral,298,3.140413,1.988206
Social,267,3.072469,2.272383


In [5]:
# Session Duration summarised per distinct "Page Views" value (one row per value).
df_pv = (
    df_clean
    .groupby("Page Views")["Session Duration"]
    .agg(["count", "mean", "median"])
)
df_pv


Unnamed: 0_level_0,count,mean,median
Page Views,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,69,2.893222,2.179118
2,170,3.217268,1.993427
3,281,3.185502,2.06296
4,354,3.068284,2.094338
5,354,2.777708,1.900495
6,296,3.271741,2.163261
7,211,2.989747,2.154071
8,135,2.649997,1.754873
9,61,2.646809,1.969349
10,34,3.434991,1.881509


In [6]:
# Non-null entries in each summary column of df_pv; when no summary value is
# missing this equals the number of distinct "Page Views" groups.
df_pv.count(axis=0)

count     14
mean      14
median    14
dtype: int64

In [7]:
# Per-traffic-source summary: group size plus mean/median of duration,
# page views and conversion rate. Output columns appear in the order listed.
df_source = df_clean.groupby("Traffic Source").agg(
    sessions=pd.NamedAgg(column="Session Duration", aggfunc="count"),
    mean_duration=pd.NamedAgg(column="Session Duration", aggfunc="mean"),
    median_duration=pd.NamedAgg(column="Session Duration", aggfunc="median"),
    mean_pageviews=pd.NamedAgg(column="Page Views", aggfunc="mean"),
    median_pageviews=pd.NamedAgg(column="Page Views", aggfunc="median"),
    mean_conversion=pd.NamedAgg(column="Conversion Rate", aggfunc="mean"),
    median_conversion=pd.NamedAgg(column="Conversion Rate", aggfunc="median"),
)
df_source

Unnamed: 0_level_0,sessions,mean_duration,median_duration,mean_pageviews,median_pageviews,mean_conversion,median_conversion
Traffic Source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Direct,215,2.696773,1.887698,4.986047,5.0,0.97858,1.0
Organic,783,3.106891,2.089677,5.0447,5.0,0.982401,1.0
Paid,423,2.949216,1.768707,5.002364,5.0,0.980853,1.0
Referral,298,3.140413,1.988206,5.033557,5.0,0.987829,1.0
Social,267,3.072469,2.272383,4.730337,5.0,0.982833,1.0


In [8]:
# Narrow the per-source summary to the two conversion columns for a side-by-side look.
df_source.loc[:, ["mean_conversion", "median_conversion"]]


Unnamed: 0_level_0,mean_conversion,median_conversion
Traffic Source,Unnamed: 1_level_1,Unnamed: 2_level_1
Direct,0.97858,1.0
Organic,0.982401,1.0
Paid,0.980853,1.0
Referral,0.987829,1.0
Social,0.982833,1.0


## Previous Visits Observations

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) of "Previous Visits".
(
    df_clean["Previous Visits"]
    .describe()
)

count    1986.000000
mean        1.974824
std         1.433451
min         0.000000
25%         1.000000
50%         2.000000
75%         3.000000
max         9.000000
Name: Previous Visits, dtype: float64

In [10]:
# Frequency of each "Previous Visits" value, ordered by the value itself
# (sort_index) rather than by frequency.
(
    df_clean["Previous Visits"]
    .value_counts()
    .sort_index()
)

Previous Visits
0    282
1    536
2    558
3    337
4    160
5     76
6     24
7     10
8      2
9      1
Name: count, dtype: int64

In [11]:
# Segment rows by number of previous visits: 0, 1-3 and 4-9.
# pd.cut intervals are right-closed, so the first edge sits just below 0
# to make the value 0 land in the "0" bucket: (-0.1, 0], (0, 3], (3, 9].
df_seg = df_clean.assign(
    PrevVisitsGroup=pd.cut(
        df_clean["Previous Visits"],
        bins=[-0.1, 0, 3, 9],
        labels=["0", "1-3", "4-9"],
    )
)

# observed=True: only buckets that actually occur in the data become rows.
df_rec = (
    df_seg
    .groupby("PrevVisitsGroup", observed=True)
    .agg(
        sessions=("Session Duration", "count"),
        median_duration=("Session Duration", "median"),
        median_pageviews=("Page Views", "median"),
        mean_conversion=("Conversion Rate", "mean"),
    )
)
df_rec


Unnamed: 0_level_0,sessions,median_duration,median_pageviews,mean_conversion
PrevVisitsGroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,282,2.15542,5.0,0.96754
1-3,1431,1.983088,5.0,0.983895
4-9,273,1.96785,5.0,0.990861


## Do longer sessions convert better?

In [12]:
# Quartiles (25th, 50th, 75th percentiles) of "Session Duration".
(
    df_clean["Session Duration"]
    .quantile([0.25, 0.5, 0.75])
)


0.25    0.814566
0.50    2.006107
0.75    4.209668
Name: Session Duration, dtype: float64

In [16]:
# Bucket rows by "Session Duration" and compare conversion across buckets.
# Work on a copy so df_clean keeps its original columns.
df_dur = df_clean.copy()

# The interior edges 0.81 / 2.01 / 4.21 are the 25th / 50th / 75th percentiles
# of "Session Duration" (0.814566, 2.006107, 4.209668) rounded to two decimals;
# the last edge is the observed maximum so the largest value is included.
# include_lowest=True closes the first interval on the left: without it the
# buckets are right-closed only, a duration of exactly 0 falls outside
# (0, 0.81], becomes NaN and is silently dropped by the groupby below.
df_dur["DurationGroup"] = pd.cut(
    df_dur["Session Duration"],
    bins=[0, 0.81, 2.01, 4.21, df_dur["Session Duration"].max()],
    labels=["very_short", "short_mid", "mid_long", "long"],
    include_lowest=True,
)

# observed=True: only buckets that actually occur in the data become rows.
df_conv_by_dur = df_dur.groupby("DurationGroup", observed=True).agg(
    sessions=("Session Duration", "count"),
    median_duration=("Session Duration", "median"),
    mean_conversion=("Conversion Rate", "mean"),
    median_conversion=("Conversion Rate", "median"),
)
df_conv_by_dur


Unnamed: 0_level_0,sessions,median_duration,mean_conversion,median_conversion
DurationGroup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
very_short,491,0.392856,0.962108,1.0
short_mid,505,1.322762,0.977315,1.0
mid_long,493,2.863023,0.991325,1.0
long,497,6.340482,0.999281,1.0
