# pd

## monthly active days

In [None]:
import pandas as pd

def monthly_active_days(df, user_col="user_col", timestamp_col="timestamp"):
    """Count the distinct active days of each user in each calendar month.

    Args:
        df: DataFrame with one row per event. It is copied, not modified.
        user_col: Name of the column that identifies the user.
        timestamp_col: Name of the column holding the event timestamps, in
            any form accepted by ``pd.to_datetime``.

    Returns:
        DataFrame with the columns ``[user_col, "month", "active_days"]``.
        ``month`` is a monthly ``Period`` and ``active_days`` is the number
        of distinct calendar dates on which the user has at least one row.

    Raises:
        KeyError: If ``user_col`` or ``timestamp_col`` is not a column of
            ``df``.
    """
    df = df.copy()

    # Parse the timestamps once and derive both helper columns from the same
    # Series. The month used to be read from a misspelled column name
    # ('timestampd'), which raised KeyError on every call.
    timestamps = pd.to_datetime(df[timestamp_col])
    df["date"] = timestamps.dt.date
    df["month"] = timestamps.dt.to_period("M")

    # Group by user and month, then count the unique dates in each group.
    out = (
        df.groupby([user_col, "month"])["date"]
        .nunique()
        .reset_index(name="active_days")
    )

    return out

## top 3 sold products

In [None]:
def top3_product(df):
    """Return the three highest-selling rows of every category.

    The rows of a copy of ``df`` are ordered by ``category`` ascending and
    ``sales`` descending, and a 1-based ``rank`` column records each row's
    position inside its category. Tied rows are not merged; they receive
    consecutive ranks in sorted order. Only rows with ``rank <= 3`` are
    returned, and the input frame itself gets no ``rank`` column.
    """
    ranked = df.copy().sort_values(
        by=["category", "sales"],
        ascending=[True, False],
    )

    # cumcount is 0-based within each group, so shift it to start at 1.
    position_in_category = ranked.groupby("category").cumcount()
    ranked["rank"] = position_in_category + 1

    within_top3 = ranked["rank"] <= 3
    return ranked[within_top3]


# statistics

## use IQR to check outliers
- Q1 = 25th percentile
- Q3 = 75th percentile

- IQR = Q3 - Q1
- lower bound = Q1 - 1.5*IQR
- upper bound = Q3 + 1.5*IQR

In [1]:
import pandas as pd

def iqr_outliers(df, col):
    """Flag the rows of ``df`` whose ``col`` value lies outside the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 0.25 and 0.75 quantiles of the column and ``IQR = Q3 - Q1``.
    Values exactly on a fence are not flagged.

    Returns:
        A tuple ``(outlier_rows, (lower, upper))``: the rows of ``df`` that
        fall strictly outside the fences, and the two fence values.
    """
    values = df[col]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    fence_low = first_quartile - 1.5 * spread
    fence_high = third_quartile + 1.5 * spread

    is_outlier = (values < fence_low) | (values > fence_high)
    return df[is_outlier], (fence_low, fence_high)

# demo: eight values between 10 and 18 plus one extreme value (100)
data = pd.DataFrame({"value": [10, 12, 12, 13, 14, 15, 16, 18, 100]})
# iqr_outliers returns the flagged rows and the (lower, upper) bounds
outliers, bounds = iqr_outliers(data, "value")
print("bounds:", bounds)
print(outliers)

bounds: (np.float64(6.0), np.float64(22.0))
   value
8    100


## correlation & significance level
Calculate the correlation between two variables and determine whether it is statistically significant.

In [None]:
from scipy import stats

def corr_analysis(x, y, alpha=0.05):
    """Compute the Pearson correlation of ``x`` and ``y`` and test it.

    Args:
        x: First sequence of numeric observations.
        y: Second sequence of numeric observations, the same length as ``x``.
        alpha: Significance level that the p-value is compared against.

    Returns:
        dict with the keys:
            ``correlation``: Pearson correlation coefficient.
            ``p_value``: p-value reported by ``stats.pearsonr``.
            ``is_significant``: ``True`` when ``p_value < alpha``.
            ``interpretation``: ``'significant'`` or ``'not significant'``.
    """
    # Pearson correlation coefficient and its p-value.
    corr_coef, p_value = stats.pearsonr(x, y)

    # The result is significant when the p-value is below alpha. Multiplying
    # the two (the previous code) is truthy for every non-zero p-value, so
    # almost everything was labelled significant. bool() yields a plain
    # Python bool; a NaN p-value compares False and is reported as not
    # significant.
    is_significant = bool(p_value < alpha)

    return {
        'correlation': corr_coef,
        'p_value': p_value,
        'is_significant': is_significant,
        'interpretation': 'significant' if is_significant else 'not significant'
    }

# ml

## lr

In [None]:
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_squared_error, r2_score 
from sklearn.model_selection import train_test_split

def build_linear_model(X, y, test_size=0.2, random_state=42):
    """Fit a linear regression on a train split and score it on a test split.

    Args:
        X: Feature matrix passed to ``train_test_split``.
        y: Target values aligned with ``X``.
        test_size: Passed to ``train_test_split`` as ``test_size``; the
            default 0.2 matches the previously hard-coded value.
        random_state: Seed passed to ``train_test_split``; the default 42
            matches the previously hard-coded value.

    Returns:
        dict with the keys:
            ``model``: the fitted ``LinearRegression`` instance.
            ``mse``: mean squared error on the test split.
            ``r2``: R^2 score on the test split.
            ``predictions``: model predictions for the test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Both metrics are computed on the held-out split only.
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    return {
        'model': model,
        'mse': mse,
        'r2': r2,
        'predictions': y_pred
    }