# Relax Data Science Challenge

## 1. Defining an "adopted user"

An "adopted user" is defined as a user who has logged into the product on three separate days in at least one seven-day period.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Read the user table (decoded as latin-1) and parse its two time columns.
# last_session_creation_time is parsed with unit="s", i.e. as seconds since the Unix epoch.
users_df = pd.read_csv(
    "/home/ubuntu/upload/takehome_users.csv",
    encoding="latin-1",
)
users_df = users_df.assign(
    creation_time=pd.to_datetime(users_df["creation_time"]),
    last_session_creation_time=pd.to_datetime(
        users_df["last_session_creation_time"], unit="s"
    ),
)

# Read the login log, parse its timestamps and order it by user, then by time,
# ready for the per-user adoption check.
engagement_df = (
    pd.read_csv("/home/ubuntu/upload/takehome_user_engagement.csv")
    .assign(time_stamp=lambda frame: pd.to_datetime(frame["time_stamp"]))
    .sort_values(["user_id", "time_stamp"])
)
def is_adopted_user(user_engagement):
    user_engagement = user_engagement.set_index("time_stamp").resample("D").count()
    for i in range(len(user_engagement) - 2):
        if user_engagement.iloc[i:i+7]["visited"].sum() >= 3:
            return 1
    return 0

# Label every user that has engagement records. The columns the labelling
# function reads are selected explicitly so the grouping key is not passed
# into the applied function (recent pandas deprecates doing so).
adopted_users = (
    engagement_df.groupby("user_id")[["time_stamp", "visited"]]
    .apply(is_adopted_user)
    .rename("adopted_user")
)

# Attach the label to users_df by looking up object_id in the per-user result.
# Unlike a merge, this assignment is safe to re-run: it overwrites the column
# instead of producing suffixed duplicates. Users with no engagement records
# are treated as not adopted, and the label is stored as an integer 0/1.
users_df["adopted_user"] = (
    users_df["object_id"].map(adopted_users).fillna(0).astype(int)
)

## 2. Data Exploration and Preprocessing

We will perform some initial data exploration and preprocess the data for modeling.

In [None]:
# Summarise both frames and the class balance of the target.
# Each heading starts with an escaped newline so sections are separated by a
# blank line; a literal line break inside the quotes is a syntax error.
print("\nUsers dataframe info:")
users_df.info()
print("\nUsers dataframe head:")
print(users_df.head())

print("\nEngagement dataframe info:")
engagement_df.info()
print("\nEngagement dataframe head:")
print(engagement_df.head())

print("\nAdopted user counts:")
print(users_df["adopted_user"].value_counts())

## 3. Feature Engineering

We will create new features that might be predictive of user adoption.

In [None]:
# Whole days between account creation and the last recorded session; missing
# values become 0.
# NOTE(review): this feature is derived from last-session activity, which may
# leak information about the target - confirm it is acceptable as a predictor.
session_gap = users_df["last_session_creation_time"] - users_df["creation_time"]
users_df["account_age"] = session_gap.dt.days.fillna(0)

# 1 when an inviting user id is present, 0 otherwise
users_df["was_invited"] = users_df["invited_by_user_id"].notna().astype("int64")

# Replace creation_source with one indicator column per category
users_df = pd.get_dummies(
    users_df,
    columns=["creation_source"],
    prefix="creation_source",
)

# Model inputs: user flags and engineered columns, followed by the creation-source indicators
target = "adopted_user"
profile_features = [
    "opted_in_to_mailing_list",
    "enabled_for_marketing_drip",
    "account_age",
    "was_invited",
]
creation_source_features = [
    "creation_source_GUEST_INVITE",
    "creation_source_ORG_INVITE",
    "creation_source_PERSONAL_PROJECTS",
    "creation_source_SIGNUP",
    "creation_source_SIGNUP_GOOGLE_AUTH",
]
features = profile_features + creation_source_features

X, y = users_df[features], users_df[target]

## 4. Model Training and Evaluation

We will train a RandomForestClassifier to predict user adoption and evaluate its performance.

In [None]:
# Hold out 20% of the users for testing; stratifying on y keeps the class
# proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit a random forest with balanced class weights and a fixed seed
clf = RandomForestClassifier(class_weight="balanced", random_state=42).fit(
    X_train, y_train
)

# Score the held-out users
y_pred = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

# Report overall accuracy, then per-class precision/recall/F1
print("Accuracy:", test_accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

## 5. Feature Importance

We will examine the importance of each feature in predicting user adoption.

In [None]:
# Pair each training column with the importance the fitted forest assigned to
# it, most important first
feature_importances = pd.DataFrame(
    {"importance": clf.feature_importances_},
    index=X_train.columns,
).sort_values("importance", ascending=False)

print("\nFeature Importances:")
print(feature_importances)

# Horizontal bar chart of the ranked importances, saved to disk and shown inline
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=feature_importances["importance"],
    y=feature_importances.index,
    ax=ax,
)
ax.set_title("Feature Importances for Adopted User Prediction")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
fig.tight_layout()
fig.savefig("/home/ubuntu/feature_importances.png")
plt.show()