In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
movies_data=pd.read_csv("Movie_modified.csv")
rating_data=pd.read_csv("rating_modified.csv")
df=pd.read_csv("subscription_modified.csv")
users=pd.read_csv("user_modified.csv")
watch_history=pd.read_csv("watch_history_modified.csv")

In [3]:
df.head(20)

Unnamed: 0,payment_id,user_id,amount,payment_date,plan_type,valid_until,churn_status,renewed,validity_days
0,T67951961023,15000,12.99,2024-11-01,Premium,2024-12-31,active,True,60
1,T68965430008,15000,12.99,2025-01-05,Premium,2025-03-06,active,True,60
2,T56296958896,15000,7.99,2025-03-08,Standard,2025-04-07,active,True,30
3,T77025305054,15001,7.99,2024-11-01,Standard,2024-12-01,active,True,30
4,T73938493845,15001,7.99,2024-12-04,Standard,2025-01-03,churned,False,30
5,T80595234029,15002,12.99,2024-11-01,Premium,2024-12-31,active,True,60
6,T73954774255,15002,12.99,2025-01-01,Premium,2025-03-02,active,True,60
7,T61837475327,15002,7.99,2025-03-04,Standard,2025-04-03,active,True,30
8,T71978840752,15003,12.99,2024-11-01,Premium,2024-12-31,active,True,60
9,T71946924698,15003,12.99,2025-01-01,Premium,2025-03-02,active,True,60


In [4]:
df.columns

Index(['payment_id', 'user_id', 'amount', 'payment_date', 'plan_type',
       'valid_until', 'churn_status', 'renewed', 'validity_days'],
      dtype='object')

In [5]:
df['renewed'].value_counts()

renewed
True     3483
False     805
Name: count, dtype: int64

In [6]:
import pandas as pd

df['payment_date'] = pd.to_datetime(df['payment_date'])

df['churn_status'] = df['churn_status'].str.lower().map({'churned': 1, 'active': 0})

In [7]:
df['future_date'] = df['payment_date'] + pd.DateOffset(years=5)

df.sort_values(by=['user_id', 'payment_date'], inplace=True)

df['churn_in_5_years'] = df.apply(lambda row: 
    int((df[
        (df['user_id'] == row['user_id']) &
        (df['payment_date'] > row['payment_date']) &
        (df['payment_date'] <= row['future_date'])
    ]['churn_status'].sum()) > 0), axis=1)


In [8]:
df['renewed'] = df['renewed'].astype(int)

In [9]:
df['payment_date'] = pd.to_datetime(df['payment_date'])
df['valid_until'] = pd.to_datetime(df['valid_until'])

df['plan_duration_days'] = (df['valid_until'] - df['payment_date']).dt.days

df = pd.get_dummies(df, columns=['plan_type'], drop_first=True)

feature_cols = ['amount', 'plan_duration_days', 'renewed'] + \
               [col for col in df.columns if col.startswith('plan_type_')]

X = df[feature_cols]
y = df['churn_in_5_years']

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.8438228438228438


In [11]:
def predict_churn_by_user_id(user_id):
    # Make sure user_id is int
    user_data = df[df['user_id'] == user_id].sort_values(by='payment_date', ascending=False).head(1)
    
    if user_data.empty:
        print(f"❌ User ID {user_id} not found.")
        print("\n✅ Available user_ids:")
        print(df['user_id'].unique())
        return

    row = user_data.iloc[0]

    # Prepare input
    input_data = dict.fromkeys(feature_cols, 0)
    input_data['amount'] = row['amount']
    input_data['plan_duration_days'] = (row['valid_until'] - row['payment_date']).days
    input_data['renewed'] = int(row['renewed'])

    # Handle plan_type one-hot
    plan_type_col = None
    for col in feature_cols:
        if col.startswith('plan_type_') and row.get(col, 0) == 1:
            input_data[col] = 1
            plan_type_col = col
            break

    user_df = pd.DataFrame([input_data])

    # Predict
    prediction = model.predict(user_df)[0]
    prob = model.predict_proba(user_df)[0][1]

    # Output
    print(f"\n🔍 Prediction for User ID {user_id}:")
    print("👉 Will CHURN in 5 years." if prediction == 1 else "✅ Will NOT churn in 5 years.")
    print(f"📊 Confidence: {prob:.2%}")
    if plan_type_col:
        print(f"📦 Plan Type: {plan_type_col.replace('plan_type_', '').title()}")


In [12]:
predict_churn_by_user_id(15005)


🔍 Prediction for User ID 15005:
✅ Will NOT churn in 5 years.
📊 Confidence: 24.90%
📦 Plan Type: Student Plan


In [13]:
df.head(20)

Unnamed: 0,payment_id,user_id,amount,payment_date,valid_until,churn_status,renewed,validity_days,future_date,churn_in_5_years,plan_duration_days,plan_type_Standard,plan_type_Student Plan
0,T67951961023,15000,12.99,2024-11-01,2024-12-31,0,1,60,2029-11-01,0,60,False,False
1,T68965430008,15000,12.99,2025-01-05,2025-03-06,0,1,60,2030-01-05,0,60,False,False
2,T56296958896,15000,7.99,2025-03-08,2025-04-07,0,1,30,2030-03-08,0,30,True,False
3,T77025305054,15001,7.99,2024-11-01,2024-12-01,0,1,30,2029-11-01,1,30,True,False
4,T73938493845,15001,7.99,2024-12-04,2025-01-03,1,0,30,2029-12-04,0,30,True,False
5,T80595234029,15002,12.99,2024-11-01,2024-12-31,0,1,60,2029-11-01,0,60,False,False
6,T73954774255,15002,12.99,2025-01-01,2025-03-02,0,1,60,2030-01-01,0,60,False,False
7,T61837475327,15002,7.99,2025-03-04,2025-04-03,0,1,30,2030-03-04,0,30,True,False
8,T71978840752,15003,12.99,2024-11-01,2024-12-31,0,1,60,2029-11-01,0,60,False,False
9,T71946924698,15003,12.99,2025-01-01,2025-03-02,0,1,60,2030-01-01,0,60,False,False
