**Dataset**\
Each row in the dataset is an anonymized real policy. A policy is considered churned if it ends less than 1 year after its start date.

**Problem**\
Develop a model that predicts, at the time of sale, the probability that a policy will churn.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.metrics import roc_auc_score


In [None]:
# Load the policy data from the current working directory; both date columns
# are parsed so they can be compared and offset later on.
# os.path.join builds the path with the platform's separator instead of
# hand-concatenating '/'.
df = pd.read_csv(
    os.path.join(os.getcwd(), 'churn_data.csv'),
    parse_dates=['start_date', 'end_date'],
)

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# (rows, columns) of the loaded data.
df.shape

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

In [None]:
# Summary statistics, used to spot implausible values before modelling.
df.describe()

In [None]:
# Box plot of years_of_experience to eyeball its spread and extreme values.
plt.boxplot(df['years_of_experience'])

In [None]:
# Keep only rows with a plausible years_of_experience (< 100). Rows where the
# value is missing are dropped as well, because NaN < 100 evaluates to False.
# .copy() makes the result an independent frame so that columns added later
# are not written to a slice of the original.
df = df[df['years_of_experience'] < 100].copy()

# Re-plot the already-filtered column. The previous version passed the column
# itself as the row indexer to .loc, which looks its values up as row labels
# rather than applying a boolean mask.
plt.boxplot(df['years_of_experience'])

In [None]:
# Latest policy start date in the data.
df['start_date'].max()

In [None]:
# Earliest policy start date in the data.
df['start_date'].min()

In [None]:
# Earliest policy end date in the data.
df['end_date'].min()

In [None]:
# Latest policy end date in the data.
df['end_date'].max()

In [None]:
# Current local calendar date, used as the reference point for spotting
# policies whose end date lies in the future.
today = datetime.date.today()

In [None]:
# End dates that lie after today. `today` is a datetime.date, which pandas
# does not allow in ordering comparisons with a datetime64 column, so it is
# converted to a Timestamp (midnight of that day) first.
df.loc[df['end_date'] > pd.Timestamp(today), 'end_date']

In [None]:
# First anniversary of each policy's start date.
one_year = pd.DateOffset(years=1)
df['start_plus_1'] = df['start_date'] + one_year

# Show the policies that ended after their first anniversary.
lasted_over_a_year = df['end_date'] > df['start_plus_1']
df[lasted_over_a_year]

In [None]:
# For each categorical column, print the frequency of every level followed by
# the number of distinct levels.
for col in ('channel', 'geo', 'profession_id', 'cancellation_reason'):
    level_counts = df[col].value_counts()
    print(level_counts)
    print(len(level_counts))

In [None]:
# Integer-encode the categorical columns into '<col>_transformed'. Missing
# values are replaced by the literal 'unknown' before encoding.
# A separate fitted encoder is kept for each column in `encoders`, so every
# mapping can later be inverted or applied to new rows. Previously a single
# encoder was refit on each column, which discarded all but the last mapping.
cols = ['channel', 'geo']
encoders = {}
for col in cols:
    le = preprocessing.LabelEncoder()
    df[f'{col}_transformed'] = le.fit_transform(df[col].fillna('unknown'))
    encoders[col] = le
df.head()

In [None]:
# Target: 1 when the policy ended before its first anniversary, else 0.
# Rows where the comparison is False (including a missing end_date) get 0.
ended_within_year = df['end_date'] < df['start_plus_1']
df['churned'] = ended_within_year.astype(int)
df.head()

In [None]:
# Class balance of the target.
df['churned'].value_counts()

In [None]:
# Share of the most frequent target class, i.e. the accuracy of always
# predicting that class. Computed from the data instead of hard-coded counts
# (presumably copied from an earlier value_counts() output), so it stays
# correct when the data or the filtering changes.
df['churned'].value_counts(normalize=True).max()

In [None]:
# Uniform random score in [0, 1) per row, used as a no-skill baseline for AUC.
# A seeded generator makes the baseline reproducible across runs; 17 matches
# the seed used for the train/test split.
rng = np.random.default_rng(17)
df['random'] = rng.random(df.shape[0])
df.head()

In [None]:
# Sanity check: the mean of the random baseline scores.
df['random'].mean()

In [None]:
# Feature matrix and target vector for the model.
feature_cols = [
    'years_of_experience',
    'channel_transformed',
    'geo_transformed',
    'profession_id',
]
X = df[feature_cols]
y = df['churned']

In [None]:
# Check the selected features for missing values before fitting.
X.isna().sum()

In [None]:
# Hold out 20% of the rows for evaluation; the split is seeded so it is
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=17,
)

In [None]:
# Baseline random forest with default hyperparameters. random_state is fixed
# (same seed as the train/test split) so the fitted model and the AUC values
# reported from it are reproducible.
rf = RandomForestClassifier(random_state=17)
rf.fit(X_train, y_train)


In [None]:
# Hard class predictions for every row of X (training rows included).
all_labels = rf.predict(X)
df['yhat'] = all_labels

# Held-out features and target side by side, plus the predicted probability
# taken from the second column of predict_proba. Unlike df['yhat'], this
# column holds probabilities rather than class labels.
test_probabilities = rf.predict_proba(X_test)
test_df = pd.concat([X_test, y_test], axis='columns')
test_df['yhat'] = test_probabilities[:, 1]


In [None]:
# Preview the held-out rows together with their predicted probabilities.
test_df.head()

In [None]:
# ROC AUC of the first model on the held-out set.
roc_auc_score(
    y_true=test_df['churned'],
    y_score=test_df['yhat'],
)

In [None]:
# ROC AUC of the random baseline scores, for comparison with the model.
roc_auc_score(
    y_true=df['churned'],
    y_score=df['random'],
)

In [None]:
# ROC AUC on the training rows, to compare against the held-out AUC.
train_probabilities = rf.predict_proba(X_train)
train_df = pd.concat([X_train, y_train], axis='columns')
train_df['yhat'] = train_probabilities[:, 1]
roc_auc_score(
    y_true=train_df['churned'],
    y_score=train_df['yhat'],
)

In [None]:
# Second random forest with tree depth capped at 5. random_state is fixed
# (same seed as the train/test split) so the fit and its AUC are reproducible.
rf2 = RandomForestClassifier(max_depth=5, random_state=17)
rf2.fit(X_train, y_train)


In [None]:
# Hard class predictions from the depth-limited model for every row of X
# (training rows included).
all_labels2 = rf2.predict(X)
df['yhat2'] = all_labels2

# Predicted probability (second column of predict_proba) on the held-out set.
test_probabilities2 = rf2.predict_proba(X_test)
test_df['yhat2'] = test_probabilities2[:, 1]


In [None]:
# ROC AUC of the depth-limited model on the held-out set.
roc_auc_score(
    y_true=test_df['churned'],
    y_score=test_df['yhat2'],
)