<a href="https://colab.research.google.com/github/gabrielborja/parc_de_montjuic/blob/main/survival_scripts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Survival EDA

## Importing libraries

In [None]:
# Updating libraries version
!pip install matplotlib --upgrade
!pip install plotly --upgrade

In [None]:
# Importing python libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#import plotly.express as px
from ipywidgets import interact

## Loading Data

In [None]:
# Loading data from local drive
# The object returned by files.upload() is indexed by file name in the next cell,
# and the value found there is wrapped in io.BytesIO to be parsed as csv.
from google.colab import files
uploaded1 = files.upload()

In [None]:
# Parsing the uploaded pipe-separated csv into a pandas dataframe
import io
csv_name = 'Script_202209130729.csv'  # must match the name of the uploaded file
csv_buffer = io.BytesIO(uploaded1[csv_name])
df1 = pd.read_csv(csv_buffer, sep='|', engine='python')

In [None]:
# Changing column names to lower case
df1 = df1.rename(columns=str.lower)

In [None]:
# Truncating the geographic column to its first 15 characters and changing it to title case
county_short = df1['county_name'].str.slice(stop=15)
df1 = df1.assign(county_name=county_short.str.title())

In [None]:
# Checking the dataframe info
# (run after the column renaming and county_name clean-up above)
df1.info()

## Feature Engineering

In [None]:
# Slicing dataframe according to dtype
def slicing_by_dtypes(df=None, max_binary_levels=3):
  ''' Creating subsets of a dataframe to separate between binary, float and categorical columns

  df: dataframe to split; defaults to the notebook-level df1 so existing calls keep working
  max_binary_levels: a float64 column with more distinct values than this (NaN counts
    as a value) goes to the numeric subset, otherwise to the binary subset

  Returns tuple with 3 dataframes split in the following order: binary, numeric and object.
  Each one is a copy. The numeric subset is an empty dataframe (same index) when no
  float64 column exceeds the threshold. '''
  source = df1 if df is None else df
  df_float = source.select_dtypes(include=['float64'])
  # Classifying every float64 column once, instead of dropping columns from a frame
  # inside a comprehension that is only run for its side effects
  numeric_cols = [col for col in df_float.columns
                  if len(df_float.loc[:,col].unique()) > max_binary_levels]
  numeric_set = set(numeric_cols)
  binary_cols = [col for col in df_float.columns if col not in numeric_set]
  # Creating subset of binary columns
  df_bin = df_float.loc[:, binary_cols].copy()
  # Creating subset of numeric columns (selecting by label also works for an empty list,
  # whereas pd.concat on an empty list raises)
  df_f64 = df_float.loc[:, numeric_cols].copy()
  # Creating subset of object columns
  df_obj = source.select_dtypes(include=['object']).copy()
  return (df_bin, df_f64, df_obj)

# Unpacking the returned tuple: binary (df1_bi), numeric (df1_nu) and object (df1_ob) subsets
df1_bi, df1_nu, df1_ob = slicing_by_dtypes()

In [None]:
# Checking sliced dataframes info
# (only the binary subset is inspected in this cell)
df1_bi.info()

In [None]:
# Replacing every missing value of the binary subset with 0
df1_bi = df1_bi.fillna(0)

In [None]:
# Deriving one has_<column> flag per numeric column: 1 when the value is > 0, else 0
# (a missing value compares as not greater than 0, so it yields 0)
has_flags = {f'has_{col}': np.where(df1_nu[col] > 0, 1, 0) for col in df1_nu.columns}
df1_bi = df1_bi.assign(**has_flags)

In [None]:
# Replacing every missing value of the categorical subset with the 'Unknown' label
df1_ob = df1_ob.fillna('Unknown')

In [None]:
# Checking the numeric subset info
df1_nu.info()

In [None]:
# Concatenating object and binary dataframes together with the two churn columns of df1
churn_targets = df1.loc[:, ['churn', 'customer_churn']]
df1_a = pd.concat([df1_ob, df1_bi, churn_targets], axis=1)

In [None]:
# Checking the cleaned dataframe info
# (object columns, binary columns including the has_* flags, churn and customer_churn)
df1_a.info()

## Exploring Data

In [None]:
# Exploring list of columns with value counts
@interact(Column_name=df1_a.columns[1:], Percentage=[True, False])
def explore_value_counts(Column_name, Percentage):
  ''' Count the distinct values (NaN included) of one df1_a column.

  Column_name: column to summarise (the first column of df1_a is not offered)
  Percentage: when True the counts are returned as relative frequencies '''
  return df1_a.value_counts(subset=[Column_name], normalize=Percentage, dropna=False)

In [None]:
# Describing each column
# NOTE(review): this reuses the name explore_value_counts and so shadows the
# value-counts explorer defined in the previous cell; consider a distinct name.
@interact(Column_name=df1_a.columns)
def explore_value_counts(Column_name):
  ''' Return the describe() summary of the selected df1_a column. '''
  selected = df1_a[[Column_name]]
  return selected.describe()

In [None]:
# Printing the number of distinct values (NaN included) found in each column of df1_a
for column in df1_a.columns:
  n_categories = len(df1_a[column].unique())
  print(f'{column} => {n_categories}')

## Feature Importance

In [None]:
# One-hot encoding every categorical (object) column of the cleaned dataframe
col_enc = list(df1_ob.columns)
df1_b = pd.get_dummies(df1_a, columns=col_enc)

In [None]:
# Checking the one-hot encoded dataframe info
df1_b.info()

In [None]:
# Splitting the dataframe into train and test
from sklearn.model_selection import train_test_split
# 'churn' is the target; both churn columns are removed from the predictors
y = df1_b['churn']
X = df1_b.drop(columns=['churn', 'customer_churn'])
# NOTE(review): test_size=0.7 holds out 70% of the rows and trains on 30% —
# confirm this is intended and not a swapped train/test proportion.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.7, random_state=17)

In [None]:
# Generating a random forest model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Fixing random_state makes the fitted forest, and therefore the accuracy printed here
# and the feature importances read from rf1 later, reproducible across runs.
# The seed is the same value passed to train_test_split.
rf1 = RandomForestClassifier(random_state=17)
rf1.fit(X_train, y_train)
y_predict = rf1.predict(X_test)
print(f'model accuracy : {round(accuracy_score(y_test, y_predict),4)}')

In [None]:
# Showing actual vs. predicted labels for the rows predicted as 1
test = pd.DataFrame({'y-test': y_test, 'y-pred':y_predict})
predicted_positive = test['y-pred'].eq(1)
test.loc[predicted_positive]

In [None]:
# Building a two-column table (variables, importance) from the fitted forest,
# sorted from the most to the least important variable
importance = pd.DataFrame({'importance': rf1.feature_importances_}, index=X_train.columns)
df1_fi = importance.reset_index()
df1_fi = df1_fi.rename(columns={'index':'variables'})
df1_fi = df1_fi.sort_values('importance', ascending=False)

In [None]:
# Tagging each variable with the first keyword its name contains, 'other' when none matches
choice_list = ['commitment','on_net','volte','vowifi','fee','mb','topup','discount','lifetime','market_segment','phone','county','usage']
variable_names = df1_fi['variables']
cond_list = [variable_names.str.contains(keyword) for keyword in choice_list]
# np.select keeps the choice of the first condition that is True for each row
feature_category = np.select(cond_list, choice_list, 'other')
df1_fi = df1_fi.assign(category=feature_category).reset_index(drop=True)

In [None]:
# Summing the importance of all variables in each category, largest total first
df1_gr = (
    df1_fi
    .groupby(by=['category'])
    .agg(total_importance=('importance', 'sum'))
    .reset_index()
    .sort_values(by='total_importance', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Checking the final ranking of features by main category
# (one row per category, sorted by total_importance in descending order)
df1_gr

In [None]:
# Exploring feature importance
@interact(Category=df1_gr['category'].unique())
def explore_feature_importance(Category):
  ''' List the features of one category with their share of that category's importance.

  Category: one of the values of df1_fi['category']
  Returns the matching rows of df1_fi plus perc_total: each importance divided by the
  summed importance of the selected rows, rounded to 2 decimals. '''
  # Exact comparison: str.contains() would read Category as a regular expression and
  # would also select any other category that merely contains it as a substring
  df = df1_fi[df1_fi['category'] == Category]
  df = df.assign(perc_total = (df['importance'] / df['importance'].sum()).round(2))
  return df

## Exporting results to local drive

In [None]:
# Building the export file name: fixed prefix plus the current time as YYYYmmddHHMM
from datetime import datetime
file_name = f'feature_import_{datetime.now():%Y%m%d%H%M}'
file_name

In [None]:
# Writing the cleaned dataframe to an excel workbook and downloading it from Colab
# NOTE(review): the file name starts with 'feature_import_' but the sheet holds df1_a,
# not the feature-importance tables (df1_fi / df1_gr) — confirm which one is wanted.
xlsx_name = f'{file_name}.xlsx'
with pd.ExcelWriter(xlsx_name, engine='openpyxl') as writer:
  df1_a.to_excel(writer, sheet_name='Sheet 1', index=False)
files.download(xlsx_name)

## Visualizing Data

In [None]:
# Comparing feature importance in two subplots
#@interact(Category=df1_a.columns[1:-1])
def plot_subplots(Category):
  ''' Function for comparing two subplots

  Only the rows of df1_a with churn == 1 are used. The left subplot counts the
  Category values of rows with customer_churn == 1 ('Customer churn'), the right one
  those with customer_churn == 0 ('Turnover churn'). Both titles show the period of
  the first selected row.

  Category: name of the df1_a column plotted on the x axis
  Returns None; prints a message instead of plotting when no row has churn == 1 '''
  base_cols = ['period', 'churn', 'customer_churn']
  # Selecting a label twice would duplicate the column and break df['period'] below
  cols = base_cols if Category in base_cols else base_cols + [Category]
  df = df1_a.loc[df1_a['churn']==1, cols].reset_index(drop=True)
  if df.empty:
    # Without rows there is nothing to count and no first period to put in the titles
    print(f'No churned rows available for {Category}')
    return
  period = df['period'].iloc[0]
  fig_x, axes = plt.subplots(1, 2, figsize=(15.5, 6.5))
  fig_x.suptitle(f'{Category}')
  panels = [(1, 'Customer'), (0, 'Turnover')]
  for ax, (flag, label) in zip(axes, panels):
    ax.set_title(f'{Category} vs. {label} churn - {period}')
    sns.countplot(data=df[df['customer_churn']==flag], x=Category, ax=ax)
    # Writing the count on top of every bar
    for container in ax.containers:
      ax.bar_label(container, fontsize=12)
  plt.show()

In [None]:
# Exploring feature importance in same graph
@interact(Category=df1_a.columns[1:-2])
def plot_feature(Category):
  ''' Function for comparing total churn vs. customer churn in same graph

  Only the rows of df1_a with churn == 1 are used. Each row is labelled 'Customer'
  when customer_churn == 1 and 'Turnover' otherwise, and the counts per Category
  value are drawn as horizontal bars split by that label. The title shows the period
  of the first selected row.

  Category: name of the df1_a column plotted on the y axis
  Returns None; prints a message instead of plotting when no row has churn == 1 '''
  base_cols = ['period', 'churn', 'customer_churn']
  # Selecting a label twice would duplicate the column and break df['period'] below
  cols = base_cols if Category in base_cols else base_cols + [Category]
  df = df1_a.loc[df1_a['churn']==1, cols].reset_index(drop=True)
  if df.empty:
    # Without rows there is nothing to count and no first period to put in the title
    print(f'No churned rows available for {Category}')
    return
  df = df.assign(churn_type = np.where(df['customer_churn']==1,'Customer','Turnover'))
  # Fixed, sorted order of the bars so the layout does not depend on row order
  data_order = sorted(df[Category].unique().tolist())
  fig_x, axes = plt.subplots(1, 1, figsize=(7.5, 7.5))
  axes.set_title(f'{Category}: Customer vs. Turnover churn - {df["period"].iloc[0]}')
  sns.countplot(data=df, y=Category, hue='churn_type', order=data_order, ax=axes)
  # Writing the count next to every bar
  for container in axes.containers:
    axes.bar_label(container, fontsize=12)
  plt.show()

## Terminal

In [None]:
# Lower-casing terminal names so the pattern matching below is case-insensitive
df1_a = df1_a.assign(terminal_name=lambda frame: frame['terminal_name'].str.lower())

In [None]:
# Patterns 'iphone 4' .. 'iphone 8': one boolean mask per pattern over terminal_name,
# and the same strings reused as the labels to assign
check_list = [f'iphone {model}' for model in range(4, 9)]
terminal_names = df1_a['terminal_name']
cond_list = [terminal_names.str.contains(pattern) for pattern in check_list]
choice_list = list(check_list)

In [None]:
# Counting how many rows match / do not match the first pattern ('iphone 4')
cond_list[0].value_counts()

In [None]:
# Labelling each row with the first iphone pattern its terminal name matches, 'other' otherwise
terminal_type = np.select(cond_list, choice_list, 'other')
df1_a = df1_a.assign(terminal_type=terminal_type).reset_index(drop=True)

In [None]:
# Listing the distinct terminal names that contain 'iphone', in sorted order
iphone_names = [name for name in df1_a['terminal_name'].unique() if 'iphone' in name]
sorted(iphone_names)

In [None]:
# Counting the rows assigned to each terminal type
df1_a['terminal_type'].value_counts()