<a href="https://colab.research.google.com/github/gabrielborja/parc_de_montjuic/blob/main/survival_scripts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Survival EDA

## Importing libraries

In [None]:
# Updating libraries version
!pip install matplotlib --upgrade
!pip install plotly --upgrade

In [None]:
# Importing python libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from ipywidgets import interact

## Loading Data

In [None]:
# Loading data from local drive
# The result of files.upload() is kept in `uploaded1`; the next cell indexes it
# by file name to get the content of the csv.
from google.colab import files
uploaded1 = files.upload()

In [None]:
# Storing loaded data from csv to a pandas dataframe
import io

# Assumes `uploaded1` is a dict-like of {file name: file content}.
# The original export name is tried first; when the file was uploaded under another
# name, the first uploaded entry is used instead of failing with a KeyError.
expected_csv = 'Script_202209071534.csv'
if not uploaded1:
  raise ValueError('No file was uploaded: run the upload cell before this one')
csv_name = expected_csv if expected_csv in uploaded1 else next(iter(uploaded1))
df1 = pd.read_csv(io.BytesIO(uploaded1[csv_name]), sep='|', engine='python')

In [None]:
# Renaming columns, selected by their position in the dataframe
new_names_by_position = {3: 'SUBSCRIBER_ON_NET', 21: 'COUNTY_NAME'}
df1.rename(
    columns={df1.columns[pos]: name for pos, name in new_names_by_position.items()},
    inplace=True,
)

In [None]:
# Changing column names to lower case
df1.rename(columns=str.lower, inplace=True)

In [None]:
# Checking the dataframe info (run after the columns were renamed and lower-cased)
df1.info()

## Feature Engineering

In [None]:
# Filling NA with 0 in binary columns (column positions 1 to 9)
values_bi = dict.fromkeys(df1.columns[1:10], 0)
df1.fillna(value=values_bi, inplace=True)

In [None]:
# Filling NA with a placeholder label in categorical columns (selected by position)
cat_fill_by_position = {
    15: 'No data usage',
    19: 'No data usage',
    21: 'Unknown',
    22: 'No voice usage',
}
values_cat = {df1.columns[pos]: label for pos, label in cat_fill_by_position.items()}
df1.fillna(value=values_cat, inplace=True)

In [None]:
# Creating binary columns from numeric data: 1 when the source column is > 0, else 0
flag_source_position = {
    'has_fee': 10,
    'has_mb_to_l3m': 12,
    'has_topup_cu_mo': 13,
    'has_topup_pv_mo': 14,
}
df1 = df1.assign(**{
    flag: np.where(df1.iloc[:, pos] > 0, 1, 0)
    for flag, pos in flag_source_position.items()
})

In [None]:
# Changing geographic column to title case, keeping only its first 15 characters
county_raw = df1.iloc[:, 21]
df1 = df1.assign(county_name=county_raw.str.slice(stop=15).str.title())

In [None]:
# Slicing dataframe with selected columns, in this order of positions:
# 1-9, then 24-27, then 15-23
# (presumably 24-27 are the has_* flags created above - verify against df1.info())
selected_positions = np.concatenate([
    np.arange(1, 10),
    np.arange(24, 28),
    np.arange(15, 24),
])
df1_a = df1.iloc[:, selected_positions]

In [None]:
# Checking the cleaned dataframe info (df1_a holds only the columns selected in the previous cell)
df1_a.info()

## Exploring Data

In [None]:
# Exploring list of columns with value counts
@interact(Column_name=df1.columns[1:], Percentage=[True, False])
def explore_value_counts(Column_name, Percentage):
  """Return the value counts of one df1 column, missing values included.

  Column_name: column of df1 to count (offered by the widget, first column excluded).
  Percentage: passed as `normalize`, so True gives proportions instead of counts.
  """
  return df1.value_counts(
      subset=[Column_name],
      normalize=Percentage,
      dropna=False,
  )

In [None]:
# Exploring each column of the cleaned dataframe with describe()
# NOTE(review): this def reuses the name explore_value_counts from the previous
# cell, so it replaces that binding - consider a distinct name.
@interact(Column_name=df1_a.columns)
def explore_value_counts(Column_name):
  """Return the describe() summary of the selected df1_a column."""
  selected = df1_a[[Column_name]]
  return selected.describe()

In [None]:
# Exploring number of categories in each column (a missing value counts as a category)
for col_name in df1_a.columns:
  n_categories = df1_a[col_name].nunique(dropna=False)
  print(f'{col_name} => {n_categories}')

## Feature Importance

In [None]:
# Generating dummy variables from categorical columns (df1_a positions 13 to 20);
# every other column is passed through get_dummies unchanged
col_enc = list(df1_a.columns[13:21])
df1_b = pd.get_dummies(data=df1_a, columns=col_enc)
df1_b.head(3)

In [None]:
# Splitting the dataframe into train and test (half each, fixed seed)
from sklearn.model_selection import train_test_split
y = df1_b['churn']
X = df1_b.drop(columns='churn')
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.5,
    random_state=17,
)

In [None]:
# Generating a random forest model
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# The forest is seeded (same seed as the train/test split) so that the accuracy
# printed here and the feature importances read from rf1 later are the same on
# every run; an unseeded forest gives a different ranking each time.
rf1 = RandomForestClassifier(random_state=17)
rf1.fit(X_train, y_train)
y_predict = rf1.predict(X_test)
print(f'model accuracy : {round(accuracy_score(y_test, y_predict),4)}')

In [None]:
# Comparing actual and predicted labels, showing only the rows predicted as 1
test = pd.DataFrame({'y-test': y_test, 'y-pred': y_predict})
test.loc[test['y-pred'].eq(1)]

In [None]:
# Computing Feature Importance: one row per model input column, highest importance first
importance_frame = pd.DataFrame(
    rf1.feature_importances_,
    index=X_train.columns,
    columns=['importance'],
)
df1_fi = (
    importance_frame
    .reset_index()
    .rename(columns={'index': 'variables'})
    .sort_values('importance', ascending=False)
)

In [None]:
# Listing the columns of the cleaned dataframe again
# (presumably as a reference for the keyword list used in the next cell)
df1_a.info()

In [None]:
# Assigning categories to common features
# A feature gets the first keyword (in list order) found in its name, else 'other'
choice_list = ['commitment','on_net','volte','vowifi','fee','mb','topup','discount','lifetime','market_segment','phone','county','usage']
variable_names = df1_fi['variables']
cond_list = [variable_names.str.contains(keyword) for keyword in choice_list]
category_labels = np.select(cond_list, choice_list, 'other')
df1_fi = df1_fi.assign(category=category_labels).reset_index(drop=True)

In [None]:
# Aggregating common features: summed importance per category, largest first
df1_gr = (
    df1_fi
    .groupby(by=['category'])
    .agg(total_importance=('importance', 'sum'))
    .reset_index()
    .sort_values(by='total_importance', ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Checking the final ranking of features by main category
# (df1_gr was sorted by total_importance in descending order in the previous cell)
df1_gr

In [None]:
# Exploring feature importance
@interact(Category=df1_gr['category'].unique())
def explore_feature_importance(Category):
  """Show the features of one category with their share of that category's importance.

  Category: a value of df1_gr['category'] (picked from the widget).
  Returns the rows of df1_fi in that category plus a perc_total column:
  importance divided by the category's summed importance, rounded to 2 decimals.
  """
  # Exact match on purpose: str.contains treats its argument as a regex/substring,
  # so a category whose name is part of another one would pull in unrelated rows.
  in_category = df1_fi[df1_fi['category'] == Category]
  category_total = in_category['importance'].sum()
  return in_category.assign(perc_total=(in_category['importance'] / category_total).round(2))

## Exporting results to local drive

In [None]:
# Building the export file name: fixed prefix + current datetime (YYYYmmddHHMM)
from datetime import datetime
timestamp = datetime.now().strftime("%Y%m%d%H%M")
file_name = f'feature_import_{timestamp}'
file_name