## Exploratory Data Analysis

### EDA for Client demographic data

#### Import necessary libraries

In [4]:
# Load necessary libraries
import pandas as pd
import numpy as np
import scipy as sc
from collections import defaultdict
from datetime import datetime
import time
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [8]:
# Load the data
demog = pd.read_csv('demog.csv')
demog.head(2)

Since our data is more or less clean, we will start exploring by dividing it into numerical and categorical variables. So, we will create one group for each type of variable.

In [None]:
# Drop unnamed: 0 column 
#demo = demog.drop('Unnamed: 0', axis = 'columns')
#demo.head(2)

In [None]:
demog.shape

In [None]:
# Extract column names with numerical datatype
demog.select_dtypes("number").columns

In [None]:
# Count the unique values of each numerical variable and sort them in descending order
demo_num = demog.select_dtypes("number").nunique().sort_values(ascending = False)
demo_num

Let's take `gendr` and do the exploration.

In [None]:
# Frequency table for gender
freq_tab = demog['gendr'].value_counts()
freq_tab

In [None]:
# One-column table holding the count of rows per gendr value,
# then each count expressed as a share of the column total (2 decimals).
prop_table = pd.crosstab(index=demog['gendr'], columns="count")
prop_table.div(prop_table.sum()).round(2)

Insight:

* The most common gender classifications are Male, Female, and Undisclosed, comprising approximately 34%, 34% and 32% respectively.

* The least frequent gender classification is Unspecified, with a count of only 3.

In [None]:
# Bar chart of the number of rows per gendr value.
# Labels and heights are both taken from freq_tab so that each bar is paired
# with its own category: freq_tab (value_counts) is ordered by count, whereas
# prop_table.index (crosstab) is ordered by label, so mixing the two would put
# the counts under the wrong labels.
ax = sns.barplot(x=freq_tab.index, y=freq_tab.values, hue=freq_tab.index)
ax.set(title="Clients by gender", xlabel="gendr", ylabel="count")
plt.show()

From the above chart we observe the same result as in the value counts: three of the gender groups (Female, Male and Undisclosed) cover almost the whole population.

Now let's move on to the numerical vs. categorical variables and answer questions like:
1. Who are the primary clients using this online process? From the above analysis we can say that, gender-wise, the primary clients are Female, Male and Undisclosed.
2. Are the primary clients younger or older, new or long-standing? From the graph below we can see that most of the clients are young or middle-aged. Most of them joined between 6 and 21 years ago, and the rest have been clients for between 36 and 62 years.

In [None]:
demo_num.head()

In [None]:
# How many rows share each clnt_tenure_yr value (value_counts orders the
# result from the most to the least frequent value).
tenure_counts = demog['clnt_tenure_yr'].value_counts()

# The 20 most frequent tenure values
top20_year_long = tenure_counts.head(20)

# The 20 least frequent tenure values (counts re-sorted in ascending order)
top20_year_new = tenure_counts.sort_values().head(20)

top20_year_long

In [None]:
# Assign every client to one of five age bands.
#
# The bands are contiguous: each upper bound is inclusive and the next band
# starts strictly above it, so a non-integer age such as 30.5 lands in '31-45'
# instead of falling into a gap between two bands.
age = demog['clnt_age']
conditions = [
    (age >= 14) & (age <= 30),
    (age > 30) & (age <= 45),
    (age > 45) & (age <= 60),
    (age > 60) & (age <= 75),
    (age > 75) & (age <= 96),
]

values = ['14-30','31-45','46-60','61-75','76-96']

# Rows that match no band (missing ages, or ages outside 14-96) get an explicit
# string label. np.select's implicit default is the integer 0, which does not
# share a dtype with the string labels.
demog['age_category'] = np.select(conditions, values, default='unknown')

In [None]:
demog.head()

In [None]:
pd.crosstab(demog['age_category'],demog['gendr']).plot.bar()
plt.show()

In [None]:
demog['num_accts'].value_counts().sort_values()

In [None]:
#Let's see gender by number of accounts.
pd.crosstab(demog['num_accts'],demog['gendr']).plot.bar()
plt.show()

In [None]:
demog['num_accts'].value_counts().plot.pie(startangle=90, colors=sns.color_palette("Set3"));
plt.show()

### EDA for Web_Data

In [None]:
web_data = pd.read_csv('web_data.csv')
web_data.head(2)

In [None]:
#web_data = web_data.drop('Unnamed: 0', axis = 'columns')
#web_data.head(2)

In [None]:
web_data['process_step'].value_counts()

In [None]:
web_data.shape

In [None]:
web_data.nunique()

Let's calculate the time difference.  

Now let us extract the rows for the `confirm` and `start` process steps and save each set to a CSV file.

In [None]:
# Keep only the web_data rows whose process_step is "start"
is_start_step = web_data["process_step"].eq("start")
pro_start = web_data.loc[is_start_step]
pro_start.shape

In [None]:
# Keep only the web_data rows whose process_step is "confirm"
is_confirm_step = web_data["process_step"].eq("confirm")
pro_confirm = web_data.loc[is_confirm_step]
pro_confirm.shape

In [None]:
# Now let's save both files in to csv format
pro_start.to_csv("pro_start.csv")
pro_confirm.to_csv("pro_confirm.csv")

### EDA for final experiment client data

In [None]:
final_ex = pd.read_csv('final_ex.csv')
final_ex.head()

In [None]:
# Drop the Unnamed: 0 column
#final_ex.head(2)

In [None]:
final_ex.nunique()

In [None]:
final_ex['variation'].value_counts()

Let's divide the dataset into two groups: Test and Control.

In [None]:
# Rows of final_ex assigned to the "Test" variation
in_test_variation = final_ex["variation"].eq("Test")
test_group = final_ex.loc[in_test_variation]
test_group.shape

In [None]:
# The datasets are merged later on, so relabel the two columns (by position)
# to make the variation column identifiable as the test group's.
test_group = test_group.set_axis(['client_id', 'variation_test'], axis='columns')
test_group.head(2)

In [None]:
# Rows of final_ex assigned to the "Control" variation
in_control_variation = final_ex["variation"].eq("Control")
control_group = final_ex.loc[in_control_variation]
control_group.shape

In [None]:
# Relabel the two columns (by position) so the variation column is
# identifiable as the control group's.
control_group = control_group.set_axis(['client_id', 'variation_control'], axis='columns')
control_group.head(2)

In [None]:
# Let's save them to csv file
test_group.to_csv("test_group.csv")
control_group.to_csv("control_group.csv")

Let's merge the data first for test group then control group

In [None]:
# Join demog with the test group on client_id (default inner join, so only
# client_ids present in both frames are kept)
merge_1 = demog.merge(test_group, on='client_id')

In [None]:
# Let us check our dataset
merge_1.head(2)

In [None]:
merge_1.shape

In [None]:
# Attach the merged test-group/demog columns to every web_data row with the
# same client_id (default inner join)
test_merge = web_data.merge(merge_1, on='client_id')

In [None]:
# Lets check the dataset
test_merge.head(2)

In [None]:
test_merge.shape

In [None]:
# Let's save the dataset of test_merge
test_merge.to_csv("test_merge.csv")

In [None]:
test_merge['process_step'].value_counts()

Let us move to control group

In [None]:
# Join demog with the control group on client_id (default inner join, so only
# client_ids present in both frames are kept)
merge_2 = demog.merge(control_group, on='client_id')

In [None]:
merge_2.head(2)

In [None]:
# Attach the merged control-group/demog columns to every web_data row with the
# same client_id (default inner join)
control_merge = web_data.merge(merge_2, on='client_id')

In [None]:
control_merge.head(2)

In [None]:
control_merge.shape

In [None]:
# Now save it in to csv file
control_merge.to_csv("control_merge.csv")

In [None]:
control_merge['process_step'].value_counts()

#### Let's sort the control group (control_merge) dataframe based on client_id, visit_id, Process_step, date, time.

In [None]:
# First let's list the columns name 
list(control_merge.columns.values.tolist())

In [None]:
# Sort keys in priority order; every key is sorted ascending (the default)
control_sort_keys = ['client_id', 'visit_id', 'process_step', 'date', 'time']
control_sorted = control_merge.sort_values(by=control_sort_keys)

In [None]:
control_sorted.head(2)

In [None]:
# Save control_sorted data
control_sorted.to_csv("control_sorted.csv")

#### Let's sort the test group (test_merge) dataframe based on client_id, visit_id, Process_step, date, time.

In [None]:
# Sort keys in priority order; every key is sorted ascending (the default)
test_sort_keys = ['client_id', 'visit_id', 'process_step', 'date', 'time']
test_sorted = test_merge.sort_values(by=test_sort_keys)

In [None]:
test_sorted.head(2)

In [None]:
# Save test sorted data
test_sorted.to_csv("test_sorted.csv")

In [None]:
# Store the unique client_ids as a flat list with one entry per client.
# (Appending the array returned by .unique() to an empty list would instead
# produce a list with a single element: the whole array.)
client_id_unique = control_sorted['client_id'].unique().tolist()

In [None]:
# Get the unique client_ids, in order of first appearance in control_sorted
unique_client_ids = control_sorted['client_id'].unique()

# Extract the rows of one client. Looping over every client_id and filtering
# the whole frame each time only ever keeps the rows of the last client
# (each pass overwrites the previous result), so select that client directly.
if len(unique_client_ids):
    client_id = unique_client_ids[-1]
    client_rows = control_sorted[control_sorted['client_id'] == client_id]

# Display the unique client_ids
unique_client_ids

In [None]:
def att_conf(unique_client_ids, events=None):
    """Print the attempt estimate and confirm count of each client.

    One line is printed per client, in the form
    ``client_id , num_attempt , num_confirm``.

    Parameters
    ----------
    unique_client_ids : iterable
        The client ids to report, in the order they should be printed.
    events : pandas.DataFrame, optional
        Frame with a ``client_id`` and a ``process_step`` column. When omitted,
        the notebook-level ``control_sorted`` frame is used.

    Returns
    -------
    None
    """
    if events is None:
        events = control_sorted

    # Count once for the whole frame instead of re-filtering it for every
    # client: rows per client, and 'confirm' rows per client.
    rows_per_client = events['client_id'].value_counts()
    is_confirm = events['process_step'] == 'confirm'
    confirms_per_client = events.loc[is_confirm, 'client_id'].value_counts()

    for client_id in unique_client_ids:
        # How many times the client_id appears in the frame (0 if absent)
        len_attempt = int(rows_per_client.get(client_id, 0))
        # Number of rows divided by 5, i.e. how many 5-row groups the client has;
        # this is a fraction when the row count is not a multiple of 5
        num_attempt = len_attempt / 5
        # How many confirm events there are for this client_id
        num_confirm = int(confirms_per_client.get(client_id, 0))
        print(client_id, ',', num_attempt, ',', num_confirm)

Now we take the number of confirm events and analyse it. For example, if it shows that there were no successes, we consider that the 9 attempts were unsuccessful and we add 2 False values to the 'SubmissionSuccessful' column. We do the same for each client_id.

This determines how many True or False values we should include in the submission_successful column. The number of confirm events tells us how many True events there are; to get the number of False events, we subtract the number of confirm events from the number of attempts. Then we collect the True and False values into the submission_successful list.

In [None]:
# Keep only client_id and date_time, then take every 5th row
# (positions 0, 5, 10, ...) of the sorted control frame.
# NOTE(review): this yields one row per attempt only if every attempt
# contributes exactly 5 consecutive rows - confirm against the data.
row_step = 5
df_error = control_sorted.loc[:, ['client_id', 'date_time']]
extracted_rows = df_error.iloc[::row_step]
extracted_rows.head()

In [None]:
# Let's import our unique client_id with their attempt and confirm event 
control_unique_id = pd.read_csv('cotrol_unique_client_id.csv')
control_unique_id.head()

In [None]:
control_unique_id.shape

In [None]:
control_unique_id['is_confirmation'].value_counts()

In [None]:
# Let's merge the dataset
df_err_rate = pd.merge(extracted_rows, control_unique_id, on = "client_id")
df_err_rate.head()

In [None]:
df_err_rate.shape

In [None]:
df_err_rate['is_confirmation'].value_counts()

In [None]:
# Submission counts: total, confirmed and errored
confirmed_submissions = 21610
error_submissions = 6487
total_submissions = 28097


def _share_in_percent(part, whole):
    """Return part/whole as a percentage rounded to the nearest integer."""
    return round(part / whole * 100)


error_rate = _share_in_percent(error_submissions, total_submissions)
confirmation_rate = _share_in_percent(confirmed_submissions, total_submissions)
print(f"{error_rate} || {confirmation_rate}")

Let us check and verify with the unique client_id.

In [None]:
# Submission counts for the check against the unique client_ids
total_submission = 23526
confirmed_submission = 15428
error_submission = 8098

# Compute both rates from the counts defined in this cell, so the figures
# above are the ones actually being verified.
error_rate = round((error_submission / total_submission) * 100)
confirmation_rate = round((confirmed_submission / total_submission) * 100)

In [None]:
print(error_rate, "||", confirmation_rate )