In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
# Silence every warning category for the rest of the session. This keeps the
# cell outputs clean, but it also hides deprecation notices raised by the
# plotting/statistics libraries used in the cells that follow.
warnings.filterwarnings('ignore')

# Trial Mailing Campaign

## Import CSV to Google Colab

In [None]:
# Read the trial-campaign results into a DataFrame and preview the first rows.
csv_path = '../input/trialPromoResults.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Raw header names exactly as they were read from the CSV.
df.columns.to_numpy()

In [None]:
# Replace the raw CSV headers (which carry stray whitespace, e.g. ' sex') with
# clean names. The assignment is positional, so the list must match the file's
# 11 columns in order.
df.columns = ['index', 'sex', 'mstatus', 'age', 'children', 'occupation', 'education', 'income', 'avbal', 'avtrans', 'decision']

# pandas >= 2.0 parses the literal text "None" as a missing value in read_csv,
# which would silently empty every `decision == 'None'` selection made later in
# this notebook. Restore the label so that group is kept. The call is a no-op
# when the column has no missing values, and it is safe to re-run.
# NOTE(review): genuinely blank decisions would also be labelled 'None' here -
# confirm the file has no such rows.
df['decision'] = df['decision'].fillna('None')

# Data Exploration

## Target variable exploration - Decision

In [None]:
# Number of rows per campaign decision.
decision_counts = df['decision'].value_counts()
decision_counts

In [None]:
# Bar chart of the target distribution.
# Colours are looked up per label rather than assigned by bar position, so each
# decision keeps the colour used in the histograms further down (None = gray,
# A = blue, B = orange) regardless of how value_counts() orders the classes.
decision_colors = {'None': 'tab:gray', 'A': 'tab:blue', 'B': 'tab:orange'}
decision_counts = df['decision'].value_counts()

f, ax = plt.subplots(figsize=(7, 3))
decision_counts.plot.barh(
    ax=ax,
    # Any unexpected label gets a distinct fallback colour instead of failing.
    color=[decision_colors.get(label, 'tab:purple') for label in decision_counts.index],
)
ax.invert_yaxis()  # first value_counts() entry at the top
ax.set_title('Decision', fontsize=15)
ax.set_xlabel('Count')
plt.show()

## Input variable exploration

### Sex

In [None]:
# Number of rows per sex value.
sex_counts = df['sex'].value_counts()
sex_counts

In [None]:
# Horizontal bar chart of the row count for each sex value.
sex_counts = df['sex'].value_counts()

f, ax = plt.subplots(figsize=(7, 2))
sex_counts.plot.barh(color=['tab:pink', 'tab:brown'], ax=ax)
ax.set_xlabel('Count')
ax.set_title('Gender', fontsize=15)
ax.invert_yaxis()  # first value_counts() entry at the top
plt.show()

In [None]:
# Decision split within each sex.
# hue_order plus a label -> colour mapping pins both the legend order and the
# colours, so 'None' / 'A' / 'B' match the histograms (gray / blue / orange)
# instead of depending on seaborn's default level order.
f, ax = plt.subplots(figsize=(7, 3))
sns.countplot(
    y="sex",
    hue='decision',
    hue_order=['None', 'A', 'B'],
    palette={'None': 'tab:gray', 'A': 'tab:blue', 'B': 'tab:orange'},
    data=df,
    ax=ax,  # draw on the axes created above rather than the implicit current one
)
ax.set_title('Decision by sex')
plt.show()

### Marital Status

In [None]:
# Marital-status counts restricted to rows where sex is 'M'.
is_male = df['sex'] == 'M'
df.loc[is_male, 'mstatus'].value_counts()

In [None]:
# Marital-status counts restricted to rows where sex is 'F'.
is_female = df['sex'] == 'F'
df.loc[is_female, 'mstatus'].value_counts()

In [None]:
# Decision split within each marital status (all customers).
# Legend order and colours are pinned per label so they match the other charts
# (None = gray, A = blue, B = orange) rather than following seaborn's default
# level order.
f, ax = plt.subplots(figsize=(7, 5))
sns.countplot(
    y="mstatus",
    hue='decision',
    hue_order=['None', 'A', 'B'],
    palette={'None': 'tab:gray', 'A': 'tab:blue', 'B': 'tab:orange'},
    data=df,
    ax=ax,  # draw on the axes created above rather than the implicit current one
)
ax.set_title('Decision by marital status')
plt.show()

In [None]:
# Male Marital Status and investment preference
# Legend order and colours are pinned per label (None = gray, A = blue,
# B = orange) so this chart can be compared directly with the female one;
# a positional palette would follow the label order of this subset.
f, ax = plt.subplots(figsize=(7, 5))
sns.countplot(
    y="mstatus",
    hue='decision',
    hue_order=['None', 'A', 'B'],
    palette={'None': 'tab:gray', 'A': 'tab:blue', 'B': 'tab:orange'},
    data=df.loc[df['sex'] == 'M'],
    ax=ax,  # draw on the axes created above rather than the implicit current one
)
ax.set_title('Decision by marital status (male)')
plt.show()

In [None]:
# Female Marital Status and investment preference
# Legend order and colours are pinned per label (None = gray, A = blue,
# B = orange) so this chart can be compared directly with the male one;
# previously this cell used a different positional palette from its male
# counterpart.
f, ax = plt.subplots(figsize=(7, 5))
sns.countplot(
    y="mstatus",
    hue='decision',
    hue_order=['None', 'A', 'B'],
    palette={'None': 'tab:gray', 'A': 'tab:blue', 'B': 'tab:orange'},
    data=df.loc[df['sex'] == 'F'],
    ax=ax,  # draw on the axes created above rather than the implicit current one
)
ax.set_title('Decision by marital status (female)')
plt.show()

### Age

In [None]:
# Summary statistics for the age column.
age_summary = df['age'].describe()
age_summary

In [None]:
# Age distribution per decision group.
# All groups share one set of bins and are drawn semi-transparent; with opaque
# bars each later histogram hid the bars of the groups drawn before it.
bins = np.linspace(15, 100, 50)  # fixed range: ages outside 15-100 are not drawn

f, ax = plt.subplots()
for label, color in [('None', 'tab:gray'), ('A', 'tab:blue'), ('B', 'tab:orange')]:
    ax.hist(df.loc[df['decision'] == label, 'age'], bins, label=label, color=color, alpha=0.6)

ax.set_title('Age by decision')
ax.set_xlabel('age')
ax.set_ylabel('Count')
ax.legend(loc='upper right')
plt.show()

### Children

In [None]:
# Number of rows per value of the children column.
children_counts = df['children'].value_counts()
children_counts

In [None]:
# Decision split for each value of the children column.
# Legend order and colours are pinned per label so they match the other charts
# (None = gray, A = blue, B = orange) rather than following seaborn's default
# level order.
f, ax = plt.subplots(figsize=(7, 5))
sns.countplot(
    y="children",
    hue='decision',
    hue_order=['None', 'A', 'B'],
    palette={'None': 'tab:gray', 'A': 'tab:blue', 'B': 'tab:orange'},
    data=df,
    ax=ax,  # draw on the axes created above rather than the implicit current one
)
ax.set_title('Decision by children')
plt.show()

### Occupation

In [None]:
# Number of rows per occupation.
occupation_counts = df['occupation'].value_counts()
occupation_counts

In [None]:
# Decision split within each occupation.
# Legend order and colours are pinned per label so they match the other charts
# (None = gray, A = blue, B = orange) rather than following seaborn's default
# level order.
f, ax = plt.subplots(figsize=(7, 10))
sns.countplot(
    y="occupation",
    hue='decision',
    hue_order=['None', 'A', 'B'],
    palette={'None': 'tab:gray', 'A': 'tab:blue', 'B': 'tab:orange'},
    data=df,
    ax=ax,  # draw on the axes created above rather than the implicit current one
)
ax.set_title('Decision by occupation')
plt.show()

### Education

In [None]:
# Number of rows per education level.
education_counts = df['education'].value_counts()
education_counts

In [None]:
# Decision split within each education level.
# Legend order and colours are pinned per label so they match the other charts
# (None = gray, A = blue, B = orange) rather than following seaborn's default
# level order.
f, ax = plt.subplots(figsize=(7, 5))
sns.countplot(
    y="education",
    hue='decision',
    hue_order=['None', 'A', 'B'],
    palette={'None': 'tab:gray', 'A': 'tab:blue', 'B': 'tab:orange'},
    data=df,
    ax=ax,  # draw on the axes created above rather than the implicit current one
)
ax.set_title('Decision by education')
plt.show()

### Income

In [None]:
# Summary statistics for the income column.
income_summary = df['income'].describe()
income_summary

In [None]:
# Income distribution per decision group.
# Bin edges span the observed range, so no value is silently left out of the
# plot, and the groups are drawn semi-transparent so a later histogram does not
# hide the ones drawn before it.
bins = np.linspace(df['income'].min(), df['income'].max(), 50)

f, ax = plt.subplots()
for label, color in [('None', 'tab:gray'), ('A', 'tab:blue'), ('B', 'tab:orange')]:
    ax.hist(df.loc[df['decision'] == label, 'income'], bins, label=label, color=color, alpha=0.6)

ax.set_title('Income by decision')
ax.set_xlabel('income')
ax.set_ylabel('Count')
ax.legend(loc='upper right')
plt.show()

In [None]:
# Skewness of income plus two normality diagnostics drawn side by side:
# a density histogram with a KDE and a fitted normal curve, and a normal
# probability (Q-Q) plot. Built on matplotlib/scipy directly because
# seaborn's distplot is deprecated.
income = df['income'].dropna()  # the fit and the Q-Q plot need NaN-free input

#skewness
print("Skewness: %f" % income.skew())

#histogram and normal probability plot
fig, (ax_dist, ax_qq) = plt.subplots(1, 2, figsize=(12, 4))
grid = np.linspace(income.min(), income.max(), 200)
ax_dist.hist(income, bins='auto', density=True, alpha=0.4)
ax_dist.plot(grid, stats.gaussian_kde(income)(grid), label='KDE')
ax_dist.plot(grid, norm.pdf(grid, *norm.fit(income)), color='black', label='normal fit')
ax_dist.set_xlabel('income')
ax_dist.set_ylabel('Density')
ax_dist.legend()
res = stats.probplot(income, plot=ax_qq)
plt.show()

In [None]:
# Same diagnostics as for raw income, applied to log(income): skewness, a
# density histogram with KDE and fitted normal curve, and a normal Q-Q plot.
# Built on matplotlib/scipy directly because seaborn's distplot is deprecated.
income = df['income'].dropna()

# log() is undefined for zero or negative values (it yields -inf / NaN, which
# breaks the skewness and the fit), so such rows are excluded and reported.
positive_income = income[income > 0]
n_dropped = len(income) - len(positive_income)
if n_dropped:
    print("Excluded %d non-positive income values before the log-transform" % n_dropped)

#apply log-transform on the 'income' variable
log_income = np.log(positive_income)

#skewness
print("Skewness: %f" % log_income.skew())

#histogram and normal probability plot
fig, (ax_dist, ax_qq) = plt.subplots(1, 2, figsize=(12, 4))
grid = np.linspace(log_income.min(), log_income.max(), 200)
ax_dist.hist(log_income, bins='auto', density=True, alpha=0.4)
ax_dist.plot(grid, stats.gaussian_kde(log_income)(grid), label='KDE')
ax_dist.plot(grid, norm.pdf(grid, *norm.fit(log_income)), color='black', label='normal fit')
ax_dist.set_xlabel('log(income)')
ax_dist.set_ylabel('Density')
ax_dist.legend()
res = stats.probplot(log_income, plot=ax_qq)
plt.show()

### Average Monthly Balance

In [None]:
# Summary statistics for the avbal column.
avbal_summary = df['avbal'].describe()
avbal_summary

In [None]:
# avbal distribution per decision group.
# The bin range used to be the 300-20000 interval copied from the income cell;
# any balance outside it was silently left out of the plot. Edges now span the
# observed range of avbal, and the groups are drawn semi-transparent so a later
# histogram does not hide the ones drawn before it.
bins = np.linspace(df['avbal'].min(), df['avbal'].max(), 50)

f, ax = plt.subplots()
for label, color in [('None', 'tab:gray'), ('A', 'tab:blue'), ('B', 'tab:orange')]:
    ax.hist(df.loc[df['decision'] == label, 'avbal'], bins, label=label, color=color, alpha=0.6)

ax.set_title('Average monthly balance by decision')
ax.set_xlabel('avbal')
ax.set_ylabel('Count')
ax.legend(loc='upper right')
plt.show()

In [None]:
# Skewness of avbal plus two normality diagnostics drawn side by side:
# a density histogram with a KDE and a fitted normal curve, and a normal
# probability (Q-Q) plot. Built on matplotlib/scipy directly because
# seaborn's distplot is deprecated.
avbal = df['avbal'].dropna()  # the fit and the Q-Q plot need NaN-free input

#skewness
print("Skewness: %f" % avbal.skew())

#histogram and normal probability plot
fig, (ax_dist, ax_qq) = plt.subplots(1, 2, figsize=(12, 4))
grid = np.linspace(avbal.min(), avbal.max(), 200)
ax_dist.hist(avbal, bins='auto', density=True, alpha=0.4)
ax_dist.plot(grid, stats.gaussian_kde(avbal)(grid), label='KDE')
ax_dist.plot(grid, norm.pdf(grid, *norm.fit(avbal)), color='black', label='normal fit')
ax_dist.set_xlabel('avbal')
ax_dist.set_ylabel('Density')
ax_dist.legend()
res = stats.probplot(avbal, plot=ax_qq)
plt.show()

In [None]:
# Same diagnostics as for raw avbal, applied to log(avbal): skewness, a
# density histogram with KDE and fitted normal curve, and a normal Q-Q plot.
# Built on matplotlib/scipy directly because seaborn's distplot is deprecated.
avbal = df['avbal'].dropna()

# log() is undefined for zero or negative values (it yields -inf / NaN, which
# breaks the skewness and the fit), so such rows are excluded and reported.
positive_avbal = avbal[avbal > 0]
n_dropped = len(avbal) - len(positive_avbal)
if n_dropped:
    print("Excluded %d non-positive avbal values before the log-transform" % n_dropped)

log_avbal = np.log(positive_avbal)

#skewness
print("Skewness: %f" % log_avbal.skew())

#histogram and normal probability plot
fig, (ax_dist, ax_qq) = plt.subplots(1, 2, figsize=(12, 4))
grid = np.linspace(log_avbal.min(), log_avbal.max(), 200)
ax_dist.hist(log_avbal, bins='auto', density=True, alpha=0.4)
ax_dist.plot(grid, stats.gaussian_kde(log_avbal)(grid), label='KDE')
ax_dist.plot(grid, norm.pdf(grid, *norm.fit(log_avbal)), color='black', label='normal fit')
ax_dist.set_xlabel('log(avbal)')
ax_dist.set_ylabel('Density')
ax_dist.legend()
res = stats.probplot(log_avbal, plot=ax_qq)
plt.show()

### Average Monthly Transaction

In [None]:
# Summary statistics for the avtrans column.
avtrans_summary = df['avtrans'].describe()
avtrans_summary

In [None]:
# avtrans distribution per decision group.
# The bin range used to be the 300-20000 interval copied from the income cell;
# any value outside it was silently left out of the plot. Edges now span the
# observed range of avtrans, and the groups are drawn semi-transparent so a
# later histogram does not hide the ones drawn before it.
bins = np.linspace(df['avtrans'].min(), df['avtrans'].max(), 50)

f, ax = plt.subplots()
for label, color in [('None', 'tab:gray'), ('A', 'tab:blue'), ('B', 'tab:orange')]:
    ax.hist(df.loc[df['decision'] == label, 'avtrans'], bins, label=label, color=color, alpha=0.6)

ax.set_title('Average monthly transaction by decision')
ax.set_xlabel('avtrans')
ax.set_ylabel('Count')
ax.legend(loc='upper right')
plt.show()

In [None]:
# Skewness of avtrans plus two normality diagnostics drawn side by side:
# a density histogram with a KDE and a fitted normal curve, and a normal
# probability (Q-Q) plot. Built on matplotlib/scipy directly because
# seaborn's distplot is deprecated.
avtrans = df['avtrans'].dropna()  # the fit and the Q-Q plot need NaN-free input

#skewness
print("Skewness: %f" % avtrans.skew())

#histogram and normal probability plot
fig, (ax_dist, ax_qq) = plt.subplots(1, 2, figsize=(12, 4))
grid = np.linspace(avtrans.min(), avtrans.max(), 200)
ax_dist.hist(avtrans, bins='auto', density=True, alpha=0.4)
ax_dist.plot(grid, stats.gaussian_kde(avtrans)(grid), label='KDE')
ax_dist.plot(grid, norm.pdf(grid, *norm.fit(avtrans)), color='black', label='normal fit')
ax_dist.set_xlabel('avtrans')
ax_dist.set_ylabel('Density')
ax_dist.legend()
res = stats.probplot(avtrans, plot=ax_qq)
plt.show()

In [None]:
# Same diagnostics as for raw avtrans, applied to log(avtrans): skewness, a
# density histogram with KDE and fitted normal curve, and a normal Q-Q plot.
# Built on matplotlib/scipy directly because seaborn's distplot is deprecated.
avtrans = df['avtrans'].dropna()

# log() is undefined for zero or negative values (it yields -inf / NaN, which
# breaks the skewness and the fit), so such rows are excluded and reported.
positive_avtrans = avtrans[avtrans > 0]
n_dropped = len(avtrans) - len(positive_avtrans)
if n_dropped:
    print("Excluded %d non-positive avtrans values before the log-transform" % n_dropped)

log_avtrans = np.log(positive_avtrans)

#skewness
print("Skewness: %f" % log_avtrans.skew())

#histogram and normal probability plot
fig, (ax_dist, ax_qq) = plt.subplots(1, 2, figsize=(12, 4))
grid = np.linspace(log_avtrans.min(), log_avtrans.max(), 200)
ax_dist.hist(log_avtrans, bins='auto', density=True, alpha=0.4)
ax_dist.plot(grid, stats.gaussian_kde(log_avtrans)(grid), label='KDE')
ax_dist.plot(grid, norm.pdf(grid, *norm.fit(log_avtrans)), color='black', label='normal fit')
ax_dist.set_xlabel('log(avtrans)')
ax_dist.set_ylabel('Density')
ax_dist.legend()
res = stats.probplot(log_avtrans, plot=ax_qq)
plt.show()