In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

import acquire as a
import prepare as p

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
def get_mallcustomer_data():
    '''
    Read every row of the `customers` table, using whatever
    acquire.get_connection('mall_customers') returns as the connection,
    and return the result as a dataframe indexed by `customer_id`.
    '''
    query = 'SELECT * FROM customers;'
    connection = a.get_connection('mall_customers')
    customers = pd.read_sql(query, connection)
    return customers.set_index('customer_id')

In [3]:
df = get_mallcustomer_data()

In [4]:
df.head()

Unnamed: 0_level_0,gender,age,annual_income,spending_score
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Male,19,15,39
2,Male,21,15,81
3,Female,20,16,6
4,Female,23,16,77
5,Female,31,17,40


In [5]:
# from explore.py:
def train_validate_test_split(df, target, seed=1349, stratify=False):
    '''
    Split a dataframe into train, validate and test dataframes.

    Test is 20% of the original dataset, validate is .30*.80 = 24% of the
    original dataset, and train is .70*.80 = 56% of the original dataset.

    Parameters
    ----------
    df : dataframe to split.
    target : name of the target column. It is only consulted when
        `stratify` is True; with the default settings the split is purely
        random and `target` is ignored.
    seed : integer passed as random_state to both splits so the result is
        reproducible.
    stratify : when True, both splits are stratified on df[target]. Leave
        it False for a continuous target, because stratification needs
        several rows per distinct target value.

    Returns, in this order, the train, validate and test dataframes.
    '''
    # stratify=None is train_test_split's own default, so the default call
    # behaves exactly like an unstratified split.
    strata = df[target] if stratify else None
    train_validate, test = train_test_split(df, test_size=0.2,
                                            random_state=seed,
                                            stratify=strata)
    # The second split must stratify on the rows that remain after the first.
    strata = train_validate[target] if stratify else None
    train, validate = train_test_split(train_validate, test_size=0.3,
                                       random_state=seed,
                                       stratify=strata)
    return train, validate, test

In [6]:
def scale_my_data(train, validate, test):
    '''
    Min-max scale `age` and `annual_income` and attach the results as the
    new columns `age_scaled` and `income_scaled`.

    The scaler is fit on train only and then applied to all three splits,
    so validate and test values may fall outside the [0, 1] range.

    The inputs are not modified: each split is copied before the new
    columns are added, which also avoids assigning into the slices that
    train_test_split hands back.

    Returns, in this order, the train, validate and test dataframes with
    the two scaled columns added.
    '''
    source_cols = ['age', 'annual_income']
    scaled_cols = ['age_scaled', 'income_scaled']

    scaler = MinMaxScaler()
    scaler.fit(train[source_cols])

    scaled_frames = []
    for frame in (train, validate, test):
        frame = frame.copy()
        frame[scaled_cols] = scaler.transform(frame[source_cols])
        scaled_frames.append(frame)

    train, validate, test = scaled_frames
    return train, validate, test

In [None]:
def prep_mall(df):
    '''
    Prepare the mall customers dataframe for exploration.

    - replaces `gender` with the 0/1 flag `is_male`
    - adds `spending_class`, which cuts spending_score into its 4 quartiles
      labelled q1, q2, q3, q4
    - splits into train, validate and test (passing 'spending_score' as the
      target name to train_validate_test_split)
    - scales age and annual income on each split

    The dataframe passed in is left untouched; all changes are made on a copy.

    Returns, in this order, the prepared full dataframe followed by the
    train, validate and test dataframes.
    '''
    # Copy first so that adding is_male does not leak into the caller's frame.
    df = df.copy()

    # Direct comparison rather than get_dummies(..., drop_first=True)['Male']:
    # that lookup raises KeyError when only one gender is present, and it
    # yields booleans instead of 0/1 on newer pandas versions.
    df['is_male'] = (df['gender'] == 'Male').astype('uint8')
    df = df.drop(columns=['gender'])

    # NOTE(review): the quartile edges are computed on the full dataset,
    # before the split -- confirm that this is acceptable for modeling.
    df['spending_class'] = pd.qcut(df.spending_score, q=4, labels=['q1', 'q2', 'q3', 'q4'])

    train, validate, test = train_validate_test_split(df, target='spending_score', seed=1349)
    train, validate, test = scale_my_data(train, validate, test)
    return df, train, validate, test

In [7]:
# NOTE: this rebinds df to the prepared frame, which no longer has a 'gender'
# column. Re-running this cell alone therefore fails inside prep_mall;
# reload df with get_mallcustomer_data() first.
df, train, validate, test = prep_mall(df)

**Goals of exploration**

- Can we see patterns, find signals in the data? 

- What features are driving the outcome?

- Are there other features we can construct that have stronger relationships? 

- Use Visualization and statistical testing to help answer these questions. 

- We want to walk away from exploration with modeling strategies (feature selection, algorithm selection, evaluation methods, for example).

**Agenda**

- Wrangle data (acquire, prep, summarize, split)

- Identify questions to answer, hypotheses to test.

In [None]:
# planned, acquired, prepped 
#we are exploring

In [8]:
train.head()

Unnamed: 0_level_0,age,annual_income,spending_score,is_male,spending_class,age_scaled,income_scaled
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
27,45,28,32,0,q1,0.519231,0.106557
24,31,25,73,1,q3,0.25,0.081967
40,20,37,75,0,q4,0.038462,0.180328
38,30,34,73,0,q3,0.230769,0.155738
57,51,44,50,0,q2,0.634615,0.237705


In [None]:
#in the scope of exploration:
# - frame questions with our tools:
# univariate exploration
# bivariate exploration
#multivariate exploration

In [None]:
# we will be creating bins for these features

In [None]:
# 1. Univariate exploration
# What is the distribution of each variable?

In [None]:
p.distribution(df)

In [None]:
# Notes:
# spending score looks mostly normal
# income and age appear to be skewed right
# note the skew on age and income if binning
# slight class imbalance on is_male


In [None]:
# Exploration:
#
# Visualization for exploration should be fast and informative.
# It serves the purpose of elaborating on relationships in your data and potential drivers of your target.

In [None]:
#2.Bivariate:
#Does spending score vary across genders?

In [None]:
# continuous variable: spending score
# categorical variable: gender
# box plot to compare means visually
# do some statistical testing

In [None]:
# Spending score distribution for each value of is_male.
# x and y are passed by keyword: seaborn deprecated positional x/y and newer
# releases reject them.
sns.boxplot(x='is_male', y='spending_score', data=train)
plt.title(' Potential difference in spending score across gender?')
plt.show()

In [None]:
# Levene's test: H0 is that spending_score has equal variance in the
# is_male == 0 and is_male == 1 groups (it does not compare the group means).
# Its result informs the choice of equal_var in a subsequent t-test.
stats.levene(train[train.is_male==0].spending_score, train[train.is_male==1].spending_score)

In [None]:
# Two-sample t-test. H0: mean spending_score is the same for is_male == 0 and is_male == 1.
# equal_var=True selects the standard t-test that assumes equal group variances.
stats.ttest_ind(train[train.is_male==0].spending_score, train[train.is_male==1].spending_score, equal_var=True)

We cannot reject the null hypothesis, so we will continue to assume that there is no significant difference in the means between the two groups.

In [None]:
# 3. Bivariate:
# Is there a relationship between spending score and annual income?

In [None]:
print('Relationship between Income and Spending Score?')
# A jointplot figure holds three axes (joint plus two marginals). Setting the
# labels through the returned JointGrid targets the joint axes explicitly,
# instead of relying on whichever axes pyplot currently treats as active.
grid = sns.jointplot(x='annual_income', y='spending_score', data=train)
grid.set_axis_labels('Income', 'Spending Score')
plt.show()


In [None]:
# Note: looks like we might be able to create clusters from this variable (will explore more to figure out how)
# No explicit linear correlation noted

In [None]:
# 4. Is there a relationship between spending score and age

In [None]:
# Age against spending score for the training split.
plt.scatter(x=train['age'], y=train['spending_score'])
plt.title('Age vs Spending Score')
plt.xlabel('age')
plt.ylabel('Spending Score')
plt.show()

In [None]:
# Bin age into two groups using pd.cut's default right-closed intervals:
# (0, 40] and (40, 80]. Ages outside (0, 80] would become NaN.
train['age_bin'] =pd.cut(train.age, [0,40,80])

In [None]:
train.head()

In [None]:
# Spending score distribution within each age bin.
# x and y are passed by keyword: seaborn deprecated positional x/y and newer
# releases reject them.
sns.boxplot(x='age_bin', y='spending_score', data=train)
plt.title('Spending Score across age bins')
plt.show()

In [None]:
# Levene's test. H0: spending_score has equal variance in the 40-and-under group and the over-40 group
# alpha = 0.05
stats.levene(train[train.age <= 40].spending_score, train[train.age>40].spending_score)


In [None]:
# Reject H0: the variance is not equal

In [None]:
# H0: mean spending_score of the 40-and-under group == mean spending_score of the over-40 group
# equal_var=False runs Welch's t-test, which does not assume equal variances.
stats.ttest_ind(train[train.age <= 40].spending_score, train[train.age>40].spending_score, equal_var=False)

In [None]:
# Reject the null hypothesis: the test implies a difference in mean spending score between the two age groups.
# Conclusion: the group means differ, but no significant correlation (r value) can be concluded.

In [None]:
#multivariate:
#5. If we control for age, does spending score differ across annual income

In [None]:
# Income vs spending score, restricted to customers aged 40 or younger.
sns.scatterplot(data=train.loc[train['age'] <= 40],
                x='annual_income',
                y='spending_score')
plt.title('Income vs Spending Score for those under up through 40 years old')
plt.show()

In [None]:
# Income vs spending score, restricted to customers older than 40.
sns.scatterplot(data=train.loc[train['age'] > 40],
                x='annual_income',
                y='spending_score')
plt.title('Income vs Spending Score for those over 40 years old')
plt.show()

In [None]:
# Possibly a quadratic relationship with income for those over 40 years old?

In [None]:
# Overlay two age groups on one set of axes. Each layer carries a label so
# that the legend explains which colour belongs to which group.
sns.scatterplot(x='annual_income', y='spending_score',
                data=train[train.age <= 30],
                color='blue', label='age <= 30')
sns.scatterplot(x='annual_income', y='spending_score',
                data=train[(train.age > 30) & (train.age <= 40)],
                color='red', label='30 < age <= 40')
plt.title('Income vs Spending Score for those under 30 to those between 30 and 40  years old')
plt.legend(title='Age group')
plt.show()

In [None]:
# Is gender affecting this?


In [None]:
# Income vs spending score for customers aged 40 or younger, coloured by is_male.
sns.scatterplot(data=train.loc[train['age'] <= 40],
                x='annual_income',
                y='spending_score',
                hue='is_male')
plt.title('Gender accounting for upper vs lower in age groups')
plt.show()

In [None]:
# Possible underlying differences that we have not observed. Note the potential quadratic relationship observed earlier within the age brackets.

In [None]:
#6.if we control for income , does spending score differ across age?

In [None]:
# Bin annual_income into three brackets using pd.cut's default right-closed
# intervals: (0, 40], (40, 70] and (70, 140].
train['income_bin'] = pd.cut(train.annual_income, [0,40,70,140])

In [None]:
train.head()

In [None]:
train.income_bin.value_counts()

In [None]:
# Age vs spending score for the whole training split, coloured by income bracket.
sns.scatterplot(data=train,
                x='age',
                y='spending_score',
                hue='income_bin')
plt.title('Age to Spending Score controlling for Income Bins')
plt.show()

**takeaways**

- consistent spending score across age for middle class folks
- no strong observable correlation

In [None]:
# If you want to do a pairplot with a significant amount of data:
# use .sample() to take a randomized subset of the data

In [None]:
# Pairwise plots of the training columns, coloured by age bin.
sns.pairplot(data=train, hue='age_bin')
plt.show()

In [None]:
# Pairwise plots of the training columns, coloured by is_male.
sns.pairplot(data=train, hue='is_male')
plt.show()

In [None]:
# Pairwise plots of the training columns, coloured by income bracket.
sns.pairplot(data=train, hue='income_bin')
plt.show()

In [None]:
# From what we have observed across our overall questions:
# No notable correlations observed from the questions posed thus far
# Significant difference in mean spending score across age brackets (drivers not yet known)
# Potential groupings on income brackets that could lead to spending score

In [None]:
#further exploration notes:
#see how we can leverage income brackets for modeling