Modules:
1. Scipy: scipy.stats

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import math

In [2]:
cust = pd.read_csv('cust_seg.csv')
cust.head()

Unnamed: 0,custid,sex,AqChannel,region,Marital_status,segment,pre_usage,Post_usage_1month,Latest_mon_usage,post_usage_2ndmonth
0,70,0,4,1,1,1,57,52,49.2,57.2
1,121,1,4,2,1,3,68,59,63.6,64.9
2,86,0,4,3,1,1,44,33,64.8,36.3
3,141,0,4,3,1,3,63,44,56.4,48.4
4,172,0,4,2,1,2,47,52,68.4,57.2


In [3]:
cust.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
custid                 200 non-null int64
sex                    200 non-null int64
AqChannel              200 non-null int64
region                 200 non-null int64
Marital_status         200 non-null int64
segment                200 non-null int64
pre_usage              200 non-null int64
Post_usage_1month      200 non-null int64
Latest_mon_usage       200 non-null float64
post_usage_2ndmonth    200 non-null float64
dtypes: float64(2), int64(8)
memory usage: 15.7 KB


In [4]:
cust.describe()

Unnamed: 0,custid,sex,AqChannel,region,Marital_status,segment,pre_usage,Post_usage_1month,Latest_mon_usage,post_usage_2ndmonth
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,100.5,0.545,3.43,2.055,1.16,2.025,52.23,52.775,63.174,58.0525
std,57.879185,0.49922,1.039472,0.724291,0.367526,0.690477,10.252937,9.478586,11.242137,10.426445
min,1.0,0.0,1.0,1.0,1.0,1.0,28.0,31.0,39.6,34.1
25%,50.75,0.0,3.0,2.0,1.0,2.0,44.0,45.75,54.0,50.325
50%,100.5,1.0,4.0,2.0,1.0,2.0,50.0,54.0,62.4,59.4
75%,150.25,1.0,4.0,3.0,1.0,2.25,60.0,60.0,70.8,66.0
max,200.0,1.0,4.0,3.0,2.0,3.0,76.0,67.0,90.0,73.7


In [5]:
import scipy.stats as stats

1. One-Sample t-test

In [6]:
stats.ttest_1samp(a=cust.Latest_mon_usage,popmean=50)

Ttest_1sampResult(statistic=16.57233752433133, pvalue=2.4963719280931583e-39)

In [7]:
cust.Latest_mon_usage.mean()

63.17400000000001

Accept the alternative hypothesis.

2. Two-Sample t-test

In [9]:
stats.ttest_rel(a=cust.pre_usage,b=cust.Post_usage_1month)

Ttest_relResult(statistic=-0.8673065458794775, pvalue=0.3868186820914985)

In [10]:
stats.ttest_rel(a=cust.pre_usage,b=cust.post_usage_2ndmonth)

Ttest_relResult(statistic=-8.866832246938742, pvalue=4.295733828012836e-16)

Conclusion: Post usage of second month is significantly diff from pre usage. So Campaign is successful after the first month.

3. Independent sample t-test

In [12]:
Males_spend = cust.Post_usage_1month[cust.sex == 0]
Females_spend = cust.Post_usage_1month[cust.sex == 1]

In [14]:
Males_spend.std()

10.305160697259263

In [15]:
Females_spend.std()

8.13371516959346

In [16]:
stats.ttest_ind(a=Males_spend,b=Females_spend,equal_var=False)

Ttest_indResult(statistic=-3.6564080478875276, pvalue=0.00034088493594266187)

Conclusion: Reject null, so males and females spend differently.

In [18]:
stats.f_oneway(Males_spend,Females_spend)

F_onewayResult(statistic=13.94330754080599, pvalue=0.0002462546120354903)

4. ANOVA

In [20]:
cust.segment.value_counts()

2    105
3     50
1     45
Name: segment, dtype: int64

In [21]:
s1 = cust.Latest_mon_usage[cust.segment == 2]
s2 = cust.Latest_mon_usage[cust.segment == 3]
s3 = cust.Latest_mon_usage[cust.segment == 1]

In [22]:
stats.f_oneway(s1,s2,s3)

F_onewayResult(statistic=29.27928380132178, pvalue=7.36401083352674e-12)

Conclusion: Segments are influencing the spend.

In [23]:
s1.mean()

68.08000000000003

In [24]:
s2.mean()

55.703999999999986

In [25]:
s3.mean()

60.026666666666685

5. Chi-Square test

In [27]:
t = pd.crosstab(cust.segment,cust.region,margins=True)
t

region,1,2,3,All
segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,16,20,9,45
2,19,44,42,105
3,12,31,7,50
All,47,95,58,200


In [28]:
stats.chi2_contingency(observed=t)

(16.60444164948934,
 0.055282939487992365,
 9,
 array([[ 10.575,  21.375,  13.05 ,  45.   ],
        [ 24.675,  49.875,  30.45 , 105.   ],
        [ 11.75 ,  23.75 ,  14.5  ,  50.   ],
        [ 47.   ,  95.   ,  58.   , 200.   ]]))

Conclusion: p value close to 0.05, so there is relationship between segment and region

6. Correlation

In [31]:
stats.pearsonr(x=cust.Latest_mon_usage,y=cust.pre_usage)

(0.6622801251558604, 1.2767419295068468e-26)

Conclusion: p value is less, so there is a relationship.

Here, corr=0.6622, so positive linear relationship.