## SCIPY
- NumPy provides a high-performance multidimensional array object
- SciPy builds on this, and provides a large number of functions that operate on numpy arrays and are useful for different types of scientific and engineering applications

In [None]:
from scipy import stats
import math

In [None]:
# Hypothesis testing, import the stats package for calculating the probability
import numpy as np
import pandas as pd


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from matplotlib.pylab import rcParams
# Default figure size applied to every plot in this notebook: (width, height) = (8, 4).
plt.rcParams["figure.figsize"]=(8,4)

### Z-Test

In [None]:
# One-sample, left-tailed Z-test.
#   H0: mu >= 30   vs   H1: mu < 30
# Under H0 the mean of n observations is distributed as
# Normal(mu0, sigma / sqrt(n)), so the p-value is the probability of drawing a
# sample mean at or below the observed one.
xbar = 28    # observed sample mean
mu0 = 30     # hypothesized population mean
sigma = 6    # population standard deviation
n = 36       # sample size

# stats.norm.cdf(sample mean, hypothesized mean, standard error)
# Standard error = 6 / sqrt(36) = 1, so z = (28 - 30) / 1 = -2 and p ~= 0.0228.
# p < 0.05 => reject H0 at the 5% significance level.
stats.norm.cdf(xbar, mu0, sigma / math.sqrt(n))


### Independent two-sample t-test

In [None]:
# Simulate sales for three racks. Every rack gets 30 draws from a normal
# distribution with standard deviation 10 and its own mean (79, 85, 90); the
# draws are truncated to whole numbers by the int cast.
def _simulate_rack(label, mean_sales):
    """Return a frame of 30 integer sales draws tagged with the rack label."""
    draws = np.random.normal(mean_sales, 10, 30)
    return pd.DataFrame({'rack': label, 'sales': draws.astype(int)})


rack1 = _simulate_rack('rack1', 79)
rack2 = _simulate_rack('rack2', 85)
rack3 = _simulate_rack('rack3', 90)

# Stack the three frames into one long table; each frame keeps its own 0..29 index.
sales_data = pd.concat([rack1, rack2, rack3])

sales_data.head(10)


In [None]:
# Mean sales per rack: group the rows by rack label, then average the sales column.
sales_data.groupby('rack')['sales'].mean()

In [None]:
# Bar chart of sales per rack; ci=None turns off the confidence-interval error bars.
# NOTE(review): recent seaborn releases deprecate `ci` in favour of `errorbar=None` — confirm against the installed version.
sns.barplot(x='rack', y='sales', data=sales_data, ci=None )

#### Independent two-sample t-test (Rack1 vs Rack2), H0: mu1 = mu2

In [None]:
# Pull out each rack's sales column with a boolean mask on the rack label.
rack1_sales = sales_data.loc[sales_data['rack'] == 'rack1', 'sales']
rack2_sales = sales_data.loc[sales_data['rack'] == 'rack2', 'sales']

# Independent two-sample t-test, H0: mu1 = mu2.
# The sales figures are random draws, so the p-value changes on every run. In the
# recorded run p = 0.39 > 0.05, so the null hypothesis that both racks have a
# similar influence on sales could not be rejected.
stats.ttest_ind(rack1_sales, rack2_sales)

#### Independent two-sample t-test (Rack2 vs Rack3), H0: mu2 = mu3

In [None]:
# Select the two groups being compared. rack2_sales must hold the 'rack2' rows:
# it is compared against rack3 here and is reused by the ANOVA cell further down.
rack2_sales = sales_data[sales_data.rack == 'rack2'].sales
rack3_sales = sales_data[sales_data.rack == 'rack3'].sales

# Independent two-sample t-test, H0: mu2 = mu3.
# The sales figures are random draws, so the p-value changes on every run.
# Reject H0 at the 5% level only when the reported p-value is below 0.05; in that
# case the two racks do not have a similar influence on sales.
stats.ttest_ind(rack2_sales, rack3_sales)

#### ANOVA Test (Rack1 vs Rack2 vs Rack3),  H0 : mu1 = mu2 = mu3
- The ANOVA F-test checks whether the differences among the group means are statistically significant or just due to sampling variability.

In [None]:
# Peek at the first rows of the rack3 sales series before running the ANOVA.
rack3_sales.head()

In [None]:
# Get the average sales for each rack by using groupby; these group means are what the ANOVA compares.
sales_data.groupby('rack').sales.mean()

In [None]:
# Display the full rack3 sales series.
rack3_sales

In [None]:
# One-way ANOVA across the three racks; a small p-value means at least one group mean differs.
stats.f_oneway(rack1_sales, rack2_sales, rack3_sales) # H0 => mu1 = mu2 = mu3
# p=0.004 < 0.05 in the recorded run (the data are random, so the value varies between runs):
# we can reject the null hypothesis that all the racks have a similar influence on sales

In [None]:
# ANOVA F-test via a linear model: checks whether the differences among the group means are
# statistically significant or just due to sampling variability.
# H0: mu1=mu2=mu3   , F = (Variation among group means) / (Variation within groups)

import statsmodels.formula.api as smf
# Regress sales on the categorical rack label, then print the fitted model's summary table.
model = smf.ols(formula='sales ~ rack', data=sales_data)
results = model.fit()
print(results.summary())  # F-statistic p-value < .05 (.0048 in the recorded run) => we can reject the null hypothesis that avg sales are the same for each rack.
                          # We can then conclude that there is a relationship between the categorical variable rack and demand (product sales).

### Chisq test  :  Test of independence (Categorical Variables)

In [None]:
# Load the Titanic training data from a local Excel workbook.
# NOTE(review): the path is an absolute, machine-specific Windows location — point it at your own copy before running.
input_data = pd.read_excel('C:\\Users\\jp\\Desktop\\testData\\titanic_train.xlsx')

# Metadata - https://www.kaggle.com/c/titanic/data
# Goal: predict survival - analyse what sorts of people were likely to survive (disaster survival prediction).

In [None]:
# Show the first five rows to check that the data loaded.
input_data.head(5)

In [None]:
# Print the column names, dtypes and non-null counts.
input_data.info()

In [None]:
# Cast Survived and Pclass to the generic object dtype so they are handled as
# category labels rather than numeric measurements.
# NOTE(review): assumes both columns arrive as numeric codes — confirm with input_data.info().
for label_column in ('Survived', 'Pclass'):
    input_data[label_column] = input_data[label_column].astype('object')

####  Gender vs Survival

In [None]:
# Contingency table of counts: one row per Gender value, one column per Survived value.
cross_tab_d = pd.crosstab(index=input_data['Gender'], columns=input_data['Survived'])
cross_tab_d

In [None]:
# Count of passengers per Survived value, with one bar per Gender inside each group.
sns.countplot(x='Survived', hue='Gender', data=input_data)

#### Looks like there is a trend: people who did not survive are much more likely to be male

- We can run the **Chi-square test** to check if there is a relationship

In [None]:
# Chi-square test of independence on the contingency table held in cross_tab_d
# (expected to be the Gender x Survived table built in the preceding cell).
# The call returns the test statistic, the p-value, the degrees of freedom
# (stored here in `ddof`) and the table of expected frequencies under H0.
chi2, p, ddof, expected = stats.chi2_contingency(cross_tab_d.values) # H0 = gender and survival are independent

msg = "Test Statistic: {}\np-value: {}\nDegrees of Freedom: {}\n" # p-value < .05 => reject H0
print( msg.format( chi2, p, ddof ) )
print( expected )

#### Passenger Class (socio-economic status) vs Survival
- Pclass -  ( 1 = 1st, 2 = 2nd, 3 = 3rd ) 

In [None]:
# Contingency table of counts: one row per passenger class, one column per Survived value.
# This rebinds cross_tab_d, replacing whatever table it held before.
cross_tab_d = pd.crosstab(index=input_data['Pclass'], columns=input_data['Survived'])
cross_tab_d

In [None]:
# Count of passengers per class, with one bar per Survived value inside each class.
sns.countplot(x='Pclass', hue='Survived', data=input_data)

#### Looks like there is a trend: the people who did not survive are mostly from the third class.

In [None]:
# Chi-square test of independence on the contingency table held in cross_tab_d
# (expected to be the Pclass x Survived table built in the preceding cell).
# `ddof` receives the degrees of freedom; `expected` the frequencies expected under H0.
chi2, p, ddof, expected = stats.chi2_contingency(cross_tab_d.values) # H0 = passenger class and survival are independent

msg = "Test Statistic: {}\np-value: {}\nDegrees of Freedom: {}\n"    # p-value < .05 => reject H0
print( msg.format( chi2, p, ddof ) )
print( expected )

#### Chi-square test to check if Gender and working-preference are independent

In [None]:
# Toy survey: one (gender, department) pair per respondent, 11 respondents in total.
_responses = [
    ('M', 'Fin'), ('F', 'Fin'), ('M', 'Soft'), ('F', 'Fin'),
    ('F', 'Soft'), ('F', 'Soft'), ('F', 'Soft'), ('M', 'Fin'),
    ('F', 'Soft'), ('M', 'Fin'), ('F', 'Soft'),
]

# Split the pairs back into the two columns the rest of the notebook expects.
df_test = pd.DataFrame({
    'gend': [pair[0] for pair in _responses],
    'dept': [pair[1] for pair in _responses],
})
df_test

In [None]:
import pandas as pd

# Contingency table of counts: one row per gender, one column per department.
cross_tab_d = pd.crosstab(index=df_test['gend'], columns=df_test['dept'])
cross_tab_d

In [None]:
# Count of respondents per department, with one bar per gender inside each department.
sns.countplot(x='dept', hue='gend', data=df_test)

In [None]:
# Chi-square test of independence on the contingency table held in cross_tab_d
# (expected to be the gend x dept table built from df_test).
# NOTE(review): df_test has only 11 rows, so every expected count is below 5 and the
# chi-square approximation is unreliable; an exact test such as stats.fisher_exact may suit this 2x2 table better — confirm.
chi2, p, ddof, expected = stats.chi2_contingency(cross_tab_d.values) # H0 = gender and department preference are independent

msg = "Test Statistic: {}\np-value: {}\nDegrees of Freedom: {}\n" #  p-value > .05 => cannot reject H0
print( msg.format( chi2, p, ddof ) )
print( expected )

#### Distance Computation using SCIPY

In [None]:
# Euclidean distance
from scipy.spatial import distance

In [None]:
import numpy as np
import pandas as pd

# Four entities (rows e1..e4) described by three features (columns a, b, c),
# filled with standard-normal random values.
df1 = pd.DataFrame(
    data=np.random.randn(4, 3),
    index=['e1', 'e2', 'e3', 'e4'],
    columns=['a', 'b', 'c'],
)
df1

In [None]:
# Distance between e1 and e2 using Euclidean distance (a smaller distance means the rows are more similar)

In [None]:
# Select the row labelled 'e1' as a Series indexed by the column names.
df1.loc['e1']

In [None]:
# Euclidean distance between the rows e1 and e2.
distance.euclidean(df1.loc['e1'], df1.loc['e2'])
# To compare another pair, change the labels, e.g.: distance.euclidean(df1.loc['e2'], df1.loc['e3'])

In [None]:
# calculating the distances 
from scipy.spatial.distance import pdist, squareform

In [None]:
# Pairwise Euclidean distances between all rows of df1.
distances = pdist(df1.values, metric='euclidean')  
distances                      #  The result is a "flat" (condensed) array that holds only the upper triangle of the distance matrix.

In [None]:
# Expand the condensed distance array into the full square, symmetric distance matrix.
dist_matrix = squareform(distances)
dist_matrix