# Lesson 1

## Library loading

In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns 

## Feature selection based on variance.

Here we will use variance threshold to remove some features that very low variance, ie those columns have almost the same values in all the rows. Hence such features do not explain anything about the target variable 

In [2]:
numerical = pd.read_csv('numerical.csv')
categorical = pd.read_csv('categorical.csv')
targets = pd.read_csv('target.csv')

In [3]:
from sklearn.feature_selection import VarianceThreshold

Remember the definition of the variance:

$\sigma_{col}^{2}=\frac{1}{n}\sum_{i=1}^{n}(x_{i}-\bar{X})^2$

In [4]:
sel = VarianceThreshold(threshold=(.9))
temp = sel.fit_transform(numerical)
temp = pd.DataFrame(temp)
print(numerical.shape)
print(temp.shape)

(95412, 315)
(95412, 305)


As it can be clearly seen, the VarianceThreshold method reduces the size of the dataframe droping those columns for which the variance is smaller than 0.9.

To check which columns were removed we can do:

In [5]:
list(sel.get_support())

[True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 False,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True,
 True

In [6]:
list(sel.variances_)

[909809.7483785483,
 208.8158971989076,
 2.8881932699556185,
 7.320489004283561,
 86.61745725511418,
 25.65924724942382,
 131.57223253672964,
 227.9397116710221,
 313.6076948242227,
 19.286818754804564,
 26.281939824591404,
 17.265241830723728,
 8.311701637927996,
 32984198.86101045,
 2126043.3424667185,
 4507490.409069261,
 2249.6547195922,
 974.8079246455828,
 1603.0129942634196,
 30.97534865272938,
 33.27364006327841,
 441.57831545183956,
 278.5789169921517,
 12.03604563865652,
 49.980973211488966,
 190.05348704784367,
 0.4614096996326538,
 4.9825142559714894,
 6.419788645235147,
 5.332371515424055,
 1.007061136251383,
 1.1111593939063544,
 1.368116911416873,
 128.44765349637763,
 1.6713672732722233,
 10.295326351943883,
 11.32502530691183,
 69.48179133914489,
 68.05635235184752,
 65.75686276277202,
 52.72468559024539,
 48.441176213956375,
 47.3710295874623,
 56.47911443932282,
 67.7692564686937,
 40.71093052337281,
 33.95626692614386,
 36.82299864542428,
 60.64881975087444,
 38.576

In [7]:
my_list_of_tuples = [(index,value) for (index,value) in enumerate(list(sel.variances_)) if value < 0.9]

In [8]:
cols_nul_variance =  [list(numerical.columns)[index] for index in [x[0] for x in my_list_of_tuples]]
cols_nul_variance

['ETH6',
 'TPE6',
 'TPE7',
 'ANC5',
 'ANC6',
 'ANC11',
 'ANC15',
 'HC15',
 'MHUC2',
 'HPHONE_D']

## Activity 1

Fill the missing code:


In [9]:
X = numerical
y = targets['TARGET_B']

In [10]:
targets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95412 entries, 0 to 95411
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   TARGET_B  95412 non-null  int64  
 1   TARGET_D  95412 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 1.5 MB


In [11]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Beware of the dog!!! 
kbest = SelectKBest(chi2, k=10).fit_transform(X, y)
# Here we chose 10 so that is easier to analyze results later, as we will see
selected = pd.DataFrame(kbest)
selected.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,992.0,264.0,479.0,635.0,0.0,318.0,12883.0,240.0,95515.0
1,1.0,3611.0,940.0,5468.0,5218.0,4480.0,1096.0,36175.0,47.0,148535.0
2,1.0,7001.0,2040.0,497.0,546.0,0.0,292.0,11576.0,202.0,15078.0
3,0.0,640.0,160.0,1000.0,1263.0,9340.0,388.0,15130.0,109.0,172556.0
4,0.0,2520.0,627.0,576.0,594.0,5000.0,250.0,9836.0,254.0,7112.0


In [12]:
# To check the scores
model = SelectKBest(chi2, k=10).fit(X, y)
df =pd.DataFrame(data = model.scores_, columns = ['score'])
df['Column'] = numerical.columns
print(df.sort_values(by = ['score'], ascending = False).head(10))
print()

cols = df.sort_values(by = ['score'], ascending = False).head(10)['Column']
print(cols)

             score    Column
311  527716.426176  CONTROLN
140  187983.976667       IC5
83    49855.611718       HV1
84    49561.067003       HV2
0     39087.069814     TCODE
133   26891.429352       MSA
13    17167.230879    POP901
137    2921.367106       IC2
14     2811.233301    POP902
303    2756.199364  RAMNTALL

311    CONTROLN
140         IC5
83          HV1
84          HV2
0         TCODE
133         MSA
13       POP901
137         IC2
14       POP902
303    RAMNTALL
Name: Column, dtype: object


In [13]:
display(selected)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.0,992.0,264.0,479.0,635.0,0.0,318.0,12883.0,240.0,95515.0
1,1.0,3611.0,940.0,5468.0,5218.0,4480.0,1096.0,36175.0,47.0,148535.0
2,1.0,7001.0,2040.0,497.0,546.0,0.0,292.0,11576.0,202.0,15078.0
3,0.0,640.0,160.0,1000.0,1263.0,9340.0,388.0,15130.0,109.0,172556.0
4,0.0,2520.0,627.0,576.0,594.0,5000.0,250.0,9836.0,254.0,7112.0
...,...,...,...,...,...,...,...,...,...,...
95407,1.0,27380.0,7252.0,988.0,1025.0,380.0,481.0,18807.0,25.0,184568.0
95408,1.0,1254.0,322.0,1679.0,1723.0,3360.0,836.0,26538.0,20.0,122706.0
95409,1.0,552.0,131.0,376.0,377.0,4040.0,264.0,12178.0,58.0,189641.0
95410,0.0,1746.0,432.0,2421.0,2459.0,8735.0,544.0,15948.0,498.0,4693.0


In [14]:
display(numerical)

Unnamed: 0,TCODE,AGE,INCOME,WEALTH1,HIT,MALEMILI,MALEVET,VIETVETS,WWIIVETS,LOCALGOV,STATEGOV,FEDGOV,WEALTH2,POP901,POP902,POP903,POP90C1,POP90C2,POP90C3,POP90C4,POP90C5,ETH1,ETH2,ETH3,ETH4,ETH5,ETH6,ETH7,ETH8,ETH9,ETH10,ETH11,ETH12,ETH13,ETH14,ETH15,ETH16,AGE901,AGE902,AGE903,AGE904,AGE905,AGE906,AGE907,CHIL1,CHIL2,CHIL3,AGEC1,AGEC2,AGEC3,AGEC4,AGEC5,AGEC6,AGEC7,CHILC1,CHILC2,CHILC3,CHILC4,CHILC5,HHAGE1,HHAGE2,HHAGE3,HHN1,HHN2,HHN3,HHN4,HHN5,HHN6,MARR1,MARR2,MARR3,MARR4,HHP1,HHP2,DW1,DW2,DW3,DW4,DW5,DW6,DW7,DW8,DW9,HV1,HV2,HV3,HV4,HU1,HU2,HU3,HU4,HU5,HHD1,HHD2,HHD3,HHD4,HHD5,HHD6,HHD7,HHD8,HHD9,HHD10,HHD11,HHD12,ETHC1,ETHC2,ETHC3,ETHC4,ETHC5,ETHC6,HVP1,HVP2,HVP3,HVP4,HVP5,HVP6,HUR1,HUR2,RHP1,RHP2,RHP3,RHP4,HUPA1,HUPA2,HUPA3,HUPA4,HUPA5,HUPA6,HUPA7,RP1,RP2,RP3,RP4,MSA,ADI,DMA,IC1,IC2,IC3,IC4,IC5,IC6,IC7,IC8,IC9,IC10,IC11,IC12,IC13,IC14,IC15,IC16,IC17,IC18,IC19,IC20,IC21,IC22,IC23,HHAS1,HHAS2,HHAS3,HHAS4,MC1,MC2,MC3,TPE1,TPE2,TPE3,TPE4,TPE5,TPE6,TPE7,TPE8,TPE9,PEC1,PEC2,TPE10,TPE11,TPE12,TPE13,LFC1,LFC2,LFC3,LFC4,LFC5,LFC6,LFC7,LFC8,LFC9,LFC10,OCC1,OCC2,OCC3,OCC4,OCC5,OCC6,OCC7,OCC8,OCC9,OCC10,OCC11,OCC12,OCC13,EIC1,EIC2,EIC3,EIC4,EIC5,EIC6,EIC7,EIC8,EIC9,EIC10,EIC11,EIC12,EIC13,EIC14,EIC15,EIC16,OEDC1,OEDC2,OEDC3,OEDC4,OEDC5,OEDC6,OEDC7,EC1,EC2,EC3,EC4,EC5,EC6,EC7,EC8,SEC1,SEC2,SEC3,SEC4,SEC5,AFC1,AFC2,AFC3,AFC4,AFC5,AFC6,VC1,VC2,VC3,VC4,ANC1,ANC2,ANC3,ANC4,ANC5,ANC6,ANC7,ANC8,ANC9,ANC10,ANC11,ANC12,ANC13,ANC14,ANC15,POBC1,POBC2,LSC1,LSC2,LSC3,LSC4,VOC1,VOC2,VOC3,HC1,HC2,HC3,HC4,HC5,HC6,HC7,HC8,HC9,HC10,HC11,HC12,HC13,HC14,HC15,HC16,HC17,HC18,HC19,HC20,HC21,MHUC1,MHUC2,AC1,AC2,CARDPROM,NUMPROM,CARDPM12,NUMPRM12,RAMNTALL,NGIFTALL,CARDGIFT,MINRAMNT,MAXRAMNT,LASTGIFT,TIMELAG,AVGGIFT,CONTROLN,HPHONE_D,RFA_2F,CLUSTER2
0,0,60.000000,5,9,0,0,39,34,18,10,2,1,5,992,264,332,0,35,65,47,53,92,1,0,0,11,0,0,0,0,0,0,0,11,0,0,0,39,48,51,40,50,54,25,31,42,27,11,14,18,17,13,11,15,12,11,34,25,18,26,10,23,18,33,49,28,12,4,61,7,12,19,198,276,97,95,2,2,0,0,7,7,0,479,635,3,2,86,14,96,4,7,38,80,70,32,84,16,6,2,5,9,15,3,17,50,25,0,0,0,2,7,13,27,47,0,1,61,58,61,15,4,2,0,0,14,1,0,0,2,5,17,73,0.0,177.0,682.0,307,318,349,378,12883,13,23,23,23,15,1,0,0,1,4,25,24,26,17,2,0,0,2,28,4,51,1,46,54,3,88,8,0,0,0,0,0,0,4,1,13,14,16,2,45,56,64,50,64,44,62,53,99,0,0,9,3,8,13,9,0,3,9,3,15,19,5,4,3,0,3,41,1,0,7,13,6,5,0,4,9,4,1,3,10,2,1,7,78,2,0,120,16,10,39,21,8,4,3,5,20,3,19,4,0,0,0,18,39,0,34,23,18,16,1,4,0,23,0,0,5,1,0,0,0,0,0,2,0,3,74,88,8,0,4,96,77,19,13,31,5,14,14,31,54,46,0,0,90,0,10,0,0,0,33,65,40,99,99,6,2,10,7,27,74,6,14,240.0,31,14,5.0,12.0,10.0,4,7.741935,95515,0,4,39
1,1,46.000000,6,9,16,0,15,55,11,6,2,1,9,3611,940,998,99,0,0,50,50,67,0,0,31,6,4,2,6,4,14,0,0,2,0,1,4,34,41,43,32,42,45,32,33,46,21,13,14,33,23,10,4,2,11,16,36,22,15,12,1,5,4,21,75,55,23,9,69,4,3,24,317,360,99,99,0,0,0,0,0,0,0,5468,5218,12,10,96,4,97,3,9,59,94,88,55,95,5,4,1,3,5,4,2,18,44,5,0,0,0,97,98,98,98,99,94,0,83,76,73,21,5,0,0,0,4,0,0,0,91,91,91,94,4480.0,13.0,803.0,1088,1096,1026,1037,36175,2,6,2,5,15,14,13,10,33,2,5,2,5,15,14,14,10,32,6,2,66,3,56,44,9,80,14,0,0,0,0,0,0,6,0,2,24,32,12,71,70,83,58,81,57,64,57,99,99,0,22,24,4,21,13,2,1,6,0,4,1,0,3,1,0,6,13,1,2,8,18,11,4,3,4,10,7,11,1,6,2,1,16,69,5,2,160,5,5,12,21,7,30,20,14,24,4,24,10,0,0,0,8,15,0,55,10,11,0,0,2,0,3,1,1,2,3,1,1,0,3,0,0,0,42,39,50,7,27,16,99,92,53,5,10,2,26,56,97,99,0,0,0,96,0,4,0,0,0,99,0,99,99,99,20,4,6,5,12,32,6,13,47.0,3,1,10.0,25.0,25.0,18,15.666667,148535,0,2,1
2,1,61.611649,3,1,2,0,20,29,33,6,8,1,1,7001,2040,2669,0,2,98,49,51,96,2,0,0,2,0,0,0,0,0,0,0,2,0,0,0,35,43,46,37,45,49,23,35,40,25,13,20,19,16,13,10,8,15,14,30,22,19,25,10,23,21,35,44,22,6,2,63,9,9,19,183,254,69,69,1,6,5,3,3,3,0,497,546,2,1,78,22,93,7,18,36,76,65,30,86,14,7,2,5,11,17,3,17,60,18,0,1,0,0,1,6,18,50,0,4,36,49,51,14,5,4,2,24,11,2,3,6,0,2,9,44,0.0,281.0,518.0,251,292,292,340,11576,32,18,20,15,12,2,0,0,1,20,19,24,18,16,2,0,0,1,28,8,31,11,38,62,8,74,22,0,0,0,0,0,2,2,1,21,19,24,6,61,65,73,59,70,56,78,62,82,99,4,10,5,2,6,12,0,1,9,5,18,20,5,7,6,0,11,33,4,3,2,12,3,3,2,0,7,8,3,3,6,7,1,8,74,3,1,120,22,20,28,16,6,5,3,1,23,1,16,6,0,0,0,10,21,0,28,23,32,8,1,14,1,5,0,0,7,0,0,0,0,0,1,0,0,2,84,96,3,0,0,92,65,29,9,22,3,12,23,50,69,31,0,0,0,6,35,44,0,15,22,77,17,97,92,9,2,6,5,26,63,6,14,202.0,27,14,2.0,16.0,5.0,12,7.481481,15078,1,4,60
3,0,70.000000,1,4,2,0,23,14,31,3,0,3,0,640,160,219,0,8,92,54,46,61,0,0,11,32,6,2,0,0,0,0,0,31,0,0,1,32,40,44,34,43,47,25,45,35,20,15,25,17,17,12,7,7,20,17,30,14,19,25,11,23,23,27,50,30,15,8,63,9,6,23,199,283,85,83,3,4,1,0,2,0,2,1000,1263,2,1,48,52,93,7,6,36,73,61,30,84,16,6,3,3,21,12,4,13,36,13,0,0,0,10,25,50,69,92,10,15,42,55,50,15,5,4,0,9,42,4,0,5,1,8,17,34,9340.0,67.0,862.0,386,388,396,423,15130,27,12,4,26,22,5,0,0,4,35,5,6,12,30,6,0,0,5,22,14,26,20,46,54,3,58,36,0,0,0,0,0,6,0,0,17,13,15,0,43,69,81,53,68,45,33,31,0,99,23,17,3,0,6,6,0,0,13,42,12,0,0,0,42,0,6,3,0,0,0,23,3,3,6,0,3,3,3,3,3,0,3,6,87,0,0,120,28,12,14,27,10,3,5,0,19,1,17,0,0,0,0,13,23,0,14,40,31,16,0,1,0,13,0,0,4,0,0,0,3,0,0,0,0,29,67,56,41,3,0,94,43,27,4,38,0,10,19,39,45,55,0,0,45,22,17,0,0,16,23,77,22,93,89,16,2,6,6,27,66,6,14,109.0,16,7,2.0,11.0,10.0,9,6.812500,172556,1,4,41
4,0,78.000000,3,2,60,1,28,9,53,26,3,2,9,2520,627,761,99,0,0,46,54,2,98,0,0,1,0,0,0,0,0,0,0,0,0,0,0,33,45,50,36,46,50,27,34,43,23,14,21,13,15,20,12,5,13,15,34,19,19,31,7,27,16,26,57,36,24,14,42,17,9,33,235,323,99,98,0,0,0,0,0,0,0,576,594,4,3,90,10,97,3,0,42,82,49,22,92,8,20,3,17,9,23,1,1,1,0,21,58,19,0,1,2,16,67,0,2,45,52,53,16,6,0,0,0,9,0,0,0,25,58,74,83,5000.0,127.0,528.0,240,250,293,321,9836,24,29,23,13,4,4,0,0,2,21,30,22,16,4,5,0,0,3,35,8,11,14,20,80,4,73,22,1,1,0,0,0,3,1,2,1,24,27,3,76,61,73,51,65,49,80,31,81,99,10,17,8,2,6,15,3,7,22,2,9,0,7,2,2,0,6,1,5,2,2,12,2,7,6,4,15,29,4,3,26,3,2,7,49,12,1,120,16,20,30,13,3,12,5,2,26,1,20,7,1,1,1,15,28,4,9,16,53,20,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,65,99,0,0,0,90,45,18,25,34,0,1,3,6,33,67,0,0,9,14,72,3,0,0,99,1,21,99,96,6,2,7,11,43,113,10,25,254.0,37,8,3.0,15.0,15.0,14,6.864865,7112,1,2,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95407,1,61.611649,5,9,0,14,36,47,11,7,8,13,9,27380,7252,10037,99,0,0,50,50,78,10,6,4,5,0,0,0,1,1,0,0,3,1,0,2,28,35,38,29,38,41,30,45,37,18,16,31,25,15,8,3,1,20,18,31,18,13,7,3,5,20,32,48,28,10,4,58,15,3,24,195,271,54,38,8,32,24,14,0,0,0,988,1025,6,6,56,44,89,11,3,44,72,56,32,83,17,12,3,10,16,15,8,19,55,5,3,6,0,2,10,49,73,92,0,4,40,52,53,15,4,24,8,13,14,15,12,3,69,84,92,97,380.0,0.0,743.0,433,481,499,535,18807,11,13,13,21,22,13,4,2,2,9,11,11,21,24,16,4,2,2,9,6,70,6,63,37,27,76,15,2,2,0,0,0,5,2,1,2,18,20,2,69,81,89,73,83,69,69,57,61,94,7,15,16,5,10,21,0,3,11,1,11,2,3,3,1,4,6,4,7,3,3,17,7,5,3,1,9,8,7,14,7,8,13,6,59,7,0,136,2,7,28,33,8,15,8,3,26,2,19,8,8,15,2,20,35,5,48,15,11,25,1,5,1,9,0,0,4,1,1,1,0,0,1,1,0,4,26,92,3,2,4,95,60,19,3,14,0,7,32,78,91,9,6,5,86,1,12,0,0,1,93,7,98,99,98,16,4,4,3,6,14,5,12,25.0,1,0,25.0,25.0,25.0,9,25.000000,184568,0,1,12
95408,1,48.000000,7,9,1,0,31,43,19,4,1,0,9,1254,322,361,96,0,4,51,49,91,3,0,2,6,1,0,1,0,0,0,0,5,0,0,1,30,40,40,28,41,43,39,33,42,25,9,19,43,17,7,4,2,10,16,35,23,16,9,2,7,10,20,70,52,25,6,73,4,2,20,307,346,89,88,1,1,0,0,0,0,0,1679,1723,3,3,88,12,97,3,0,63,89,85,60,96,4,2,1,1,7,5,1,28,58,5,2,2,0,18,71,88,91,97,5,1,77,82,75,20,4,1,0,10,7,1,0,5,16,26,44,79,3360.0,201.0,618.0,806,836,802,849,26538,8,9,7,6,11,29,13,2,15,10,0,8,2,13,35,16,3,13,8,5,61,7,83,17,36,80,4,4,4,0,0,0,6,5,3,3,25,32,10,61,73,88,56,87,52,48,43,99,0,0,18,31,0,13,17,0,1,2,4,6,0,3,5,1,8,8,9,3,7,9,13,9,6,0,0,4,7,13,3,4,1,0,4,78,12,0,160,1,6,12,24,7,36,14,9,35,5,32,7,0,0,0,21,31,8,43,5,19,15,1,12,1,14,0,0,4,0,0,1,0,0,0,1,0,2,51,94,3,0,2,99,84,29,4,7,2,55,90,94,94,6,0,0,82,2,16,0,0,0,69,31,67,99,97,18,5,3,2,4,10,3,8,20.0,1,0,20.0,20.0,20.0,9,20.000000,122706,1,1,2
95409,1,60.000000,5,9,0,0,18,46,20,7,23,0,9,552,131,205,99,0,0,53,47,82,14,0,1,9,0,0,0,0,0,0,0,9,0,0,0,28,35,37,30,41,44,32,46,38,17,13,34,21,9,9,9,4,21,17,32,20,10,18,7,17,27,29,44,31,14,5,45,19,5,31,179,268,96,95,1,2,1,0,0,0,0,376,377,4,3,66,34,95,5,10,37,64,43,21,80,20,16,2,14,21,20,9,20,49,12,7,7,1,0,0,0,1,9,0,2,45,51,54,14,5,2,0,0,31,2,0,0,3,34,78,91,4040.0,61.0,551.0,263,264,319,345,12178,21,26,20,18,12,0,3,0,0,26,18,17,11,21,0,6,0,0,10,13,26,26,43,57,3,83,17,0,0,0,0,0,0,0,0,25,17,17,0,69,69,70,69,70,69,77,24,62,0,25,5,13,9,5,22,0,2,14,0,13,9,5,2,0,0,4,14,3,11,0,10,5,2,0,5,6,19,3,19,7,23,0,0,52,18,0,120,5,3,51,23,7,11,0,6,32,4,27,7,0,0,0,9,18,0,46,0,20,20,2,8,0,14,0,0,0,1,0,0,0,0,1,0,0,6,82,92,5,3,0,93,42,12,6,51,0,0,0,0,0,99,0,0,97,0,0,0,0,4,99,0,99,99,99,5,2,3,11,14,33,7,17,58.0,7,4,3.0,10.0,10.0,3,8.285714,189641,1,3,34
95410,0,58.000000,7,9,0,0,28,35,20,9,1,1,7,1746,432,508,99,0,0,47,53,92,1,1,5,8,0,1,2,0,1,0,0,5,0,0,3,34,42,45,36,45,49,25,38,40,22,12,21,21,18,12,7,9,13,16,34,20,17,20,4,16,9,26,65,41,17,6,56,9,8,27,262,324,99,99,0,0,0,0,5,4,1,2421,2459,11,10,88,12,99,1,0,44,85,71,36,84,16,8,2,6,9,12,6,19,56,16,0,0,0,89,96,99,99,99,9,0,90,65,68,18,5,0,0,0,12,0,0,0,88,88,90,91,8735.0,13.0,803.0,552,544,568,556,15948,7,4,11,18,38,15,5,3,0,4,6,15,19,38,13,4,3,0,25,2,46,3,43,57,9,80,11,0,0,0,0,1,2,6,0,24,18,28,11,52,73,88,60,85,57,70,54,99,99,0,14,16,6,16,17,0,2,12,1,11,2,0,2,1,0,2,22,4,6,4,19,4,7,2,4,6,7,9,4,9,1,1,7,72,8,2,140,7,6,20,35,12,15,5,6,29,4,21,10,0,0,0,13,28,1,35,18,20,8,0,3,1,9,0,0,2,6,1,2,0,0,0,0,0,14,50,83,8,4,5,99,85,43,9,25,0,0,6,17,99,1,0,0,99,0,1,0,0,0,99,0,99,99,99,12,3,6,3,36,127,9,31,498.0,41,18,5.0,21.0,18.0,4,12.146341,4693,1,4,11


# Lesson 2

## Recursive feature elimination

In [15]:
from sklearn.feature_selection import RFE
from sklearn import linear_model

# We can use any model, LM, K-NN,...
# For classification problems, the metric used to compare models is Accuracy
# For regression problems, the metric used to compare models is RMSE
#lm = linear_model.LinearRegression()
#rfe = RFE(lm, n_features_to_select=20, verbose=False)
#rfe.fit(X, y)

KeyboardInterrupt: 

In [None]:
# After we run the algorithm, it labels the top features as 1 and the rest are marked in an increasing order of importance. 
df = pd.DataFrame(data = rfe.ranking_, columns=['Rank'])
df['Column_name'] = X.columns
df[df['Rank']==1]

## Activity 2

# Lesson 3

# Lesson 4

## Hypothesis testing

We want to test if our the **sample mean** is not equal to the **population mean** = 80.94. We also know that our **sample** has a size of 25 individuals.

$t = \frac{(\bar{X}-\mu)}{\hat{\sigma}/\sqrt{n}}$

where:

* $\bar{X}$ is the **sample mean**
* $\mu$ is the **population mean**
* $\hat{\sigma}$ is the **sample standard deviation**
* $n$ is the number of measures in our sample

In [16]:
import math

sample_mean = 80.94
pop_mean = 85
sample_std = 11.6
n = 25
statistic = (sample_mean - pop_mean)/(sample_std/math.sqrt(n))
print("Statistic is: ", statistic)

Statistic is:  -1.750000000000001


In [17]:
from scipy import stats
from numpy.random import normal


samples = {}

for i in range(10):
    sample_name = "sample_" + str(i)
    samples[sample_name] = normal(loc = 80.94, scale = 11.6, size = 25)
    sample_mean = "sample_" + str(i) + "_mean"
    samples[sample_mean] = np.mean(samples[sample_name])
    sample_std = "sample_" + str(i) + "_std"
    samples[sample_std] = np.std(samples[sample_name],ddof=1)
    sample_statistic = "sample_" + str(i) + "_t-statistic"
    samples[sample_statistic] = (samples[sample_mean]- pop_mean)/(samples[sample_std]/math.sqrt(n)) 
    print("The t-statistic for the sample {} is: {}".format(i,samples[sample_statistic]))


The t-statistic for the sample 0 is: 0.3376099975615978
The t-statistic for the sample 1 is: -0.8190004859471183
The t-statistic for the sample 2 is: -1.010376051572296
The t-statistic for the sample 3 is: -3.020069246610508
The t-statistic for the sample 4 is: -2.485459597454204
The t-statistic for the sample 5 is: -1.6049801727534176
The t-statistic for the sample 6 is: -0.5963748183060341
The t-statistic for the sample 7 is: -3.262474844084132
The t-statistic for the sample 8 is: -0.9771684709146745
The t-statistic for the sample 9 is: -2.5392166408218877


Now that we have the t-statistic for each random sample, let's make the two tails test. Why two tails? Because we are looking what is the probability that we get a **sample mean** which deviates from the **population mean** more than out t-statistic. We don't care if the our **sample mean** is bigger or smaller than the **population mean**.

Therefore, we can ask ourselves what is the probability of having a deviation within -t and t.


In [18]:
print("Assuming a significance level of 0.05")
print()

for i in range(10):
    sample_name = "sample_" + str(i)
    print("The p-value of sample {} is: {:-5.3}".format(i,stats.ttest_1samp(samples[sample_name],85)[1]))
    if ( stats.ttest_1samp(samples[sample_name],85)[1] < 0.05 ):
        print("Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample {} given Ho.".format(i))
    print()

Assuming a significance level of 0.05

The p-value of sample 0 is: 0.739

The p-value of sample 1 is: 0.421

The p-value of sample 2 is: 0.322

The p-value of sample 3 is: 0.00592
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 3 given Ho.

The p-value of sample 4 is: 0.0203
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 4 given Ho.

The p-value of sample 5 is: 0.122

The p-value of sample 6 is: 0.557

The p-value of sample 7 is: 0.0033
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 7 given Ho.

The p-value of sample 8 is: 0.338

The p-value of sample 9 is: 0.018
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 9 given Ho.



# 7.04 Activity 1
    The datasets are provided in the files_for_activities folder.

    For the example covered in the lab for the last lesson, using the test statistics and then using the p-value, do you reject the null hypothesis or you fail to reject the null hypothesis?

    Here is the question again for reference: It is assumed that the mean systolic blood pressure is μ = 120 mm Hg. In the Honolulu Heart Study, a sample of n = 100 people had an average systolic blood pressure of 130.1 mm Hg with a standard deviation of 21.21 mm Hg. Is the group significantly different (with respect to systolic blood pressure!) from the regular population?

    Set up the hypothesis test.

H0 = No difference between the means
Ha = Difference between the means



In [19]:
sample_mean = 130.1
pop_mean = 120
sample_std = 21.21
n = 100
statistic = (sample_mean - pop_mean)/(sample_std/math.sqrt(n))
print("Statistic is: ", statistic)

Statistic is:  4.761904761904759


In [20]:
samples = {}

for i in range(10):
    sample_name = "sample_" + str(i)
    samples[sample_name] = normal(loc = 130.1, scale = 21.21, size = 100)
    sample_mean = "sample_" + str(i) + "_mean"
    samples[sample_mean] = np.mean(samples[sample_name])
    sample_std = "sample_" + str(i) + "_std"
    samples[sample_std] = np.std(samples[sample_name],ddof=1)
    sample_statistic = "sample_" + str(i) + "_t-statistic"
    samples[sample_statistic] = (samples[sample_mean]- pop_mean)/(samples[sample_std]/math.sqrt(n)) 
    print("The t-statistic for the sample {} is: {}".format(i,samples[sample_statistic]))

The t-statistic for the sample 0 is: 5.98462689354867
The t-statistic for the sample 1 is: 5.85252530073868
The t-statistic for the sample 2 is: 4.621378423964536
The t-statistic for the sample 3 is: 4.0456431527584735
The t-statistic for the sample 4 is: 3.9285683301612733
The t-statistic for the sample 5 is: 4.26381491926478
The t-statistic for the sample 6 is: 4.765136462820944
The t-statistic for the sample 7 is: 3.1190040623282207
The t-statistic for the sample 8 is: 4.784422357298407
The t-statistic for the sample 9 is: 4.9237139602392395


In [21]:
print("Assuming a significance level of 0.05")
print()

for i in range(10):
    sample_name = "sample_" + str(i)
    print("The p-value of sample {} is: {:-5.3}".format(i,stats.ttest_1samp(samples[sample_name],120)[1]))
    if ( stats.ttest_1samp(samples[sample_name],120)[1] < 0.05 ):
        print("Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample {} given Ho.".format(i))
    print()

Assuming a significance level of 0.05

The p-value of sample 0 is: 3.48e-08
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 0 given Ho.

The p-value of sample 1 is: 6.29e-08
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 1 given Ho.

The p-value of sample 2 is: 1.15e-05
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 2 given Ho.

The p-value of sample 3 is: 0.000104
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 3 given Ho.

The p-value of sample 4 is: 0.000158
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 4 given Ho.

The p-value of sample 5 is: 4.6e-05
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 5 given Ho.

The p-value of sample 6 is: 6.48e-06
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 6 given Ho.

The p-value of sample 7 is: 0.00238
Therefore

# 7.04 Activity 3

List down all the steps in conducting a hypothesis test, and see if you are able to set up the test yourself. You are free to make any assumptions in your test as you would like.

Elements needed:

    > Null hypothesis
    > Alternate hypothesis
    > Level of significance
    > Test statistic
    > P-value

Assuming that: samples are drawns from a population which is normally distributed

    Step 1: Defining the null hypothesis - This is our assumption about the population. It is defined by H0 and in this case H0: mu = X
    Step 2: Defining the Alternate hypothesis - This means what if our assumption is not true. It is defined by Ha and in this case Ha: mu != X
    Step 3: Decide a test statistics based on the information available (t-test or z-test)
    Step 4: Level of significance: This defines the rejection region / critical region. It is defined by the greek letter 'alpha'. Usually it is 0.05.

In [22]:
sample_mean1 = 105.5
sample_std1 = 20.1
n1 = 34
sample_mean2 = 90.9
sample_std2 = 12.2
n2 = 29
pooled_sample_std = math.sqrt(((n1-1)*sample_std1**2 + (n2-1)*sample_std2**2)/(n1+n2-2))
statistic = (sample_mean1-sample_mean2)/(pooled_sample_std*math.sqrt((1/n1)+(1/n2)))
print("T Statistic is: ", statistic)

T Statistic is:  3.4101131776909535


In [23]:
t = statistic

In [24]:
print(t)

3.4101131776909535


In [25]:
from scipy.stats import t

In [26]:
# Using python to find the p value and critical value
print("P value is: ", 1- t.cdf(statistic,n1+n2-2))
print("Critical Value of z is: ", t.ppf(0.025, n1+n2-2)) #alpha is 0.05

P value is:  0.0005783712704483523
Critical Value of z is:  -1.9996235841149783


# Lab | Inferential statistics
    Instructions
    It is assumed that the mean systolic blood pressure is μ = 120 mm Hg. In the Honolulu Heart Study, a sample of n = 100 people had an average systolic blood pressure of 130.1 mm Hg with a standard deviation of 21.21 mm Hg. Is the group significantly different (with respect to systolic blood pressure!) from the regular population?

    > Set up the hypothesis test.
    > Write down all the steps followed for setting up the test.
    > Calculate the test statistic by hand and also code it in Python. It should be 4.76190. We will take a look at how to make decisions based on this calculated value.
    If you finished the previous question, please go through the code for principal_component_analysis_example provided in the files_for_lab folder .

## Set up the hypothesis test
    H0 = no significant difference between the groups
    Ha = significant difference between the groups

## Write down all the steps followed for setting up the test.

Elements needed:

    > Null hypothesis
    > Alternate hypothesis
    > Level of significance
    > Test statistic
    > P-value

Assuming that: samples are drawns from a population which is normally distributed

    Step 1: Defining the null hypothesis - This is our assumption about the population. It is defined by H0 and in this case H0: mu = X
    Step 2: Defining the Alternate hypothesis - This means what if our assumption is not true. It is defined by Ha and in this case Ha: mu != X
    Step 3: Decide a test statistics based on the information available (t-test or z-test)
    Step 4: Level of significance: This defines the rejection region / critical region. It is defined by the greek letter 'alpha'. Usually it is 0.05.

## Calculate the test statistic by hand and also code it in Python. It should be 4.76190. We will take a look at how to make decisions based on this calculated value.

In [27]:
sample_mean = 130.1
pop_mean = 120
sample_std = 21.21
n = 100
statistic = (sample_mean - pop_mean)/(sample_std/math.sqrt(n))
print("Statistic is: ", statistic)

Statistic is:  4.761904761904759


In [28]:
samples = {}

for i in range(10):
    sample_name = "sample_" + str(i)
    samples[sample_name] = normal(loc = 130.1, scale = 21.21, size = 100)
    sample_mean = "sample_" + str(i) + "_mean"
    samples[sample_mean] = np.mean(samples[sample_name])
    sample_std = "sample_" + str(i) + "_std"
    samples[sample_std] = np.std(samples[sample_name],ddof=1)
    sample_statistic = "sample_" + str(i) + "_t-statistic"
    samples[sample_statistic] = (samples[sample_mean]- pop_mean)/(samples[sample_std]/math.sqrt(n)) 
    print("The t-statistic for the sample {} is: {}".format(i,samples[sample_statistic]))

The t-statistic for the sample 0 is: 3.30597656469128
The t-statistic for the sample 1 is: 5.104366405444165
The t-statistic for the sample 2 is: 5.153408939246276
The t-statistic for the sample 3 is: 3.8652457468721506
The t-statistic for the sample 4 is: 5.74694577988679
The t-statistic for the sample 5 is: 5.423931285230123
The t-statistic for the sample 6 is: 8.054129460504724
The t-statistic for the sample 7 is: 4.998613137925215
The t-statistic for the sample 8 is: 5.111089546774605
The t-statistic for the sample 9 is: 4.245643414506095


In [30]:
print("Assuming a significance level of 0.05")
print()

for i in range(10):
    sample_name = "sample_" + str(i)
    print("The p-value of sample {} is: {:-5.3}".format(i,stats.ttest_1samp(samples[sample_name],120)[1]))
    if ( stats.ttest_1samp(samples[sample_name],120)[1] < 0.05 ):
        print("Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample {} given Ho.".format(i))
    print()

Assuming a significance level of 0.05

The p-value of sample 0 is: 0.00132
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 0 given Ho.

The p-value of sample 1 is: 1.61e-06
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 1 given Ho.

The p-value of sample 2 is: 1.31e-06
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 2 given Ho.

The p-value of sample 3 is: 0.000198
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 3 given Ho.

The p-value of sample 4 is: 1.01e-07
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 4 given Ho.

The p-value of sample 5 is: 4.13e-07
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 5 given Ho.

The p-value of sample 6 is: 1.84e-12
Therefore we discard the null hypothesis Ho, as it's very unlikely to get sample 6 given Ho.

The p-value of sample 7 is: 2.5e-06
Therefore

# Lab | Inferential statistics - T-test & P-value
    Instructions
    1 - We will have another simple example on two sample t test (pooled- when the variances are equal). But this time this is a one sided t-test
    In a packing plant, a machine packs cartons with jars. It is supposed that a new machine will pack faster on the average than the machine currently used. To test that hypothesis, the times it takes each machine to pack ten cartons are recorded. The results, in seconds, are shown in the tables in the file files_for_lab/machine.txt. Assume that there is sufficient evidence to conduct the t test, does the data provide sufficient evidence to show if one machine is better than the other

## Hypothesis Test

Do the data provide sufficient evidence to conclude that, on the average, the new machine packs faster? Perform the required hypothesis test at the 5% level of significance using the rejection region approach.

In [45]:
data = pd.read_csv('machine2.txt', sep="\t")

In [46]:
display(data)

Unnamed: 0,New machine,Old machine
0,42.1,42.7
1,41.0,43.6
2,41.3,43.8
3,41.8,43.3
4,42.4,42.5
5,42.8,43.5
6,43.2,43.1
7,42.3,41.7
8,41.8,44.0
9,42.7,44.1


In [47]:
from scipy.stats import ttest_ind

In [51]:
stats.ttest_ind(data['New machine'], data['Old machine'])

Ttest_indResult(statistic=-3.3972307061176026, pvalue=0.0032111425007745158)

In [75]:
sample_mean1 = np.mean(data['New machine'])
sample_std1 = np.std(data['New machine'])
n1 = 10
sample_mean2 = np.mean(data['Old machine'])
sample_std2 = np.std(data['Old machine'])
n2 = 10

print("The mean of Sample 1:", round(sample_mean1,2))
print("The std of Sample 1:", round(sample_std1,2))
print("The mean of Sample 2:", round(sample_mean2,2))
print("The std of Sample 2:", round(sample_std2, 2))

The mean of Sample 1: 42.14
The std of Sample 1: 0.65
The mean of Sample 2: 43.23
The std of Sample 2: 0.71


In [76]:
pooled_sample_std = math.sqrt(((n1-1)*sample_std1**2 + (n2-1)*sample_std2**2)/(n1+n2-2))
statistic = (sample_mean1-sample_mean2)/(pooled_sample_std*math.sqrt((1/n1)+(1/n2)))

print("Pooled Standard Deviation is:", pooled_sample_std)
print("T Statistic is: ", statistic)

Pooled Standard Deviation is: 0.6806247130394253
T Statistic is:  -3.5809955894645813


In [71]:
# Using python to find the p value and critical value
print("P value is: ", 1- t.cdf(statistic,n1+n2-2))
print("Critical Value of z is: ", t.ppf(0.025, n1+n2-2)) #alpha is 0.05

P value is:  0.9989322065983063
Critical Value of z is:  -2.10092204024096


### Answer
    Are these independent samples? Yes, since the samples from the two machines are not related.

    Are these large samples or a normal population?

    We have  n1 < 30 and n2 < 30. We do not have large enough samples, and thus we need to check the normality assumption from both populations. Let's take a look at the normality plots for this data:

    From the normal probability plots, we conclude that both populations may come from normal distributions. Remember the plots do not indicate that they DO come from a normal distribution. It only shows if there are clear violations. We should proceed with caution.

    Do the populations have equal variance? No information allows us to assume they are equal. We can use our rule of thumb to see if they are “close.” They are not that different as s1/s2 = 0.683/0.750  is quite close to 1. This assumption does not seem to be violated.

    We can thus proceed with the pooled t-test.

    Let  denote the mean for the new machine and  denote the mean for the old machine.

    The null hypothesis is that there is no difference in the two population means, i.e.

    The alternative is that the new machine is faster, i.e.

    The significance level is 5%. Since we may assume the population variances are equal, we first have to calculate the pooled standard deviation:

    The test statistic is:

    The alternative is left-tailed so the critical value is the value  such that , with  degrees of freedom. The critical value is -1.7341. The rejection region is .

    Our test statistic, -3.3978, is in our rejection region, therefore, we reject the null hypothesis. With a significance level of 5%, we reject the null hypothesis and conclude there is enough evidence to suggest that the new machine is faster than the old machine.

![title](https://online.stat.psu.edu/stat500/sites/stat500/files/inline-images/500%20l7%20ex%20pack%20machines%20normality%20m1.png)

![title](https://online.stat.psu.edu/stat500/sites/stat500/files/inline-images/500%20l7%20compare%20pack%20normality%20plot%20of%20old%20mach.png)

https://online.stat.psu.edu/stat500/lesson/7/7.3/7.3.1/7.3.1.1

### Exercise 2

    2 - An additional problem (not mandatory): In this case we can't assume that the population variances are equal. Hence in this case we cannot pool the variances. Independent random samples of 17 sophomores and 13 juniors attending a large university yield the following data on grade point averages. Data is provided in the file student_gpa.txt. At the 5% significance level, do the data provide sufficient evidence to conclude that the mean GPAs of sophomores and juniors at the university differ?

    Test statistics can be calculated as: link to the image - Test statistics calculation for Unpooled Variance Case

    Degrees of freedom is (n1-1)+(n2-1).

Independent random samples of 17 sophomores and 13 juniors attending a large university yield the following data on grade point averages (student_gpa.txt): [1]

    Sophomores
    3.04	2.92	2.86	1.71	3.60
    3.49	3.30	2.28	3.11	2.88
    2.82	2.13	2.11	3.03	3.27
    2.60	3.13	
    
    Juniors
    2.56	3.47	2.65	2.77	3.26
    3.00	2.70	3.20	3.39	3.00
    3.19	2.58	2.98	 	 

In [80]:
data2 = pd.read_csv('student_gpa2.txt', sep="\t")

In [81]:
display(data2)

Unnamed: 0,Sophomores,Juniors
0,3.04,2.56
1,1.71,2.77
2,3.3,2.7
3,2.88,3.0
4,2.11,2.98
5,2.6,3.47
6,2.92,3.26
7,3.6,3.2
8,2.28,3.19
9,2.82,2.65


In [83]:
data2['Sophomores']

0     3.04
1     1.71
2     3.30
3     2.88
4     2.11
5     2.60
6     2.92
7     3.60
8     2.28
9     2.82
10    3.03
11    3.13
12    2.86
13    3.49
14    3.11
15    2.13
16    3.27
Name: Sophomores, dtype: float64

In [86]:
data2['Juniors'][:13]

0     2.56
1     2.77
2     2.70
3     3.00
4     2.98
5     3.47
6     3.26
7     3.20
8     3.19
9     2.65
10    3.00
11    3.39
12    2.58
Name: Juniors, dtype: float64

In [87]:
stats.ttest_ind(data2['Sophomores'], data2['Juniors'][:13])

Ttest_indResult(statistic=-0.864325455323425, pvalue=0.39475359666695975)

In [88]:
sample_mean3 = np.mean(data2['Sophomores'])
sample_std3 = np.std(data2['Sophomores'])
n3 = 17
sample_mean4 = np.mean(data2['Juniors'][:13])
sample_std4 = np.std(data2['Juniors'][:13])
n4 = 13

print("The mean of Sophomores:", round(sample_mean3,2))
print("The std of Sophomores:", round(sample_std3,2))
print("The mean of Juniors:", round(sample_mean4,2))
print("The std of Juniors:", round(sample_std4, 2))

The mean of Sophomores: 2.84
The std of Sophomores: 0.5
The mean of Juniors: 2.98
The std of Juniors: 0.3


In [89]:
pooled_sample_std = math.sqrt(((n3-1)*sample_std3**2 + (n4-1)*sample_std4**2)/(n3+n4-2))
statistic = (sample_mean3-sample_mean4)/(pooled_sample_std*math.sqrt((1/n3)+(1/n4)))

print("Pooled Standard Deviation is:", pooled_sample_std)
print("T Statistic is: ", statistic)

Pooled Standard Deviation is: 0.42798025956891655
T Statistic is:  -0.892728724944096


In [71]:
# Using python to find the p value and critical value
print("P value is: ", 1- t.cdf(statistic,n1+n2-2))
print("Critical Value of z is: ", t.ppf(0.025, n1+n2-2)) #alpha is 0.05

P value is:  0.9989322065983063
Critical Value of z is:  -2.10092204024096


In [None]:
stats.ttest_ind(data['New machine'], data['Old machine'])

sample_mean1 = np.mean(data['New machine'])
sample_std1 = np.std(data['New machine'])
n1 = 10
sample_mean2 = np.mean(data['Old machine'])
sample_std2 = np.std(data['Old machine'])
n2 = 10

print("The mean of Sample 1:", round(sample_mean1,2))
print("The std of Sample 1:", round(sample_std1,2))
print("The mean of Sample 2:", round(sample_mean2,2))
print("The std of Sample 2:", round(sample_std2, 2))

pooled_sample_std = math.sqrt(((n1-1)*sample_std1**2 + (n2-1)*sample_std2**2)/(n1+n2-2))
statistic = (sample_mean1-sample_mean2)/(pooled_sample_std*math.sqrt((1/n1)+(1/n2)))

print("Pooled Standard Deviation is:", pooled_sample_std)
print("T Statistic is: ", statistic)

# Using python to find the p value and critical value
print("P value is: ", 1- t.cdf(statistic,n1+n2-2))
print("Critical Value of z is: ", t.ppf(0.025, n1+n2-2)) #alpha is 0.05

### Answer
    The response variable is GPA and is quantitative. The explanatory variable is class standing (sophomores or juniors) is categorical. The two populations are independent. Since we don't have large samples from both populations, we need to check the normal probability plots of the two samples:
    Normal Probability Plot of Sophomores
    Normality plot of the grade point averages of the sophomores.

    Normal Probability Plot of Juniors
    Normality plot of grade point averages of the juniors.

    There is no indication that there is a violation of the normal assumption for both samples. As before, we should proceed with caution.

    Now, we need to determine whether to use the pooled t-test or the non-pooled (separate variances) t-test. The summary statistics are:

    The standard deviations are 0.520 and 0.3093 respectively; both the sample sizes are small, and the standard deviations are quite different from each other. We, therefore, decide to use an unpooled t-test.

    The null and alternative hypotheses are:

    H0: μ1−μ2=0 vs Ha μ1−μ2 ≠ 0
    The significance level is 5%. Perform the 2-sample t-test in Minitab with the appropriate alternative hypothesis.

    Remember, the default for the 2-sample t-test in Minitab is the non-pooled one. Minitab generates the following output.

    Two sample T for sophomores vs juniors
        N	Mean	StDev	SE Mean
    sophomore	17	2.840	0.52	0.13
    junior	13	2.981	0.309	0.086
    95% CI for mu sophomore - mu juniors: (-0.45, 0.173)

    T-Test mu sophomore = mu juniors (Vs no =): T = -0.92

    P = 0.36 DF = 26

    Since the p-value of 0.36 is larger than α = 0.05, we fail to reject the null hypothesis.

    At 5% level of significance, the data does not provide sufficient evidence that the mean GPAs of sophomores and juniors at the university are different.

    Find a 95% confidence interval for the difference between the mean GPA of Sophomores and the mean GPA of Juniors using Minitab.
    Answer
    Minitab output:
    95% CI for mu sophomore- mu juniors is;
    (-0.45, 0.173)

    Interpreting the result:
    We are 95% confident that the difference between the mean GPA of sophomores and juniors is between -0.45 and 0.173.

    Note! When entering values into the 'Samples in different columns' input boxes, Minitab always subtracts the second value (column entered second) from the first value (column entered first).

![title](https://online.stat.psu.edu/stat500/sites/stat500/files/inline-images/500%20l7%20ex%20normality%20plot%20of%20sophomores.png)

![title](https://online.stat.psu.edu/stat500/sites/stat500/files/inline-images/500%20l7%20normality%20plot%20of%20juniors%20gpa.png)

https://online.stat.psu.edu/stat500/book/export/html/581