In [1]:
# load data from woolridge
# DataSet BWGHT contains 1388 observations on birth weight and other characteristics of babies born in North Carolina in 1988.
#   1. faminc                   1988 family income, $1000s
#   2. cigtax                   cig. tax in home state, 1988
#   3. cigprice                 cig. price in home state, 1988
#   4. bwght                    birth weight, ounces
#   5. fatheduc                 father's yrs of educ
#   6. motheduc                 mother's yrs of educ
#   7. parity                   birth order of child
#   8. male                     =1 if male child
#   9. white                    =1 if white
#  10. cigs                     cigs smked per day while preg
#  11. lbwght                   log of bwght
#  12. bwghtlbs                 birth weight, pounds
#  13. packs                    packs smked per day while preg
#  14. lfaminc                  log(faminc)

import wooldridge

df = wooldridge.data('BWGHT')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1388 entries, 0 to 1387
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   faminc    1388 non-null   float64
 1   cigtax    1388 non-null   float64
 2   cigprice  1388 non-null   float64
 3   bwght     1388 non-null   int64  
 4   fatheduc  1192 non-null   float64
 5   motheduc  1387 non-null   float64
 6   parity    1388 non-null   int64  
 7   male      1388 non-null   int64  
 8   white     1388 non-null   int64  
 9   cigs      1388 non-null   int64  
 10  lbwght    1388 non-null   float64
 11  bwghtlbs  1388 non-null   float64
 12  packs     1388 non-null   float64
 13  lfaminc   1388 non-null   float64
dtypes: float64(9), int64(5)
memory usage: 151.9 KB


In [2]:
# Find the average and standard deviation of fatheduc and motheduc and calculate the Z scores values of 18 (years) for fatheduc and motheduc.
# Z score is a numerical measurement that describes a value's relationship to the mean of a group of values.
# Z score is measured in terms of standard deviations from the mean.

fatheduc_average = df['fatheduc'].mean()
fatheduc_std = df['fatheduc'].std()
fatheduc_zscore = (18 - fatheduc_average) / fatheduc_std


motheduc_average = df['motheduc'].mean()
motheduc_std = df['motheduc'].std()
motheduc_zscore = (18 - motheduc_average) / motheduc_std

print(f"fatheduc_average: {fatheduc_average}, fatheduc_std: {fatheduc_std}, fatheduc_zscore: {fatheduc_zscore}")
print(f"motheduc_average: {motheduc_average}, motheduc_std: {motheduc_std}, motheduc_zscore: {motheduc_zscore}")

fatheduc_average: 13.186241610738255, fatheduc_std: 2.745984682114012, fatheduc_zscore: 1.7530172038526615
motheduc_average: 12.935832732516221, motheduc_std: 2.3767283698410244, motheduc_zscore: 2.130730348383275


In [3]:
# Perform T-test on average education of fatheduc and motheduc.

# A t test is a statistical test that is used to compare the means of two groups. 
# A t test can only be used when comparing the means of two groups 

# What type of t-test should we use?
#   1. One sample t-test: compare the mean of a sample to a known value
#   2. Independent samples t-test: compare the means of two unrelated groups
#   3. Paired sample t-test: compare means from the same group at different times (say, one year apart)

# We will use independent samples t-test
group1 = df['fatheduc']
group2 = df['motheduc']

from scipy.stats import ttest_ind
ttest_ind(group1, group2)

TtestResult(statistic=nan, pvalue=nan, df=nan)

In [4]:
ttest_ind(group1, group2, equal_var=False)


TtestResult(statistic=nan, pvalue=nan, df=nan)