## 1. Loading data in a Python environment

In [None]:
!pip install researchpy

In [52]:
import pandas as pd 
import researchpy as rp
import scipy.stats as stats
import matplotlib.pyplot as plt
import numpy as np



In [53]:
# Location of the raw CSV; kept in one place so it is easy to change.
DATA_PATH = "/content/dataset w7.csv"

# Read the dataset and preview its first ten rows.
df = pd.read_csv(DATA_PATH)
df.head(10)


Unnamed: 0,Gender,Test preparation,Total Marks
0,male,none,14
1,female,none,28
2,female,none,18
3,female,none,48
4,female,none,21
5,female,completed,40
6,male,none,30
7,female,none,12
8,male,none,18
9,male,none,24


In [54]:
# Summary statistics (count, mean, std, quartiles) for the numeric column(s).
df.describe()

Unnamed: 0,Total Marks
count,28.0
mean,32.321429
std,12.45452
min,12.0
25%,22.5
50%,33.0
75%,43.25
max,50.0


In [55]:
# Column dtypes and non-null counts, to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Gender            28 non-null     object
 1   Test preparation  28 non-null     object
 2   Total Marks       28 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 800.0+ bytes


## 2. Checking whether female and male students scored the same marks

In [56]:
# Null Hypothesis H0 = Female and male students scored the same marks (equal mean marks)
# Alternate Hypothesis H1 = Female and male students didn't score the same marks (different mean marks)

In [57]:
# Partition the students by the 'Gender' column so the two groups can be compared.
is_male = df['Gender'].eq('male')
is_female = df['Gender'].eq('female')
male_df = df.loc[is_male]
female_df = df.loc[is_female]

In [58]:
# Independent two-sample t-test on 'Total Marks', male vs. female
# (SciPy defaults: two-sided, equal variances assumed).
male_marks = male_df['Total Marks']
female_marks = female_df['Total Marks']
stats.ttest_ind(male_marks, female_marks)

Ttest_indResult(statistic=-0.6940885268848668, pvalue=0.4937822317960763)

In [59]:
# As the p-value obtained (0.4938) is greater than 0.05, we fail to reject the hypothesis that male and female students scored the same marks on average; the observed difference is not statistically significant

In [60]:
# Same male-vs-female comparison via researchpy, which returns a pair of tables:
# a descriptive summary per group and the t-test results with effect sizes.
rp.ttest(
    group1=male_df['Total Marks'],
    group1_name="Male",
    group2=female_df['Total Marks'],
    group2_name="Female",
)

(   Variable     N       Mean         SD        SE  95% Conf.   Interval
 0      Male  12.0  30.416667  12.168950  3.512873  22.684885  38.148448
 1    Female  16.0  33.750000  12.865976  3.216494  26.894205  40.605795
 2  combined  28.0  32.321429  12.454520  2.353683  27.492070  37.150787,
               Independent t-test  results
 0  Difference (Male - Female) =   -3.3333
 1          Degrees of freedom =   26.0000
 2                           t =   -0.6941
 3       Two side test p value =    0.4938
 4      Difference < 0 p value =    0.2469
 5      Difference > 0 p value =    0.7531
 6                   Cohen's d =   -0.2651
 7                   Hedge's g =   -0.2573
 8               Glass's delta =   -0.2739
 9                 Pearson's r =    0.1349)

In [61]:
# Unpack researchpy's two result tables and print only the descriptive summary.
male_marks = male_df['Total Marks']
female_marks = female_df['Total Marks']
summary, results = rp.ttest(
    group1=male_marks,
    group1_name="Male",
    group2=female_marks,
    group2_name="Female",
)
print(summary)

   Variable     N       Mean         SD        SE  95% Conf.   Interval
0      Male  12.0  30.416667  12.168950  3.512873  22.684885  38.148448
1    Female  16.0  33.750000  12.865976  3.216494  26.894205  40.605795
2  combined  28.0  32.321429  12.454520  2.353683  27.492070  37.150787


## 3. Checking whether test preparation helps the students

In [62]:
# Null Hypothesis H0: Test preparation does not help students (mean marks of the 'completed' group are not higher than those of the 'none' group)

In [63]:
# Alternate Hypothesis H1: Test preparation helps students (mean marks of the 'completed' group are higher than those of the 'none' group)

In [64]:
# Partition the students by the 'Test preparation' column ('completed' vs. 'none').
prep_status = df['Test preparation']
testcompleted_df = df.loc[prep_status.eq('completed')]
testnone_df = df.loc[prep_status.eq('none')]

In [65]:
# Independent t-test on 'Total Marks': students who completed test preparation
# vs. those who did not. Returns a descriptive summary and the test results.
# NOTE(review): the group SDs differ markedly (about 3.4 vs. 11.7) and the group
# sizes are unequal (7 vs. 21), so the equal-variance assumption looks doubtful.
# Consider Welch's t-test (presumably `equal_variances=False` — confirm against
# the researchpy documentation) and re-check the conclusion.
rp.ttest(
    group1=testcompleted_df['Total Marks'],
    group1_name="Completed",
    group2=testnone_df['Total Marks'],
    group2_name="Not Completed",
)

(        Variable     N       Mean         SD        SE  95% Conf.   Interval
 0      Completed   7.0  44.571429   3.359422  1.269742  41.464482  47.678375
 1  Not Completed  21.0  28.238095  11.661495  2.544747  22.929846  33.546344
 2       combined  28.0  32.321429  12.454520  2.353683  27.492070  37.150787,
                           Independent t-test  results
 0  Difference (Completed - Not Completed) =   16.3333
 1                      Degrees of freedom =   26.0000
 2                                       t =    3.6144
 3                   Two side test p value =    0.0013
 4                  Difference < 0 p value =    0.9994
 5                  Difference > 0 p value =    0.0006
 6                               Cohen's d =    1.5774
 7                               Hedge's g =    1.5315
 8                           Glass's delta =    4.8619
 9                             Pearson's r =    0.5783)

In [66]:
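# The two-sided p-value (0.0013) is less than alpha = 0.05, and so is the one-sided p-value for Difference > 0 (0.0006), which matches the directional hypothesis; thus we can reject the null hypothesis
# and say that students who completed test preparation scored significantly higher on average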