# 3. H1: The experimental group will get better grades. Hypothesis testing with Python

* **author** = Diego Sapunar-Opazo
* **copyright** = Copyright 2019, Thesis M.Sc. Diego Sapunar - Pontificia Universidad Católica de Chile
* **credits** = Diego Sapunar-Opazo, Ronald Perez, Mar Perez-Sanagustin, Jorge Maldonado-Mahauad
* **maintainer** = Diego Sapunar-Opazo
* **email** = dasapunar@uc.cl
* **status** = Dev

## Part 0: Import Packages

In [2]:
# data analysis and wrangling
import pandas as pd
import numpy as np

# stats
from scipy.stats import ttest_ind

## Part 1: Getting the Data

In [5]:
# Cleaned inputs: section assignment and GPA per student.
df_sec, df_GPA = (
    pd.read_csv('../../data/clean_data/students_sec.csv'),
    pd.read_csv('../../data/clean_data/students_GPA.csv'),
)

# Final inputs: cluster membership and course performance per student.
df_clusters, df_performance = (
    pd.read_csv('../../data/final_data/students_clusters.csv'),
    pd.read_csv('../../data/final_data/performance.csv'),
)

## Part 2: Hypothesis testing

In [11]:
def t_test(data1, data2, variables=None):
    """Run an independent two-sample t-test per column and tabulate the results.

    For every name in ``variables`` this prints and records the mean and
    standard deviation (pandas defaults) of that column in each frame,
    followed by the t statistic and p-value returned by
    ``scipy.stats.ttest_ind`` called with its default settings.

    Missing values are dropped from each column before the test. pandas'
    ``mean``/``std`` already skip NaN, whereas ``ttest_ind`` would propagate
    them and report NaN for both t and p; dropping them keeps all reported
    statistics based on the same observations.

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        The two groups to compare; both must contain every requested column.
    variables : str or iterable of str, optional
        Column name(s) to test. ``None`` or an empty iterable produces an
        empty result; a single string is treated as one column name.

    Returns
    -------
    pandas.DataFrame
        One row per variable with the columns
        ``item, mean1, std1, mean2, std2, t, p``.
    """
    if variables is None:
        variables = []
    elif isinstance(variables, str):
        # A bare string would otherwise be iterated character by character.
        variables = [variables]

    data = {'item': [], 'mean1': [], 'std1': [], 'mean2': [], 'std2': [], 't': [], 'p': []}
    for var in variables:
        data['item'].append(var)
        print('VARIABLE: ', var)

        # Drop NaN once so the descriptives and the test share one sample.
        sample1 = data1[var].dropna()
        sample2 = data2[var].dropna()

        data1_mean = sample1.mean()
        data2_mean = sample2.mean()
        data['mean1'].append(data1_mean)
        data['mean2'].append(data2_mean)
        print("data1 mean value:", data1_mean)
        print("data2 mean value:", data2_mean)

        data1_std = sample1.std()
        data2_std = sample2.std()
        data['std1'].append(data1_std)
        data['std2'].append(data2_std)
        print("data1 std value:",data1_std)
        print("data2 std value:",data2_std)

        ttest, pval = ttest_ind(sample1, sample2)
        data['t'].append(ttest)
        data['p'].append(pval)
        print("t-value",ttest)
        print("p-value",pval)
        print()

    return pd.DataFrame(data)
        

In [12]:
# Every graded item available in the merged data. Kept under its own name so
# the full battery can be re-enabled with `variables = ALL_VARIABLES` instead
# of being silently discarded by the assignment below.
ALL_VARIABLES = ['I1', 'I2', 'I3', 'FC1', 'FC2', 'FC3', 'FC4',
                 'FC5', 'FC6', 'FC7', 'FC8', 'FC9', 'FC10', 'FC11',
                 'report', 'workshops', 'lecture_participation', 'NF', 'GPA']

# Columns actually tested in this run of the notebook.
variables = ['GPA']

### (2.1) Sections

In [6]:
# Attach performance and GPA columns to each student's section record,
# joining on the student identifier.
df = (
    df_sec
    .merge(df_performance, on='num_alumno')
    .merge(df_GPA, on='num_alumno')
)

In [7]:
# Preview the first rows of the merged section / performance / GPA frame.
df.head()

Unnamed: 0,num_alumno,sec,I1,I2,I3,FC1,FC2,FC3,FC4,FC5,...,FC7,FC8,FC9,FC10,FC11,report,workshops,lecture_participation,NF,GPA
0,14633000,1,4.6,4.9,5.2,6.2,6.5,7.0,6.4,6.2,...,5.8,5.2,3.3,3.3,4.8,6.2,3.3,5.9,5.3,4.85
1,15639703,1,3.7,5.3,5.8,5.4,5.7,6.2,5.6,6.4,...,5.8,4.2,5.0,5.5,5.8,7.0,4.3,1.0,5.5,5.01
2,13636456,1,5.1,4.9,4.1,6.4,6.1,5.6,5.1,5.8,...,5.8,4.7,5.4,4.9,5.2,4.7,6.3,1.0,5.0,4.65
3,14637421,1,4.8,5.2,5.6,5.9,7.0,6.5,6.8,5.0,...,5.1,5.0,5.0,4.5,4.8,6.6,5.3,5.9,5.9,5.02
4,16622359,1,4.5,5.3,5.4,5.0,5.0,5.8,6.0,5.7,...,5.2,6.3,2.3,4.3,4.6,6.6,4.8,1.0,5.1,5.2


In [8]:
# Number of rows in the merged frame.
len(df)

211

In [13]:
# Compare section 1 against section 2 on the selected variables.
data1 = df.loc[df['sec'].eq(1)]
data2 = df.loc[df['sec'].eq(2)]
aux = t_test(data1, data2, variables)
# aux.to_csv('../../results/h1_sections.csv', index=False)

VARIABLE:  GPA
data1 mean value: 5.108404255319149
data2 mean value: 5.018888888888889
data1 std value: 0.3867323911276232
data2 std value: 0.44090771920917593
t-value 1.5473206466486475
p-value 0.12329880915990875



### (2.2) Clusters

In [14]:
# Add each student's cluster label to the merged frame.
df_c = df.merge(df_clusters, on='num_alumno')

#### (2.2.1) EFFORT (a.k.a Blue)

In [15]:
# Cluster 0: section 1 versus section 2.
data1 = df_c.loc[df_c['sec'].eq(1) & df_c['cluster'].eq(0)]
data2 = df_c.loc[df_c['sec'].eq(2) & df_c['cluster'].eq(0)]
aux = t_test(data1, data2, variables)
# aux.to_csv('../../results/h1_cluster0.csv', index=False)

VARIABLE:  GPA
data1 mean value: 5.097391304347826
data2 mean value: 5.039534883720931
data1 std value: 0.27102863638208124
data2 std value: 0.295408945671461
t-value 0.7796505944875136
p-value 0.4384702806529722



#### (2.2.2) PRIOR KNOWLEDGE (a.k.a Orange)

In [16]:
# Cluster 1: section 1 versus section 2.
data1 = df_c.loc[df_c['sec'].eq(1) & df_c['cluster'].eq(1)]
data2 = df_c.loc[df_c['sec'].eq(2) & df_c['cluster'].eq(1)]
aux = t_test(data1, data2, variables)
# aux.to_csv('../../results/h1_cluster1.csv', index=False)

VARIABLE:  GPA
data1 mean value: 5.5825000000000005
data2 mean value: 5.576190476190477
data1 std value: 0.3126604851070381
data2 std value: 0.3359683942051126
t-value 0.0621703814076328
p-value 0.9507446850419785



#### (2.2.3) NO EFFORT (a.k.a Green)

In [17]:
# Cluster 2: section 1 versus section 2.
data1 = df_c.loc[df_c['sec'].eq(1) & df_c['cluster'].eq(2)]
data2 = df_c.loc[df_c['sec'].eq(2) & df_c['cluster'].eq(2)]
aux = t_test(data1, data2, variables)
# aux.to_csv('../../results/h1_cluster2.csv', index=False)

VARIABLE:  GPA
data1 mean value: 4.900487804878049
data2 mean value: 4.736
data1 std value: 0.2821165647344391
data2 std value: 0.31814204213583436
t-value 2.46350224303494
p-value 0.015931974940320254

