In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
acl = pd.read_csv('AustinCityLimits.csv')

In [4]:
def chisq_test(actual, expected):
    """The inputs have to be arrays of equal dimensions
    """
    chi2 = np.sum(np.square(actual - expected) / expected)
    return chi2
  

def expected_table_values(table):
    """`table` must be a pandas dataframe (the best way of creating the dataframe for 
        our purposes here is with pandas.crosstab)
    """
    total = table.sum().sum()
    row_totals = table.sum()
    col_totals = table.sum(axis=1)
    
    expected = np.repeat(col_totals.values, table.shape[1]).reshape(table.shape) * row_totals.values / total
    
    return expected    

## Pre-Lab
1. Are there an equal number of male and female performers on Austin City Limits?
2. Are male performers just as likely to have had a Top 10 hit as female performers?

### 1

Remember that THE null hypothesis of the goodness of fit test, is, that the distribution of the categories are equal. The alternative hypothesis states that the distributions are different. It is the standard anyway, I suppose there may be ocasions on which this is not the case.

In [4]:
# We are testing the distribution of gender here, therefore 2 categories. 
actual_dist = acl.Gender.value_counts().values
expected_dist = np.full(2, len(acl) / 2)

chi2_prelab1 = chisq_test(actual_dist, expected_dist)


print('chi^2 goodness of fit:', chi2_prelab1)

# REJECT

chi^2 goodness of fit: 18.24137931034483


### 2
In an Independence chi squared test the **null hypothesis states** that the variables are  **independent**. The **alternative hypothesis states** that the variables are **not independent**.

In [0]:
count_table = pd.crosstab(acl.Gender, acl['BB.wk.top10'])

In [6]:
print('independence chi^2:', chisq_test(count_table.values, expected_table_values(count_table)))

# FAIL TO REJECT.

independence chi^2: 0.7002283753981873


scipy by default calculates the contingency chi2 with correction, hence, correction=False which gives the result that is expected in the grader (I got it wrong, should've trusted my funcs)





In [7]:
from scipy.stats import chi2_contingency

chi2_contingency(count_table, correction=False)

(0.7002283753981873, 0.40270696744709167, 1, array([[16.98058252, 16.01941748],
        [36.01941748, 33.98058252]]))

## Lab
1. Are each of the four musical genres equally represented on Austin City Limits?   
2. Are some genres more likely to draw a large (100K+) Twitter following than others?

### 1

In [8]:
expected_genre_counts = np.full(4, len(acl) / 4)
observed_genre_counts = acl.Genre.value_counts().values

chisq_test(observed_genre_counts, expected_genre_counts)

chi2_lab1 = chisq_test(observed_genre_counts, expected_genre_counts)

print('chi2:', chi2_lab1)

# REJECT. The difference of the distributions of the genres are statistically significant.

chi2: 70.41379310344827


### 2

In [9]:
contingency_tab = pd.crosstab(acl.Genre, acl['Twitter.100k'])
contingency_tab

Twitter.100k,0.0,1.0
Genre,Unnamed: 1_level_1,Unnamed: 2_level_1
Country,11,6
Jazz/Blues,9,2
Rock/Folk/Indie,33,26
Singer-Songwriter,6,10


In [10]:
print('contingency chi2:', chisq_test(contingency_tab.values, expected_table_values(contingency_tab)))

# FAIL TO REJECT

contingency chi2: 5.691892115012296


In [11]:
chi2_contingency(contingency_tab, correction=False)

(5.691892115012296, 0.12760135359813357, 3, array([[ 9.73786408,  7.26213592],
        [ 6.30097087,  4.69902913],
        [33.7961165 , 25.2038835 ],
        [ 9.16504854,  6.83495146]]))

## Problem Set

### Q1
You want to know if the proportion of female performers on Austin City Limits Live has changed in the past two years. 
1. Create a new variable in the dataset called "Recent" that is equal to a 1 for rows from years 2012 or 2013 and is equal to 0 for all other rows.
2. Make a table that shows the number of male and female performers in "recent" and non-recent years.


In [0]:
acl['Recent'] = (acl.Year == 2012) | (acl.Year == 2013)

In [25]:
contingency_tab_q1 = pd.crosstab(acl.Gender, acl['Recent'])
contingency_tab_q1

Recent,False,True
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
F,23,12
M,65,16


In [15]:
expected_table_values(contingency_tab_q1)

array([[26.55172414,  8.44827586],
       [61.44827586, 19.55172414]])

In [17]:
indep_chisq_q1 = chisq_test(contingency_tab_q1.values, expected_table_values(contingency_tab_q1))

print('chi2:', indep_chisq_q1)

# Fail to reject

chi2: 2.8187635997159806


In [21]:
from scipy.stats import chisquare

#p-value
chi2_contingency(contingency_tab_q1)[1]

0.14914306537135258

### Q2
When crossing white and yellow summer squash, a genetic model predicts that 75% of resulting offspring will be white, 15% will be yellow and 10% will be green. 
Below are the results from an experiment run on a random sample of 205 squash offspring. 

In [23]:
# respectively: white, yellow, green
expected_q2 = np.full(3, 205) * np.array([.75, .15, .1])
observed_q2 = np.array([152, 39, 14]) # these are the results of the experiment

chi2_q2 = chisq_test(observed_q2, expected_q2)

print('chi2:', chi2_q2)

# fail to reject

chi2: 4.2943089430894315


### Q3

In [26]:
data_q3 = pd.DataFrame({'Id': np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 
                                        11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]),
                        
                        'Gender': np.array(['M', 'M', 'F', 'M', 'F', 'F', 'F', 'M', 'F', 'F', 
                                            'M', 'F', 'M', 'M', 'F', 'M', 'M', 'F', 'F', 'M', 'F']),
                        
                        'DominantHand': np.array(['L', 'R', 'R', 'R', 'R', 'L', 'L', 'R', 'R', 'R', 
                                                  'L', 'R', 'R', 'R', 'R', 'R', 'R', 'R', 'L', 'R', 'R'])
                                        
                       })
data_q3

Unnamed: 0,DominantHand,Gender,Id
0,L,M,1
1,R,M,2
2,R,F,3
3,R,M,4
4,R,F,5
5,L,F,6
6,L,F,7
7,R,M,8
8,R,F,9
9,R,F,10


In [28]:
contingency_tab_q3 = pd.crosstab(data_q3.Gender, data_q3.DominantHand)
contingency_tab_q3

DominantHand,L,R
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1
F,3,8
M,2,8


In [29]:
expected_table_values(contingency_tab_q3)

# Can't run the analysis, the left hand expected counts for both genders are less than 5

array([[2.61904762, 8.38095238],
       [2.38095238, 7.61904762]])

### Q4
A telephone survey asked a random sample of Indiana voters about their home internet usage, as well as what type of community (rural, suburban or urban) they lived in. 
Of the 123 survey respondents, 28 were from rural areas, 42 were from suburban areas, and 53 were from urban areas.  Thirteen rural respondents, 35 suburban respondents, and 50 urban respondents said they had access to internet at home. 

** Expected counts ** \\

---
| CommType |  Internet: no | Internet: yes |
| -----------------  |  ------------------- | ------------------ |
| rural               | 5.69                  | 22.30             |
| suburban     | 8.53                 | 33.46              |
| urban            | 10.77                |  42.22           | 




In [0]:
observed_q4 = np.array([[15, 13], [7, 35], [3, 50]])
expected_q4 = np.array([[5.69, 22.30], [8.53, 33.46], [10.77, 42.22]])

In [32]:
# chi2
chisq_test(observed_q4, expected_q4)

# REJECT. The community type is related with having or not internet.

26.49614083182542