In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import cluster
import matplotlib.pyplot as plt
from scipy.spatial import distance
from mpl_toolkits.mplot3d import Axes3D
# Let pandas render up to 4000 rows before truncating displayed frames.
pd.set_option("display.max_rows", 4000)

In [2]:
# Load the dataset analysed in this notebook (path is relative to the working directory).
data = pd.read_csv("data/data_for_analysis.csv")
# Remove sign: overwrite the column with its absolute values.
data['IntensityMeanAbsVelocity'] = data['IntensityMeanAbsVelocity'].abs()

In [3]:
# Preview the first five rows of the loaded frame.
data.head()

Unnamed: 0,author_last_name,author_first_name,recording_url,birth_year,spoken_word,cave_canem_indicator,region,city_of_birth,state,country,...,PauseRate,PauseDutyCycle,MeanPauseDuration,ComplexityAllPauses,ComplexitySyllables,ComplexityPhrases,IntensitySegmentMeanSD,IntensityMeanAbsVelocity,IntensityMeanAbsAccel,poet_full_name
0,Adams,Samantha,,1996,False,False,Midwest,,Wisconsin,,...,1.346801,0.487093,0.361667,9.739216,15.261978,4.193274,13.801149,149.1868,120.134374,Adams Samantha
1,Adams,Samantha,,1996,False,False,Midwest,,Wisconsin,,...,0.826322,0.400841,0.485091,6.404722,14.183274,3.091935,14.306229,159.312214,132.202711,Adams Samantha
2,Adisa,Opal Palmer,,1954,False,True,Caribbean,Kingston,,Jamaica,...,0.92143,0.281922,0.305962,6.968303,13.079815,2.672056,4.096043,277.537265,305.414392,Adisa Opal Palmer
3,Adisa,Opal Palmer,,1954,False,True,Caribbean,Kingston,,Jamaica,...,0.607203,0.268844,0.442759,5.259354,12.131653,2.768081,3.45894,191.883661,196.840812,Adisa Opal Palmer
4,Alexander,Elizabeth,,1962,False,True,Northeast,Harlem,New York,USA,...,0.69245,0.252505,0.364654,5.517232,13.599243,2.288253,3.513307,205.459264,210.177286,Alexander Elizabeth


In [4]:
# List every column label in `data`.
data.columns

Index(['author_last_name', 'author_first_name', 'recording_url', 'birth_year',
       'spoken_word', 'cave_canem_indicator', 'region', 'city_of_birth',
       'state', 'country', 'childhood_note', 'undergrad_study_indicator',
       'graduate_study_indicator', 'iowa_mfa?', 'public_private_indicator',
       'Ivy', 'hbcu', 'year_of_recording', 'recording', 'source_of_recording',
       'cave_canem_indicator.1', 'poet_laureate', 'major_award', 'venue_type',
       'slam_(1)_or_not_(0)?', 'venue', 'poem_title', 'URLs ', 'Dynamism',
       'f0Mean', 'f0Range2sd', 'f0Entropy', 'f0MeanAbsVelocity',
       'f0MeanAbsAccel', 'PauseCount', 'PauseRate', 'PauseDutyCycle',
       'MeanPauseDuration', 'ComplexityAllPauses', 'ComplexitySyllables',
       'ComplexityPhrases', 'IntensitySegmentMeanSD',
       'IntensityMeanAbsVelocity', 'IntensityMeanAbsAccel', 'poet_full_name'],
      dtype='object')

# Major Award | Spoken Word

## Contingency Table

In [8]:
# Count recordings by spoken-word status (rows) and major-award status (columns);
# margins=True appends the "All" row and column totals.
pd.crosstab(
    index=data["spoken_word"],
    columns=data["major_award"],
    margins=True,
)

major_award,0,1,All
spoken_word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,65,52,117
True,70,14,84
All,135,66,201


## P(Major Award | Spoken Word)

In [21]:
# Normalise each row of the contingency table to sum to 1, so every row reads
# as P(major_award | spoken_word).
condtional_probability_df = pd.crosstab(
    index=data["spoken_word"],
    columns=data["major_award"],
    normalize="index",
)
condtional_probability_df

major_award,0,1
spoken_word,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.555556,0.444444
True,0.833333,0.166667


In [23]:
# Sanity check: 65 of the 117 spoken_word == False rows have major_award == 0
# (counts taken from the contingency table), which should match the 0.555556
# entry of the row-normalised table.
65 / 117

0.5555555555555556

## Odds of Winning Major Award Conditional on Spoken Word
First row: $P(award = 1 | word = 0) / P(award = 0 | word = 0)$.

Second row: $P(award = 1 | word = 1) / P(award = 0 | word = 1)$.

In [28]:
# Odds of a major award per row: P(award = 1 | row) / P(award = 0 | row).
condtional_probability_df.loc[:, 1] / condtional_probability_df.loc[:, 0]

spoken_word
False    0.8
True     0.2
dtype: float64

## Odds Ratio ($OR$)
The odds ratio is literally the ratio of the two odds we just computed. However, it isn't as easy to interpret as it appears.
Some folks (especially those in public health/epidemiology) like to use the odds ratio to measure the association between a treatment and an outcome. 
The odds ratio compares the odds of the outcome (award) between the exposed (Spoken Word = True) and unexposed (Spoken Word = False) groups; below it is computed as odds(exposed) / odds(unexposed). 
When $OR = 1$, the odds of the outcome are the same regardless of the treatment, so there is no association. 
When $OR$ is either close to 0 or much larger than 1, there is a strong association between the treatment and the outcome.

In [29]:
# Odds ratio for spoken word: odds of a major award when spoken_word is True
# divided by the odds when spoken_word is False.
# Recomputed from `data` rather than dividing hand-copied literals (0.2 / 0.8),
# so the value tracks the data and does not depend on which table
# `condtional_probability_df` currently holds.
spoken_word_probs = pd.crosstab(data["spoken_word"], data["major_award"], normalize="index")
spoken_word_odds = spoken_word_probs[1] / spoken_word_probs[0]
# float(...) keeps the displayed result a plain Python float.
float(spoken_word_odds.loc[True] / spoken_word_odds.loc[False])

0.25

# Major Award | Ivy League

In [33]:
# Count recordings by Ivy status (rows) and major-award status (columns),
# with "All" totals appended.
# NOTE(review): this table totals 197 rows versus 201 in the other contingency
# tables — presumably rows with a missing `Ivy` value are excluded; confirm.
pd.crosstab(
    index=data["Ivy"],
    columns=data["major_award"],
    margins=True,
)

major_award,0,1,All
Ivy,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,109,48,157
1.0,23,17,40
All,132,65,197


In [35]:
# Normalise each row to sum to 1, so every row reads as P(major_award | Ivy).
# Note: this rebinds `condtional_probability_df` to the Ivy table.
condtional_probability_df = pd.crosstab(
    index=data["Ivy"],
    columns=data["major_award"],
    normalize="index",
)
condtional_probability_df

major_award,0,1
Ivy,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,0.694268,0.305732
1.0,0.575,0.425


In [38]:
# Odds of a major award per row: P(award = 1 | row) / P(award = 0 | row).
condtional_probability_df.loc[:, 1] / condtional_probability_df.loc[:, 0]

Ivy
0.0    0.440367
1.0    0.739130
dtype: float64

In [43]:
# Odds ratio for Ivy: odds of a major award when Ivy == 1.0 divided by the
# odds when Ivy == 0.0.
# Recomputed from `data` rather than dividing literals copied from the printed
# odds (0.739130 / 0.440367): those are rounded to six digits, which makes the
# ratio inexact and leaves it stale if the data changes.
ivy_probs = pd.crosstab(data["Ivy"], data["major_award"], normalize="index")
ivy_odds = ivy_probs[1] / ivy_probs[0]
# float(...) keeps the displayed result a plain Python float.
float(ivy_odds.loc[1.0] / ivy_odds.loc[0.0])

1.678440936764108

# Major Award | Cave Canem

In [40]:
# Count recordings by Cave Canem status (rows) and major-award status (columns);
# margins=True appends the "All" row and column totals.
pd.crosstab(
    index=data["cave_canem_indicator"],
    columns=data["major_award"],
    margins=True,
)

major_award,0,1,All
cave_canem_indicator,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,77,20,97
True,58,46,104
All,135,66,201


In [41]:
# Normalise each row to sum to 1, so every row reads as
# P(major_award | cave_canem_indicator).
# Note: this rebinds `condtional_probability_df` to the Cave Canem table.
condtional_probability_df = pd.crosstab(
    index=data["cave_canem_indicator"],
    columns=data["major_award"],
    normalize="index",
)
condtional_probability_df

major_award,0,1
cave_canem_indicator,Unnamed: 1_level_1,Unnamed: 2_level_1
False,0.793814,0.206186
True,0.557692,0.442308


In [42]:
# Odds of a major award per row: P(award = 1 | row) / P(award = 0 | row).
condtional_probability_df.loc[:, 1] / condtional_probability_df.loc[:, 0]

cave_canem_indicator
False    0.259740
True     0.793103
dtype: float64

In [45]:
# Odds ratio for Cave Canem: odds of a major award when cave_canem_indicator
# is True divided by the odds when it is False.
# Recomputed from `data` rather than dividing literals copied from the printed
# odds (0.793103 / 0.259740): those are rounded to six digits, which makes the
# ratio inexact and leaves it stale if the data changes.
cave_canem_probs = pd.crosstab(
    data["cave_canem_indicator"], data["major_award"], normalize="index"
)
cave_canem_odds = cave_canem_probs[1] / cave_canem_probs[0]
# float(...) keeps the displayed result a plain Python float.
float(cave_canem_odds.loc[True] / cave_canem_odds.loc[False])

3.0534496034496033

# Correlations?
We could construct confidence intervals for these $OR$'s, but that would require careful deliberation over the relationships among these factors and the underlying causal graph; otherwise, we may introduce different kinds of bias. Furthermore, controlling for bias is not as simple as controlling for as many factors as we can think of (e.g., throwing all of them into a logistic regression model). See the following link if you would like some references.
https://catalogofbias.org/biases/collider-bias/ 
I don't think we should go down the rabbit hole in this paper.

My suggestion is that, since each odds ratio seems to be "far" away from 1, we can say "the $OR$ suggests that xxx factor could be correlated with the chance of getting a major award." It's fair to say that these $OR$'s indicate that the relationships between these factors and the outcome are worth studying.