https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html#scipy.stats.pearsonr
https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pointbiserialr.html 

Both the word/n-gram counts (x) and the citation count (y) should be normalized by dividing by document length, giving a 0-1 measure (very small for y!). Each correlation between the citation count and a predictor (count or score) should be calculated separately, so that multicollinearity does not distort the results.

Visualization: a heat map with one row per discipline and one column per count/score. There would be one heat map per dictionary, and we could stack them one on top of the other. Each would look something like this, but with 5 columns (word count, word2vec, doc2vec, GloVe, and InferSent).

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats

In [2]:
# Load the per-article n-gram/author counts together with subject metadata.
df = pd.read_csv('./counts_and_subject.csv')

# Number of leading characters removed from article_id to obtain the join key.
# The article_id values shown in this notebook start with "journal-article-",
# which is 16 characters long.
ARTICLE_ID_PREFIX_LENGTH = 16

df = df.assign(
    # Alias so that "{}_author_count" patterns also resolve for "culture".
    culture_author_count=df["cultural_author_count"],
    edited_filename=df["article_id"].map(
        lambda article_id: article_id[ARTICLE_ID_PREFIX_LENGTH:]
    ),
)
df.columns

  interactivity=interactivity, compiler=compiler, result=result)


Index(['article_id', 'culture_ngram_count', 'culture_ngram_count.1',
       'relational_ngram_count', 'relational_ngram_count.1',
       'demographic_ngram_count', 'demographic_ngram_count.1', 'word_count',
       'cultural_author_count', 'demographic_author_count',
       'relational_author_count', 'primary_subject', 'year',
       'culture_author_count', 'edited_filename'],
      dtype='object')

In [3]:
# Per-article cosine scores (doc2vec / word2vec / GloVe columns, see output).
embeddings_path = '../../models_storage/word_embeddings_data/text_with_cosine_scores_wdg_aug16.csv'
embeddings_df = pd.read_csv(embeddings_path)
embeddings_df.columns

Index(['Unnamed: 0', 'Unnamed: 0_x', 'Unnamed: 0.1', 'filename_x',
       'edited_filename', 'culture', 'demographic', 'relational',
       'Unnamed: 0_y', 'filename_y', 'text', 'relational_doc2vec_cosine',
       'demographic_doc2vec_cosine', 'culture_doc2vec_cosine',
       'relational_word2vec_cosine', 'demographic_word2vec_cosine',
       'culture_word2vec_cosine', 'relational_glove_cosine',
       'demographic_glove_cosine', 'culture_glove_cosine'],
      dtype='object')

In [4]:
# Add cosine scores
# Attach the cosine scores to each article row. A left join keeps every row
# of df even when embeddings_df has no matching edited_filename.
df = pd.merge(df, embeddings_df, how='left', on='edited_filename')
df.head()

Unnamed: 0,article_id,culture_ngram_count,culture_ngram_count.1,relational_ngram_count,relational_ngram_count.1,demographic_ngram_count,demographic_ngram_count.1,word_count,cultural_author_count,demographic_author_count,...,text,relational_doc2vec_cosine,demographic_doc2vec_cosine,culture_doc2vec_cosine,relational_word2vec_cosine,demographic_word2vec_cosine,culture_word2vec_cosine,relational_glove_cosine,demographic_glove_cosine,culture_glove_cosine
0,journal-article-10.2307_2065002,9,3,11,5,10,7,3529,0,0,...,toward more cumulative inquiry e university m...,0.097787,0.014634,0.028077,0.666242,0.659942,0.644007,0.734771,0.692074,0.580011
1,journal-article-10.2307_3380821,10,6,27,7,7,5,5195,0,0,...,a jefferson county department health w duncm u...,0.012444,0.037296,0.008317,0.67587,0.610163,0.53278,0.6999,0.605508,0.448152
2,journal-article-10.2307_2095822,57,6,61,24,26,61,7100,0,0,...,society a model j university urbana-champaign ...,0.010615,0.080786,0.079067,0.688963,0.669807,0.646967,0.77896,0.756689,0.629216
3,journal-article-10.2307_2631839,2,0,4,0,0,0,315,0,0,...,vol no printed usa litigation a note colle...,0.022005,0.033838,0.031417,0.662404,0.618887,0.53729,0.720018,0.644975,0.527974
4,journal-article-10.2307_40836133,74,52,91,28,30,7,7110,0,0,...,mir special issue pp mir management interna...,0.00265,0.035686,0.065617,0.784473,0.732319,0.678748,0.802714,0.738454,0.588695


In [5]:
# Discard rows with a missing value in any column, reporting the row count
# before and after.
rows_before = len(df)
df = df.dropna()
print(rows_before, len(df), sep="\n")

69658
69657


In [6]:
perspectives = ["culture", "relational", "demographic"]


def divide_by_length(pattern, new_pattern):
    """Add length-normalized columns to the global ``df``, one per perspective.

    For every name in the global ``perspectives`` list, the column
    ``pattern.format(name)`` is divided element-wise by ``df["word_count"]``
    and the quotient is stored in ``df`` as ``new_pattern.format(name)``.

    Parameters
    ----------
    pattern : str
        Format string naming the existing columns, e.g. "{}_ngram_count".
    new_pattern : str
        Format string naming the columns to create, e.g. "{}_ngram_ratio".

    Returns
    -------
    None
        The global ``df`` is modified in place.
    """
    for name in perspectives:
        source_column = pattern.format(name)
        target_column = new_pattern.format(name)
        df[target_column] = df[source_column].div(df["word_count"])

In [7]:
# (count-column pattern, ratio-column pattern) pairs, normalized in this order.
for count_pattern, ratio_pattern in [
    ("{}_ngram_count", "{}_ngram_ratio"),
    ("{}_ngram_count.1", "{}_ngram_ratio_core"),
    ("{}_author_count", "{}_author_ratio"),
]:
    divide_by_length(count_pattern, ratio_pattern)

In [8]:
# Columns carried into the statistics table: the two metadata columns first,
# then one group of per-perspective columns for each column-name pattern.
score_column_patterns = [
    '{}',
    '{}_word2vec_cosine',
    '{}_doc2vec_cosine',
    '{}_glove_cosine',
    '{}_ngram_ratio',
]
stats_labels = ['year', 'primary_subject']
for column_pattern in score_column_patterns:
    stats_labels.extend(column_pattern.format(p) for p in perspectives)

stats_df = df.loc[:, stats_labels]

In [9]:
# Map raw column names to display labels of the form "<Method>: <perspective>".
label_patterns = [
    ('{}', "InferSent: {}"),
    ('{}_word2vec_cosine', "Word2Vec: {}"),
    ('{}_doc2vec_cosine', "Doc2Vec: {}"),
    ('{}_glove_cosine', "GloVe: {}"),
    ('{}_ngram_ratio', "Word Count: {}"),
]
rename_dict = {}
for p in perspectives:
    for column_pattern, label_pattern in label_patterns:
        rename_dict[column_pattern.format(p)] = label_pattern.format(p)

def clean_year(s):
    """Convert a publication year to an offset in years from 1970.

    Parameters
    ----------
    s : int or str
        Either a value ``int()`` accepts directly (e.g. ``1999`` or
        ``"1999"``), or a string whose first four characters are the year
        (e.g. ``"1999-05-01"``).

    Returns
    -------
    int
        The parsed year minus 1970.

    Raises
    ------
    TypeError, ValueError
        If neither ``s`` nor its first four characters can be parsed as an
        integer.
    """
    try:
        return int(s) - 1970
    except (TypeError, ValueError):
        # Only parse failures trigger the fallback. A bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        return int(s[:4]) - 1970

def format_subject(s):
    """Encode a primary-subject label as a 0/1 indicator.

    Returns 0 for 'Sociology' and 1 for
    'Management & Organizational Behavior'. Any other value is printed and
    ``None`` is returned.
    """
    if s == 'Sociology':
        return 0
    if s == 'Management & Organizational Behavior':
        return 1
    # Unrecognized subject: echo it so it is visible in the cell output.
    print(s)
    return None
    
    
# Apply the display labels, convert year to an offset from 1970 and add the
# 0/1 discipline indicator. Re-running this cell shifts `year` again, so run
# it once per kernel.
stats_df = stats_df.rename(columns=rename_dict)
stats_df = stats_df.assign(
    year=stats_df['year'].apply(clean_year),
    **{"Management/OB": stats_df['primary_subject'].apply(format_subject)}
)
stats_df.columns

Index(['year', 'primary_subject', 'InferSent: culture',
       'InferSent: relational', 'InferSent: demographic', 'Word2Vec: culture',
       'Word2Vec: relational', 'Word2Vec: demographic', 'Doc2Vec: culture',
       'Doc2Vec: relational', 'Doc2Vec: demographic', 'GloVe: culture',
       'GloVe: relational', 'GloVe: demographic', 'Word Count: culture',
       'Word Count: relational', 'Word Count: demographic', 'Management/OB'],
      dtype='object')

In [10]:
# Drop primary_subject and put the discipline indicator and year first,
# followed by each method's three perspective columns.
method_names = ['InferSent', 'Word2Vec', 'Doc2Vec', 'GloVe', 'Word Count']
perspective_order = ['culture', 'relational', 'demographic']
ordered_columns = ['Management/OB', 'year'] + [
    '{}: {}'.format(method, perspective)
    for method in method_names
    for perspective in perspective_order
]
stats_df = stats_df[ordered_columns]

In [16]:
def calculate_pvalues(df):
    """Return the matrix of Pearson-correlation p-values for all column pairs.

    Rows containing any missing value are dropped first, then only numeric
    (and boolean) columns are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        Square float frame whose index and columns are the retained column
        names. Entry ``.loc[c, r]`` is the p-value returned by
        ``scipy.stats.pearsonr(df[r], df[c])``, rounded to 4 decimal places.
    """
    numeric = df.dropna().select_dtypes(include=['number', 'bool'])
    columns = numeric.columns
    pvalues = pd.DataFrame(index=columns, columns=columns, dtype=float)
    for r in columns:
        for c in columns:
            # Label-based assignment. The chained form ``pvalues[r][c] = ...``
            # can write to a temporary object and leave ``pvalues`` untouched.
            pvalues.loc[c, r] = round(stats.pearsonr(numeric[r], numeric[c])[1], 4)
    return pvalues

# Write the pairwise p-value table to p_values.csv in the working directory.
pvalue_table = calculate_pvalues(stats_df)
pvalue_table.to_csv('p_values.csv')

In [None]:
# Short headers for the printed tables: a method abbreviation followed by the
# first letter of the perspective.
abbreviation_patterns = [
    ('InferSent: {}', "IS_{}"),
    ("Word2Vec: {}", "w2v_{}"),
    ('Doc2Vec: {}', "d2v_{}"),
    ('GloVe: {}', "GloVe_{}"),
    # NOTE(review): "wv" may be a typo for "wc" (word count) — confirm before changing.
    ('Word Count: {}', "wv_{}"),
]
top_labels_dict = {"Management/OB": "Mgt/OB"}
for p in perspectives:
    for long_label, short_label in abbreviation_patterns:
        top_labels_dict[long_label.format(p)] = short_label.format(p[0])

# Summary statistics without the count and quartile rows, with the 50% row
# relabelled "median" and the abbreviated column headers applied.
descriptive = (
    stats_df.describe()
    .drop(["count", "25%", "75%"])
    .rename({"50%": "median"})
    .rename(top_labels_dict, axis=1)
)

print(descriptive.to_latex(float_format=lambda x: '%10.2f' % x))


In [None]:
# Pairwise correlation matrix of the statistics table (Pearson is the default).
correlations = stats_df.corr(method='pearson')


In [None]:
# Blank out everything above the diagonal so only the lower triangle prints.
# Uses numpy directly: the `pd.np` alias was deprecated in pandas 1.0 and
# removed in pandas 2.0.
lower_triangle = np.tril(np.ones(correlations.shape), k=0).astype(bool)
correlations = correlations.where(lower_triangle, other='')
print(correlations.to_latex(float_format=lambda x: '%10.2f' % x, header=False, na_rep=''))

In [None]:
# Render the descriptive-statistics table as HTML with two-decimal floats.
descriptive_html = descriptive.to_html(float_format=lambda x: '%10.2f' % x)
print(descriptive_html)

In [None]:
# Render the correlation table as HTML without a header row.
correlations_html = correlations.to_html(
    float_format=lambda x: '%10.2f' % x, header=False, na_rep=''
)
print(correlations_html)

In [None]:
def cell_entries(df, X_pattern, Y_pattern="{}_author_count", binary=True):
    """Correlate one predictor column with one outcome column per perspective.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the predictor and outcome columns.
    X_pattern : str
        Format string for the predictor column, filled with each name in the
        global ``perspectives`` list.
    Y_pattern : str, optional
        Format string for the outcome column, filled the same way. Defaults
        to "{}_author_count".
    binary : bool, optional
        If True, the outcome is reduced to the indicator ``Y > 0`` and
        ``scipy.stats.pointbiserialr`` is used. Otherwise
        ``scipy.stats.pearsonr`` is applied to the raw outcome.

    Returns
    -------
    list
        One scipy result per perspective, in ``perspectives`` order. Element
        0 of each result is the correlation and element 1 is the p-value.
    """
    results = []
    for perspective in perspectives:
        X = df[X_pattern.format(perspective)]
        # Honor the caller-supplied pattern. It was previously ignored in
        # favor of a hard-coded "{}_author_count".
        Y = df[Y_pattern.format(perspective)]
        if binary:
            results.append(stats.pointbiserialr(X, Y > 0))
        else:
            results.append(stats.pearsonr(X, Y))
    return results
        

In [None]:
def draw_heatmaps(sociology_data, management_data, x_axis_labels):
    """Show one annotated heat map per perspective and print its p-values.

    Parameters
    ----------
    sociology_data, management_data : sequence
        One item per entry of the global ``perspectives`` list. Each item is
        a sequence of results, one per column of the heat map, where index 0
        is the value plotted and index 1 is the p-value printed.
    x_axis_labels : sequence of str
        Column tick labels for the heat map.

    Returns
    -------
    None
        Each figure is shown with ``plt.show()`` and the matching p-values
        are printed as a two-row list (Sociology first, then Management).
    """
    discipline_labels = ["Sociology", "Management"]
    for sociology_values, management_values, p in zip(sociology_data, management_data, perspectives):
        discipline_rows = (sociology_values, management_values)
        p_values = [[entry[1] for entry in row] for row in discipline_rows]
        coefficients = [[entry[0] for entry in row] for row in discipline_rows]

        # NOTE(review): the color scale is pinned to [0, 1], so negative
        # coefficients are only distinguishable through the annotations.
        ax = sns.heatmap(
            coefficients,
            vmin=0,
            vmax=1,
            annot=True,
            xticklabels=x_axis_labels,
            yticklabels=discipline_labels,
        )
        ax.set(title="Correlation coefficients: {}".format(p), xlabel='Method', ylabel='Discipline')
        plt.show()
        print(p_values)
    

In [None]:
# ngrams, word2vec, GloVe
# doc2vec, InferSent
x_columns = ["{}_ngram_ratio", "{}_word2vec_cosine", "{}_doc2vec_cosine", "{}_glove_cosine", "{}"]
x_labels = ["word counts", "word2vec", "doc2vec", 'GloVe', "InferSent"]

# Per-discipline subsets of the article table. No visible cell defines these
# frames, so a fresh-kernel "Run All" would otherwise stop here with a
# NameError.
# NOTE(review): the labels are the two that format_subject recognizes —
# confirm this matches the split originally used.
sociology_df = df[df["primary_subject"] == "Sociology"]
management_df = df[df["primary_subject"] == "Management & Organizational Behavior"]

# cell_entries returns one list per method; zip(*...) regroups the results
# into one tuple per perspective, which is what draw_heatmaps iterates over.
sociology_data_by_perspective = list(zip(*[cell_entries(sociology_df, pattern) for pattern in x_columns]))
management_data_by_perspective = list(zip(*[cell_entries(management_df, pattern) for pattern in x_columns]))

draw_heatmaps(sociology_data_by_perspective, management_data_by_perspective, x_labels)

In [None]:
# ngrams, word2vec, GloVe
# doc2vec, InferSent
# Binary=False
x_columns = ["{}_ngram_ratio", "{}_word2vec_cosine", "{}_doc2vec_cosine", "{}_glove_cosine", "{}"]
x_labels = ["word counts", "word2vec", "doc2vec", 'GloVe', "InferSent"]

# One list of per-perspective results for each method, then regrouped into
# one tuple per perspective for draw_heatmaps.
sociology_entries = [cell_entries(sociology_df, pattern, binary=False) for pattern in x_columns]
sociology_data_by_perspective = list(zip(*sociology_entries))
management_entries = [cell_entries(management_df, pattern, binary=False) for pattern in x_columns]
management_data_by_perspective = list(zip(*management_entries))

draw_heatmaps(sociology_data_by_perspective, management_data_by_perspective, x_labels)

In [None]:
# TODO: article journal-article-10.2307_41555234 appears to be missing — confirm.

In [None]:
# ngrams, word2vec, GloVe

x_columns = ["{}_ngram_ratio", "{}_word2vec_cosine", "{}_glove_cosine"]
x_labels = ["word counts", "word2vec", 'GloVe']

# Pearson results (binary=False) per method, regrouped into one tuple per
# perspective for draw_heatmaps.
sociology_entries = [cell_entries(sociology_df, pattern, binary=False) for pattern in x_columns]
sociology_data_by_perspective = list(zip(*sociology_entries))
management_entries = [cell_entries(management_df, pattern, binary=False) for pattern in x_columns]
management_data_by_perspective = list(zip(*management_entries))

draw_heatmaps(sociology_data_by_perspective, management_data_by_perspective, x_labels)

In [None]:
from pprint import pprint

# Hard-coded p-value tables, one per perspective, each with two rows of five
# values.
# NOTE(review): presumably pasted from the lists printed by draw_heatmaps
# (rows Sociology/Management, columns in x_labels order) — confirm the source.
culture = [
    [0.0, 0.0, 0.36913518824567426, 5.313799646534656e-92, 1.9287584364686678e-11],
    [3.116014230190429e-257, 5.4075376502e-313, 0.7003093478555227, 4.109894087852923e-184, 1.1270350075594382e-05],
]
relational = [
    [1.9232441544035583e-204, 5.532905628892022e-175, 0.3054166444469383, 2.1323565357620456e-59, 2.786248557262824e-12],
    [2.884954470804473e-125, 1.1495671830072461e-83, 0.8703868792197232, 2.3077755088312828e-58, 3.3454889981238377e-16],
]
demographic = [
    [0.0, 0.0, 0.9885402206621577, 3.2096252926170165e-69, 1.4718465272770002e-05],
    [0.0, 0.0, 0.2395178195976957, 2.470031463969842e-124, 0.0006245780224592454],
]

for heading, table in (
    ("Culture", culture),
    ("Relational", relational),
    ("Demographic", demographic),
):
    print(heading)
    pprint(table)

In [None]:
# Second set of hard-coded p-value tables; these overwrite the names assigned
# by the earlier hard-coded cell.
# NOTE(review): `relational` is identical to the table in that earlier cell,
# while `culture` and `demographic` differ — verify it was not pasted by mistake.
culture = [
    [0.0, 1.745576531942312e-213, 0.36478768364038494, 2.430114143490269e-62, 4.96915476041219e-09],
    [7.54701063500108e-296, 2.1571749627490213e-198, 0.9615391422999973, 3.7518603436551587e-112, 2.9727910648863322e-06],
]
relational = [
    [1.9232441544035583e-204, 5.532905628892022e-175, 0.3054166444469383, 2.1323565357620456e-59, 2.786248557262824e-12],
    [2.884954470804473e-125, 1.1495671830072461e-83, 0.8703868792197232, 2.3077755088312828e-58, 3.3454889981238377e-16],
]
demographic = [
    [0.0, 2.783691510701036e-172, 0.5826043675265578, 1.134132601269464e-28, 0.0066636006395310995],
    [0.0, 1.2092581148611741e-206, 0.5961832285070008, 7.720266642198692e-66, 0.003216882882131651],
]

In [None]:
# Pretty-print each perspective's p-value table under its heading.
for heading, table in (
    ("Culture", culture),
    ("Relational", relational),
    ("Demographic", demographic),
):
    print(heading)
    pprint(table)