# Select random sample of articles (of manageable size) for hand-coding

- Author: Jaren Haber
- PI: Heather Haveman
- Date: December 12, 2019

In [1]:
# import packages, set directories
import pandas as pd
import os
# Folder whose files are listed later and matched against the sampled ".txt" names
ocr_wd = '/home/jovyan/work/jstor_data/ocr/'

# Folder handed to read_articles_sample() as the location of the per-stratum sample CSVs
sample_wd = '/home/jovyan/work/Computational-Analysis-For-Social-Science/WordEmbedding/sample_generation/sample_output2/'

# Sub-folder of sample_wd (presumably where the drawn sample should be written -- TODO confirm)
output_dir = "".join([sample_wd, "random_5_sociology_cultural_glove/"])

In [76]:
# Locations in the distribution, in the order they are sampled and returned
_STRATA = ("2above", "1above", "median", "1below", "2below")


def _sample_discipline(discipline_name, perspective, model, sizes, explicit, sample_dir):
    """Draw the stratified random sample for one discipline and return its file names as a list."""
    file_names = []
    for stratum, size in zip(_STRATA, sizes):
        csv_name = "_".join([discipline_name, perspective, model, stratum]) + ".csv"
        candidates = pd.read_csv(os.path.join(sample_dir, csv_name))["filename"]

        # Never ask pandas for more rows than the file holds (sample() would raise ValueError)
        available = len(candidates)
        if size > available:
            print("Error: Only " + str(available) + " random samples available in " + csv_name
                  + ", changing n to " + str(available))
            size = available

        # Each stratum is sampled from its own file, with its own sample size
        prefix = stratum + "_" if explicit else ""
        file_names.extend(prefix + item + ".txt" for item in candidates.sample(n=size))
    return file_names


def read_articles_sample(n2above = 1, n1above = 1, nmedian = 1, n1below = 1, n2below = 1, 
                         perspective = "all", discipline = "all", model = "all", explicit = False, 
                         sample_dir = '/home/jovyan/work/Computational-Analysis-For-Social-Science/WordEmbedding/sample_generation/'):
    
    """Produces a stratified random sample of article file names for one perspective and model, 
    for one or both disciplines (sociology and/or management & OB).
    
    For each selected discipline, one CSV per stratum is read from sample_dir; its name is 
    <discipline>_<perspective>_<model>_<stratum>.csv, where <discipline> is "sociology" or 
    "management" and <stratum> is one of 2above, 1above, median, 1below, 2below. 
    Each CSV must have a "filename" column. 
    
    Size of (stratified) random sample is defined by:
    n2above: number of articles two standard deviations or so above the mean
    n1above: number of articles one standard deviation or so above the mean
    nmedian: number of articles at/around the median
    n1below: number of articles one standard deviation below the mean
    n2below: number of articles two standard deviations below the mean
    Any of these that exceeds the number of rows in its CSV is reduced to that number 
    (a message is printed).
    
    Inputs:
    perspective: cultural, relational, or demographic (inserted verbatim into the CSV file name)
    discipline: sociology, management (also: mgt, management/ob), or all for both; case-insensitive
    model: glove (not 'GloVe'), w2v (for Word2Vec), d2v (for Doc2Vec), infersent (not 'InferSent'); 
           inserted verbatim into the CSV file name
    explicit: Whether to prefix each file name with the SD specification of its stratum, 
              e.g. "2above_<filename>.txt" (binary)
    sample_dir: directory holding the per-stratum CSVs
    
    Returns:
    One-column DataFrame (column label 0) with one "<filename>.txt" entry per sampled article, 
    ordered by discipline (sociology first) and then by stratum (2above ... 2below). 
    The DataFrame is empty if discipline matches neither sociology nor management."""
    
    sizes = (n2above, n1above, nmedian, n1below, n2below)
    
    # Work out which disciplines were requested
    requested = discipline.strip().lower()
    selected = []
    if requested in ("sociology", "all"):
        selected.append("sociology")
    if requested in ("management", "mgt", "management/ob", "all"):
        selected.append("management")
    
    # Sample each discipline with the same logic and stack the results
    file_names = []
    for discipline_name in selected:
        file_names.extend(_sample_discipline(discipline_name, perspective, model, sizes, explicit, sample_dir))
    
    return pd.DataFrame(file_names)

In [77]:
# Read data: one CSV per location in the sociology / cultural / GloVe distribution.
# The cells below use these DataFrames by name, so they have to be loaded here
# for the notebook to run from top to bottom.
soc_cult_glove_1above = pd.read_csv(sample_wd + "sociology_cultural_glove_1above.csv")
soc_cult_glove_1below = pd.read_csv(sample_wd + "sociology_cultural_glove_1below.csv")
soc_cult_glove_2above = pd.read_csv(sample_wd + "sociology_cultural_glove_2above.csv")
soc_cult_glove_2below = pd.read_csv(sample_wd + "sociology_cultural_glove_2below.csv")
soc_cult_glove_median = pd.read_csv(sample_wd + "sociology_cultural_glove_median.csv")

read_articles_sample(discipline = "sociology", perspective = "cultural", model = "w2v", sample_dir = sample_wd, explicit = True)



In [23]:
# Check out data: display the DataFrame named after sociology_cultural_glove_2above.csv
# (the "Read data" cell is where that CSV is read)
soc_cult_glove_2above

Unnamed: 0.1,Unnamed: 0,filename,subject,avg_w2v,avg_d2v,avg_glove,edited_filename,culture,demographic,relational
0,10568,journal-article-10.2307_27522646,Sociology,[ 0.0035587 0.0232043 -0.13497226 0.066047...,[ 0.0035587 0.0232043 -0.13497226 0.066047...,[-9.86717384e-02 8.44770422e-02 4.40761072e-...,10.2307_27522646,0.624045,0.627629,0.463476
1,5296,journal-article-10.2307_2112729,Sociology,[-0.00946396 0.04525458 -0.04747958 0.067015...,[-0.00946396 0.04525458 -0.04747958 0.067015...,[ 6.14944535e-02 -2.24818541e-02 -2.56911788e-...,10.2307_2112729,0.598908,0.606213,0.429015
2,25256,journal-article-10.2307_20831289,Sociology,[-0.0101557 0.05878482 -0.17050494 0.014481...,[-0.0101557 0.05878482 -0.17050494 0.014481...,[-5.59949628e-02 9.68039244e-02 -4.35385517e-...,10.2307_20831289,0.615992,0.622204,0.452738
3,22103,journal-article-10.2307_40005270,Sociology,[-0.02619507 0.03163803 -0.14005832 0.048058...,[-0.02619507 0.03163803 -0.14005832 0.048058...,[-4.60622467e-02 4.93645439e-02 -5.48975875e-...,10.2307_40005270,0.611184,0.616316,0.445329
4,5745,journal-article-10.2307_201892,Sociology,[-0.02584431 0.02999491 -0.1700929 0.017322...,[-0.02584431 0.02999491 -0.1700929 0.017322...,[-8.63692247e-02 1.46993179e-01 -9.88325984e-...,10.2307_201892,0.609624,0.617633,0.445758
5,6594,journal-article-10.2307_2094889,Sociology,[ 0.01679956 0.0821901 -0.1529196 0.067782...,[ 0.01679956 0.0821901 -0.1529196 0.067782...,[-5.68744679e-02 3.33312072e-02 -1.04347850e-...,10.2307_2094889,0.6006,0.606336,0.431286
6,6744,journal-article-10.2307_44816279,Sociology,[-0.01953658 -0.04425031 -0.10642783 0.000843...,[-0.01953658 -0.04425031 -0.10642783 0.000843...,[-1.61496104e-01 1.56701283e-01 -1.39808001e-...,10.2307_44816279,0.613195,0.619872,0.449647
7,13585,journal-article-10.2307_41475254,Sociology,[-0.09194329 -0.00215025 -0.13291511 0.075185...,[-0.09194329 -0.00215025 -0.13291511 0.075185...,[-1.98269881e-01 1.78340608e-01 2.20268047e-...,10.2307_41475254,0.617424,0.623119,0.453552
8,21868,journal-article-10.1525_sp.2009.56.4.cover,Sociology,[-0.01885741 0.04020007 -0.15415756 0.069681...,[-0.01885741 0.04020007 -0.15415756 0.069681...,[-2.40935190e-02 6.81079072e-02 -4.62666414e-...,10.1525_sp.2009.56.4.cover,0.58377,0.590098,0.417981
9,4912,journal-article-10.2307_1389226,Sociology,[-3.88675109e-02 1.51553219e-02 -1.53314605e-...,[-3.88675109e-02 1.51553219e-02 -1.53314605e-...,[-1.20187214e-01 1.34400131e-01 -4.42794321e-...,10.2307_1389226,0.616013,0.622186,0.454763


In [4]:
# One DataFrame per location in the sociology cultural distribution
samples_list = [
    soc_cult_glove_1above,
    soc_cult_glove_2above,
    soc_cult_glove_1below,
    soc_cult_glove_2below,
    soc_cult_glove_median,
]

# Pick a single random file name out of every DataFrame, then stack the picks into one Series
sample = pd.concat(objs=[frame["filename"].sample(n=1) for frame in samples_list])

sample

6    journal-article-10.2307_42863074
3    journal-article-10.2307_40005270
5    journal-article-10.2307_42861052
0      journal-article-10.2307_353815
9     journal-article-10.2307_3033921
Name: filename, dtype: object

In [11]:
# Append ".txt" to every sampled name; the result replaces `sample` as a one-column DataFrame
sample = pd.DataFrame(list(map(lambda name: name + ".txt", sample.to_list())))
sample

Unnamed: 0,0
0,journal-article-10.2307_42863074.txt
1,journal-article-10.2307_40005270.txt
2,journal-article-10.2307_42861052.txt
3,journal-article-10.2307_353815.txt
4,journal-article-10.2307_3033921.txt


In [13]:
# Write the sampled file names to disk, one per line (no index column, no header row)
sample.to_csv(
    "random_5_sociology_cultural_glove.csv",
    header=False,
    index=False,
)

In [29]:
# Load the full article index; it is read without a header row and labelled via colnames
colnames = ['file_name']
articles = pd.read_csv(
    '/home/jovyan/work/Computational-Analysis-For-Social-Science/Dictionary Mapping/Pipe/filtered_index.csv',
    header=None,
    names=colnames,
)
articles

Unnamed: 0,file_name
0,journal-article-10.2307_2065002
1,journal-article-10.2307_3380821
2,journal-article-10.2307_2095822
3,journal-article-10.2307_2631839
4,journal-article-10.2307_40836133
...,...
69654,journal-article-10.2307_jcorpciti.19.5
69655,journal-article-10.2307_24720853
69656,journal-article-10.2307_4494934
69657,journal-article-10.2307_40542267


In [None]:
# `sample` is a one-column DataFrame at this point and its entries already end in ".txt",
# so take the first column as-is instead of calling .to_list() on the frame and re-adding the suffix
files_to_be_opened = [ocr_wd + file for file in sample.iloc[:, 0].to_list()]

# Every regular file in the OCR folder (os.path is used because isfile/join are not imported by name)
all_files = [ocr_wd + f for f in os.listdir(ocr_wd) if os.path.isfile(os.path.join(ocr_wd, f))]

# Keep only the sampled files that actually exist; a set makes each membership test O(1)
wanted = set(files_to_be_opened)
files = [file for file in all_files if file in wanted]

In [None]:
# Show at most the first 50 matched file paths
files[:50]