In [4]:
from transformers import pipeline
import pandas as pd 
import numpy as np
from nltk.tokenize import sent_tokenize
import re
from tqdm import tqdm

In [2]:
s32 = pd.read_csv("../data/s32_clean.csv", index_col=0)
s32.head()

Unnamed: 0,subject_ids,long,racial_group,outfits,outfits_comment,index
1,20738624,_,black,,,1
2,20738626,_,black,,,2
3,20738627,negro have been told many times they are fight...,black,,,3
4,20738629,i do not like the army. i had rather be on the...,black,,,4
5,20738631,"i think that if we are going to win this war, ...",black,,,5


In [8]:
s144 = pd.read_csv("../data/s144_clean_v2.csv", index_col=0)
s144.head()

Unnamed: 0,id,subject_id,image_name,document_type,q85
1,0,15901748,2521127-37-0001.jpg,Other,
2,1,15901753,2521127-37-0002.jpg,Other,
3,2,15901759,2521127-37-0003.jpg,Other,
4,3,15901768,2521127-37-0004.jpg,A document that contains Q.85,concerning question i havent the slightest ide...
5,4,15901775,2521127-37-0005.jpg,A document that contains Q.85,after the war i hope we americans will have a ...


In [9]:
s144.q85[6]

'the army would be swell for a certain amount of training for young men or anyone who would be interested. thats if they were citizens of the u.s. and if they could take up a trade which post war plans have without being separated from their families too long or too far away. the army really builds a person up would be alright for a lot of people who just like to travel. i think anyone who wanted to fight had sufficient training whether he was in a combat unit or working battalion should be able to volunteer with the outfit he preferred without so much red tape. even though i guess it is necessary. '

In [17]:
def remove_underscores(text):
    return re.sub("_", "", text)

def remove_extra_spaces(text):
    return re.sub("\s\s+", " ", text)

In [22]:
def preprocess(df, text):
    df[text] = df[text].fillna(value="")
    df[text] = df[text].apply(remove_underscores)
    df[text] = df[text].apply(remove_extra_spaces)
    df = df[df[text] != ""]
    df = df[df[text] != " "]
    return df

In [43]:
def sentence_level_sentiment(df, text, index):
    # break df into sentences
    sentences = []
    for i in tqdm(range(len(df))):
        text_ = df[text].iloc[i]
        index_ = df[index].iloc[i]
        for token in sent_tokenize(text_):
            sentences.append((index_, token))
    sentence_df = pd.DataFrame(sentences, columns=[index, "sentence"])
    
    # compute sentiment analysis on sentences and add to sentence df
    nlp = pipeline('sentiment-analysis')
    sentiment = []
    score = []
    for i in tqdm(range(len(sentence_df))):
        text_ = sentence_df.sentence.iloc[i]
        text_ = text_[:512]
        result = nlp(text_)[0]
        sentiment.append(result['label'])
        score.append(result['score'])    
    sentence_df['sentiment'] = sentiment
    sentence_df['score'] = score
    
    return sentence_df

def response_level_sentiment(df, text):
    sentiment = []
    score = []

    nlp = pipeline('sentiment-analysis')

    for i in tqdm(range(len(df))):
        text_ = df[text].iloc[i]
        text_ = text_[:512] # model can only handle sequences of max length 512 so truncate
        result = nlp(text_)[0]
        sentiment.append(result['label'])
        score.append(result['score'])

    df['sentiment'] = sentiment
    df['score'] = score
    
    return df

In [23]:
s32_p = preprocess(s32, "long")
s32_p.head()

Unnamed: 0,subject_ids,long,racial_group,outfits,outfits_comment,index
3,20738627,negro have been told many times they are fight...,black,,,3
4,20738629,i do not like the army. i had rather be on the...,black,,,4
5,20738631,"i think that if we are going to win this war, ...",black,,,5
6,20738633,why is it that the negro do not have the the a...,black,,,6
7,20738636,i highly approve of this questionnaire it give...,black,,,7


In [34]:
s32_sentence_sentiment = sentence_level_sentiment(s32_p, "long", "index")
s32_sentence_sentiment.head()

100%|██████████| 5460/5460 [00:00<00:00, 7851.36it/s]
100%|██████████| 19834/19834 [24:05<00:00, 13.73it/s]    


Unnamed: 0,index,sentence,sentiment,score
0,3,negro have been told many times they are fight...,NEGATIVE,0.968257
1,3,but a colored soldier is more discriminated ag...,NEGATIVE,0.953218
2,3,its evident to those who care to see it that n...,POSITIVE,0.973801
3,3,since the emancipation proclamation negro have...,NEGATIVE,0.823565
4,3,"each right, each privilege was fought for.",POSITIVE,0.987902


In [45]:
s32_response_sentiment = response_level_sentiment(s32_p, "long")
s32_response_sentiment.head()

100%|██████████| 5460/5460 [07:41<00:00, 11.83it/s]


Unnamed: 0,subject_ids,long,racial_group,outfits,outfits_comment,index,sentiment,score
3,20738627,negro have been told many times they are fight...,black,,,3,NEGATIVE,0.74768
4,20738629,i do not like the army. i had rather be on the...,black,,,4,NEGATIVE,0.996761
5,20738631,"i think that if we are going to win this war, ...",black,,,5,NEGATIVE,0.996679
6,20738633,why is it that the negro do not have the the a...,black,,,6,NEGATIVE,0.996114
7,20738636,i highly approve of this questionnaire it give...,black,,,7,POSITIVE,0.99602


In [46]:
s144.head()

Unnamed: 0,id,subject_id,image_name,document_type,q85
1,0,15901748,2521127-37-0001.jpg,Other,
2,1,15901753,2521127-37-0002.jpg,Other,
3,2,15901759,2521127-37-0003.jpg,Other,
4,3,15901768,2521127-37-0004.jpg,A document that contains Q.85,concerning question i havent the slightest ide...
5,4,15901775,2521127-37-0005.jpg,A document that contains Q.85,after the war i hope we americans will have a ...


In [47]:
s144_p = preprocess(s144, "q85")

In [48]:
s144_sentence_sentiment = sentence_level_sentiment(s144_p, "q85", "id")
s144_sentence_sentiment.head()

100%|██████████| 2348/2348 [00:00<00:00, 7221.39it/s]
100%|██████████| 8729/8729 [05:55<00:00, 24.58it/s]


Unnamed: 0,id,sentence,sentiment,score
0,3,concerning question i havent the slightest ide...,POSITIVE,0.997207
1,4,after the war i hope we americans will have a ...,NEGATIVE,0.99742
2,5,the army would be swell for a certain amount o...,NEGATIVE,0.991619
3,5,thats if they were citizens of the u.s. and if...,NEGATIVE,0.703719
4,5,the army really builds a person up would be al...,POSITIVE,0.998122


In [49]:
s144_response_sentiment = response_level_sentiment(s144_p, "q85")
s144_response_sentiment.head()

100%|██████████| 2348/2348 [03:11<00:00, 12.26it/s]


Unnamed: 0,id,subject_id,image_name,document_type,q85,sentiment,score
4,3,15901768,2521127-37-0004.jpg,A document that contains Q.85,concerning question i havent the slightest ide...,POSITIVE,0.997207
5,4,15901775,2521127-37-0005.jpg,A document that contains Q.85,after the war i hope we americans will have a ...,NEGATIVE,0.99742
6,5,15901787,2521127-37-0006.jpg,A document that contains Q.85,the army would be swell for a certain amount o...,POSITIVE,0.89035
7,6,15901795,2521127-37-0007.jpg,A document that contains Q.85,well i think uncle sam will give the veterans ...,NEGATIVE,0.992004
8,7,15901805,2521127-37-0008.jpg,A document that contains Q.85,i think the army is a great place to be at but...,NEGATIVE,0.999651


In [98]:
s32_sentence_sentiment.to_csv("../data/s32_sentence_sentiment.csv")
s32_response_sentiment.to_csv("../data/s32_response_sentiment.csv")
s144_sentence_sentiment.to_csv("../data/s144_sentence_sentiment.csv")
s144_response_sentiment.to_csv("../data/s144_response_sentiment.csv")

In [5]:
s32_sentence_sentiment = pd.read_csv("../data/s32_sentence_sentiment.csv")
s32_response_sentiment = pd.read_csv("../data/s32_response_sentiment.csv")
s144_sentence_sentiment = pd.read_csv("../data/s144_sentence_sentiment.csv")
s144_response_sentiment = pd.read_csv("../data/s144_response_sentiment.csv")

In [7]:
from statsmodels.stats.proportion import proportions_ztest

# perform one proportion z-test
zstat, pval = proportions_ztest(count = s32_response_sentiment.sentiment.value_counts()['NEGATIVE'],
                                nobs = len(s32_response_sentiment.sentiment),
                                value = 0.5)

print("s32 negative", pval)

zstat, pval = proportions_ztest(count = s32_response_sentiment.sentiment.value_counts()['POSITIVE'],
                                nobs = len(s32_response_sentiment.sentiment),
                                value = 0.5)

print("s32 positive", pval)

zstat, pval = proportions_ztest(count = s144_response_sentiment.sentiment.value_counts()['NEGATIVE'],
                                nobs = len(s144_response_sentiment.sentiment),
                                value = 0.5)

print("s144 negative", pval)

zstat, pval = proportions_ztest(count = s144_response_sentiment.sentiment.value_counts()['POSITIVE'],
                                nobs = len(s144_response_sentiment.sentiment),
                                value = 0.5)
print("s144 positive", pval)


s32 negative 0.0
s32 positive 0.0
s144 negative 1.324980429900372e-145
s144 positive 1.324980429900372e-145


In [8]:
# !rm text_analysis.py

In [1]:
!ln -s ../../../text_analysis/text_analysis.py text_analysis.py
import text_analysis

ln: text_analysis.py: File exists


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/chasedawson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chasedawson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chasedawson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
from text_analysis import unnest_tokens, remove_stop_words, stem, filter_out, pairwise_count, pairwise_cor

def sentiment_word_analysis_pipeline(df, text):
    words = unnest_tokens(df, 'word', text)
    words_nsw = remove_stop_words(words, 'word')
    words_stemmed = stem(words_nsw, 'word')
    words_f = filter_out(words_stemmed, 'word', 10)
    cooc = pairwise_count(words_f, 'word', 'id')
    cor = pairwise_cor(words_f, 'word', 'id', cooc_df = cooc.copy())
    return cooc, cor

In [12]:
s144_pos_cooc, s144_pos_cor = sentiment_word_analysis_pipeline(s144_sentence_sentiment[s144_sentence_sentiment.sentiment == "POSITIVE"], "sentence")
s144_pos_cooc.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[word_col] = df[word_col].apply(lambda x: stemmer.stem(x))
 24%|██▍       | 6514/26879 [00:00<00:00, 65133.54it/s]

add_word_doc_counts called


100%|██████████| 26879/26879 [00:00<00:00, 69933.47it/s]


Unnamed: 0,item1,item2,count
2,like,would,169
4,state,unit,101
8,war,think,77
10,hope,war,76
12,negro,war,75


In [21]:
s144_pos_cor.sort_values("phi", ascending = False).head(10)

Unnamed: 0,item1,item2,count,has_item1_only,has_item2_only,has_both,has_neither,phi
2128,jim,crow,10,2,0,10,2627,0.912524
4,state,unit,101,36,32,91,2480,0.71458
550,arm,forc,21,8,17,21,2593,0.627993
70,regardless,creed,47,41,16,47,2535,0.620781
2,like,would,169,106,90,141,2302,0.549547
1088,forward,look,15,2,35,15,2587,0.509931
42,creed,race,56,11,122,52,2454,0.478586
332,color.,regardless,27,8,61,27,2543,0.476608
1576,thank,you.,12,26,5,12,2596,0.467377
36,race,regardless,60,116,30,58,2435,0.443927


In [15]:
s144_neg_cooc, s144_neg_cor = sentiment_word_analysis_pipeline(s144_sentence_sentiment[s144_sentence_sentiment.sentiment == "NEGATIVE"], "sentence")
s144_neg_cooc.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[word_col] = df[word_col].apply(lambda x: stemmer.stem(x))
  5%|▌         | 4614/86105 [00:00<00:01, 46133.99it/s]

add_word_doc_counts called


100%|██████████| 86105/86105 [00:01<00:00, 63757.36it/s]


Unnamed: 0,item1,item2,count
2,war,think,284
4,think,armi,268
10,soldier,negro,237
12,better,war,228
14,war,negro,204


In [22]:
s144_neg_cor.sort_values('phi', ascending = False).head(10)

Unnamed: 0,item1,item2,count,has_item1_only,has_item2_only,has_both,has_neither,phi
186,jim,crow,82,25,3,69,5789,0.836545
8554,hall,mess,9,1,15,9,5861,0.579991
30,state,unit,176,148,87,160,5491,0.559623
4710,blood,shed,14,26,2,14,5844,0.551726
1518,regardless,creed,29,42,14,29,5801,0.520504
5932,new,orlean,12,28,1,11,5846,0.506998
13800,com,non,6,4,8,6,5868,0.506114
3814,corp,air,16,7,16,11,5852,0.497125
4092,forces.,arm,15,1,45,14,5826,0.468499
1024,forc,arm,36,55,27,32,5772,0.439932


In [32]:
def view_word(df, word, value, n = 10):
    return df[(df.item1 == word) | (df.item2 == word)].sort_values(value, ascending = False).head(n)

In [39]:
view_word(s144_pos_cor, "negro", "phi", 10)

Unnamed: 0,item1,item2,count,has_item1_only,has_item2_only,has_both,has_neither,phi
98,white,negro,41,77,199,37,2326,0.175083
378,south,negro,25,25,217,19,2378,0.156238
40,soldier,negro,57,134,198,38,2269,0.12168
3990,negro,today,7,230,5,6,2398,0.10339
182,negro,given,32,208,100,28,2303,0.10232
2942,privileg,negro,8,8,229,7,2395,0.099951
2642,treatment,negro,9,8,229,7,2395,0.099951
4934,deserv,negro,6,6,230,6,2397,0.097243
166,negro,condit,34,211,95,25,2308,0.090946
208,better,negro,31,96,211,25,2307,0.09002


In [34]:
view_word(s144_neg_cor, "negro", "phi", 10)

Unnamed: 0,item1,item2,count,has_item1_only,has_item2_only,has_both,has_neither,phi
10,soldier,negro,237,430,518,170,4768,0.174539
58,white,negro,141,243,576,112,4955,0.156607
732,negro,troop,43,656,21,32,5177,0.144453
526,negro,southern,52,646,95,42,5103,0.091138
230,given,negro,76,188,628,60,5010,0.08163
394,negro,equal,59,640,137,48,5061,0.079938
3880,soldiers.,negro,15,16,674,14,5182,0.077922
736,american,negro,43,75,656,32,5123,0.077156
1896,treatment,negro,25,40,666,22,5158,0.076415
104,negro,fight,114,608,303,80,4895,0.075535


In [35]:
view_word(s144_pos_cor, "white", "phi", 10)

Unnamed: 0,item1,item2,count,has_item1_only,has_item2_only,has_both,has_neither,phi
3146,white,black,8,106,6,8,2519,0.189749
98,white,negro,41,77,199,37,2326,0.175083
144,color,white,35,108,89,25,2417,0.16405
3916,white,man.,7,107,8,7,2517,0.157486
90,soldier,white,43,144,86,28,2381,0.155324
2368,south,white,9,35,105,9,2490,0.103342
438,peopl,white,24,135,96,18,2390,0.090847
3958,break,white,7,23,108,6,2502,0.084875
8454,white,whether,4,110,11,4,2514,0.083107
3940,still,white,7,25,108,6,2500,0.080628


In [36]:
view_word(s144_neg_cor, "white", "phi", 10)

Unnamed: 0,item1,item2,count,has_item1_only,has_item2_only,has_both,has_neither,phi
64,color,white,139,322,272,83,5209,0.165138
58,white,negro,141,243,576,112,4955,0.156607
2192,white,black,22,338,21,17,5510,0.131059
120,white,man,102,287,338,68,5193,0.122538
864,southern,white,40,104,322,33,5427,0.117084
8864,sign,white,9,8,346,9,5523,0.10605
1830,white,troop,25,338,36,17,5495,0.104282
292,white,offic,68,319,150,36,5381,0.101098
320,white,race,66,299,334,56,5197,0.093183
376,peopl,white,62,274,309,46,5257,0.084037
