In [1]:
import pandas as pd
import numpy as np
import os

In [31]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import bigrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import nltk
import collections
import itertools
import re 
from tqdm import tqdm

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))

# text_col
def get_words(text_col):
    return [text.strip().split(' ') for text in text_col]

def remove_stop_words(text_words):
    return [list(filter(lambda x: x not in stop_words, words)) for words in text_words]

def remove_extra_spaces(text):
    return re.sub("\s\s+", " ", text)

def remove_punctuation(text):
    return re.sub("[^\w\s]", "", text)

# look into part of speech tagging with nltk,
# would make lemmatization more powerful
def lemmatize(text_words):
    lemmatizer = WordNetLemmatizer()
    return [[lemmatizer.lemmatize(word) for word in words] for words in text_words]

def stem(df, word_col):
    stemmer = PorterStemmer()
    df[word_col] = df[word_col].apply(lambda x: stemmer.stem(x))
    return df

def stem2(text_words):
    stemmer = PorterStemmer()
    return [[stemmer.stem(word) for word in words] for words in text_words]

def preprocess(text_col, clean = "stem"):
    text_col = text_col.apply(remove_punctuation)
    text_col = text_col.apply(remove_extra_spaces)
    text_words = remove_stop_words(get_words(text_col))

    if clean == "lemmatize":
        text_words = lemmatize(text_words)
    else:
        text_words = stem2(text_words)

    return text_words

def get_word_counts(text_col, clean = "stem"):
    text_words = preprocess(text_col, clean = clean)

    # flatten 
    words = [word for words in text_words for word in words]
    word_counts = collections.Counter(words)

    # convert to DataFrame and return
    return pd.DataFrame(word_counts.most_common(), columns = ["word", "count"])

# return Pandas DataFrame containing bigrams
def get_bigrams(text_col, top_n = 500, clean = "stem"):
    # extract words from text_col, remove stop words, and lemmatize
    text_words = remove_stop_words(get_words(text_col))

    if clean == "lemmatize":
        text_words = lemmatize(text_words)
    else:
        text_words = stem(text_words)

    terms_bigram = [list(bigrams(words)) for words in text_words]
    bigrams = list(itertools.chain(*terms_bigram))
    bigram_counts = collections.Counter(bigrams)
    bigram_df = pd.DataFrame(bigram_counts.most_common(top_n), columns = ['bigram', 'count'])
    bigram_df['item1'] = bigram_df.bigram.apply(lambda x: x[0])
    bigram_df['item2'] = bigram_df.bigram.apply(lambda x: x[1])
    bigram_df = bigram_df.drop(columns=['bigram'])
    bigram_df = bigram_df[['item1', 'item2', 'count']]
    return bigram_df


def get_cooc_counts(text_col, clean = "stem"):
    text_words = preprocess(text_col, clean = clean)
    terms_cooc = [list(itertools.permutations(words, 2)) for words in text_words]
    cooc = list(itertools.chain(*terms_cooc))
    cooc_counts = collections.Counter(cooc)
    return cooc_counts

# get co-occurrences
def get_cooc(text_col, clean = "stem", min_word_count = 10):
    cooc_counts = get_cooc_counts(text_col, clean = clean)
    cooc_df = make_dataframe(cooc_counts.most_common(), columns = ['cooc', 'count'])
    word_counts = get_word_counts(text_col, clean = clean)

    # only look at words that occur >= min_word_count
    word_counts = word_counts[word_counts['count'] >= min_word_count]

    # filter cooc such that each cooc is comprised of words that occur at least a certain number of times
    # specified in the min_word_count variable
    cooc_df = cooc_df[(cooc_df.item1.isin(word_counts.word)) & (cooc_df.item2.isin(word_counts.word))]
    
    # return only odd columns since cooc are doubly represented as item1,item2 then item2,item1 on following row
    cooc_df = cooc_df.iloc[:-2:2]
    
    # remove rows where item1 == item2
    cooc_df = cooc_df[cooc_df.item1 != cooc_df.item2]
    
    return cooc_df

# columns should be a list of two elements: the first should contain the name of the column containing the word pair 
# (either a bigram or co-occurrence) and the second should contain the count. 
# columns[0] will be split into two columns, one for each word in the pair, and then the original column will be dropped.
def make_dataframe(data, columns):
    df = pd.DataFrame(data, columns = columns)
    df['item1'] = df.cooc.apply(lambda x: x[0])
    df['item2'] = df.cooc.apply(lambda x: x[1])
    df = df.drop(columns=[columns[0]])
    df = df[['item1', 'item2', columns[1]]]
    return df

def unnest_tokens(df, output_col, input_col, token = "words", to_lower = True, drop = True):
    text_col = df[input_col]
    tokens = {'id': [], output_col: []}
    for i, text in text_col.iteritems():
        for word in text.strip().split(' '):
            tokens['id'].append(i)
            tokens[output_col].append(word)
    tokens_df = pd.DataFrame(tokens)
    
    if to_lower:
        tokens_df[output_col] = tokens_df[output_col].apply(lambda x: x.lower())
        
    # remove entries where token is an empty string
    tokens_df = tokens_df[tokens_df[output_col] != ""]
    return tokens_df

def get_token2doc_map(df, token_col, doc_id_col):
    return { k[0]: { v: v for v in k[1].values } for k in df.groupby(token_col)[doc_id_col] }

def get_word_doc_count(x, y, n_docs, word2doc_map):
    # initialize counts
    has_only_x_count = 0
    has_only_y_count = 0
    has_both_count = 0
    has_neither_count = 0
    
    x_docs = word2doc_map[x]
    y_docs = word2doc_map[y]
    
    if len(x_docs) <= len(y_docs):
        for key in x_docs:
            if key in y_docs:
                has_both_count += 1
    else:
        for key in y_docs:
            if key in x_docs:
                has_both_count += 1
                
    has_only_x_count = len(x_docs) - has_both_count
    has_only_y_count = len(y_docs) - has_both_count
    has_neither_count = n_docs - has_both_count - has_only_x_count - has_only_y_count
    
    return {'only_x': has_only_x_count,
            'only_y': has_only_y_count,
            'both': has_both_count,
            'neither': has_neither_count}
    

# count the number of documents in which each word occurs, retuning a dict where the word, count is the key, value
def add_word_doc_counts(df, n_docs, word2doc_map):
    print("add_word_doc_counts called")
    
    has_item1_only = []
    has_item2_only = []
    has_both = []
    has_neither = []
    
    for i, row in tqdm(df.iterrows()):
        item1 = row['item1']
        item2 = row['item2']
        result = get_word_doc_count(item1, item2, n_docs, word2doc_map)
        
        # append counts to lists
        has_item1_only.append(result['only_x'])
        has_item2_only.append(result['only_y'])
        has_both.append(result['both'])
        has_neither.append(result['neither'])
        
    # add new columns to dataframe
    df['has_item1_only'] = has_item1_only
    df['has_item2_only'] = has_item2_only
    df['has_both'] = has_both
    df['has_neither'] = has_neither

    return df

def get_phi_coeff(only_x, only_y, both, neither):
    n11 = both
    n10 = only_x
    n01 = only_y
    n00 = neither
    n1_ = n11 + n10
    n_1 = n11 + n01
    n0_ = n00 + n01
    n_0 = n00 + n10
    
    phi = ((n11 * n00) - (n10 * n01)) / (n1_ * n0_ * n_0 * n_1) ** 0.5
    return phi

    
# get pairwise cor
def get_pairwise_cor(df, text_col, clean = "stem", min_word_count = 10):
    cooc_df = get_cooc(df[text_col], clean = clean, min_word_count = min_word_count)

    # get total number of documents
    n_docs = len(df[text_col])
    
    words = unnest_tokens(df, "word", text_col)
    words_stem = stem(words, "word")
    word2doc_map = get_token2doc_map(words_stem, "word", "id")

    # compute phi coefficient
    cor_df = add_word_doc_counts(cooc_df, n_docs, word2doc_map)
    
    cor_df['phi'] = get_phi_coeff(cor_df.has_item1_only, cor_df.has_item2_only, cor_df.has_both, cor_df.has_neither)

    # return dataframe
    return cor_df

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/chasedawson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chasedawson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/chasedawson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [32]:
s32_sentences = pd.read_csv('../data/s32_sentences_sent_df.csv', index_col=0)
s32_sentences.sentence = s32_sentences.sentence.apply(remove_punctuation)
s32_sentences.sentence = s32_sentences.sentence.apply(remove_extra_spaces)
s32_sentences.sentence = s32_sentences.sentence.apply(lambda x: x.lower())

In [33]:
s32_sentences.head()

Unnamed: 0,response_id,sentence,sentiment,score
0,3,negro have been told many times they are fight...,NEGATIVE,0.968257
1,3,but a colored soldier is more discriminated ag...,NEGATIVE,0.953218
2,3,its evident to those who care to see it that n...,POSITIVE,0.973801
3,3,since the emancipation proclamation negro have...,NEGATIVE,0.823565
4,3,each right each privilege was fought for,POSITIVE,0.987902


In [34]:
words = unnest_tokens(s32_sentences, "word", "sentence")
words_stem = stem(words, "word")
word2doc_map = get_token2doc_map(words_stem, "word", "id")

In [26]:
len(s32_sentences.sentence)

19834

In [27]:
res = get_word_doc_count("white", "negro", len(s32_sentences.sentence), word2doc_map)
res

{'only_x': 1099, 'only_y': 1723, 'both': 1003, 'neither': 16009}

In [28]:
res['only_x'] + res['only_y'] + res['both'] + res['neither']

19834

In [30]:
get_phi_coeff(res['only_x'], res['only_y'], res['both'], res['neither'])

0.33971231434285887

In [35]:
cor_df = get_pairwise_cor(s32_sentences, "sentence")

732it [00:00, 7318.84it/s]

add_word_doc_counts called


192675it [00:13, 14589.13it/s]


In [36]:
cor_df.head()

Unnamed: 0,item1,item2,count,has_item1_only,has_item2_only,has_both,has_neither,phi
2,white,negro,1635,1099,1723,1003,16009,0.339712
6,soldier,white,1136,2074,1478,624,15658,0.161526
8,soldier,negro,1099,2015,2043,683,15093,0.133347
10,like,would,961,1085,1437,662,16650,0.275921
14,color,white,807,613,1613,489,17119,0.266147


In [17]:
cor_df.phi[2]

16057026.954582293

In [14]:
item1_check = "white"
item2_check = "negro"

both = 0
item1_count = 0
item2_count = 0
for doc in s32_sentences.sentence:
    found1 = doc.find(item1_check) != -1
    found2 = doc.find(item2_check) != -1
    
    if found1:
        if found2:
            both += 1
        else:
            item1_count += 1
    elif found2:
        item2_count += 1
        
print(both, item1_count, item2_count)

1006 1102 1721


In [100]:
words = unnest_tokens(s32_sentences, "word", "sentence")
words.head()

Unnamed: 0,id,word
0,0,negro
1,0,have
2,0,been
3,0,told
4,0,many


In [101]:
words_stem = stem(words, "word")
words_stem.head()

Unnamed: 0,id,word
0,0,negro
1,0,have
2,0,been
3,0,told
4,0,mani


In [102]:
word2doc_map = get_token2doc_map(words_stem, "word", "id")

In [117]:
np.array(list(word2doc_map['negro'].values()) + list(word2doc_map['have'].values())).unique()

AttributeError: 'numpy.ndarray' object has no attribute 'unique'

In [11]:
os.getcwd()

'/Users/chasedawson/dev/sdad/amsoldier/src'

In [45]:
remove_punctuation("this!! has asdfasdf ()()&&*&^")

'this has asdfasdf '

In [4]:
s32 = pd.read_csv('../data/s32_sent_df.csv', index_col=0)
s32_sentences = pd.read_csv('../data/s32_sentences_sent_df.csv', index_col=0)
s32.head()

Unnamed: 0,subject_ids,long,racial_group,outfits,outfits_comment,index,long_sentiment,long_score,outfits_sentiment,outfits_score
3,20738627,negro have been told many times they are fight...,black,,,3,NEGATIVE,0.74768,,0.0
4,20738629,i do not like the army. i had rather be on the...,black,,,4,NEGATIVE,0.996761,,0.0
5,20738631,"i think that if we are going to win this war, ...",black,,,5,NEGATIVE,0.996679,,0.0
6,20738633,why is it that the negro do not have the the ...,black,,,6,NEGATIVE,0.996114,,0.0
7,20738636,i highly approve of this questionnaire it give...,black,,,7,POSITIVE,0.99602,,0.0


In [5]:
s32_sentences.head()

Unnamed: 0,response_id,sentence,sentiment,score
0,3,negro have been told many times they are fight...,NEGATIVE,0.968257
1,3,but a colored soldier is more discriminated ag...,NEGATIVE,0.953218
2,3,its evident to those who care to see it that n...,POSITIVE,0.973801
3,3,since the emancipation proclamation negro have...,NEGATIVE,0.823565
4,3,"each right, each privilege was fought for.",POSITIVE,0.987902


In [None]:
s32_sentences

In [27]:
s32_sentences.shape

(19834, 4)

In [34]:
cooc_df = get_cooc(s32_sentences.sentence, min_word_count = 20)

In [35]:
cooc_df.shape

(326855, 3)

Unnamed: 0,response_id,sentence,sentiment,score
