In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
posts = "data//facebook_congress_posts.csv"
responses = "data//facebook_congress_responses.csv"

In [3]:
postsDf = pd.read_csv(posts)

In [4]:
responsesDf = pd.read_csv(responses)

## Basic value counts

In [5]:
postsDf.head()

Unnamed: 0,op_id,op_gender,post_id,post_text,post_type
0,57265377,M,0,"Yesterday, my colleagues and I voted to protec...",video
1,57265377,M,1,Roses are red...and so is Texas. Let's keep it...,video
2,57265377,M,2,#TBT to this classic video. #DonkeyWhisperer,video
3,57265377,M,3,Since President Donald J. Trump was sworn in o...,video
4,57265377,M,4,Remembering our 40th president today. LIKE to ...,video


In [6]:
# this is the number of unique posts
# not all the posts necessarily have responses 
postsDf['post_id'].nunique()

548341

In [7]:
#occurrences of each post type
postsDf['post_type'].value_counts()

link      210499
photo     199123
status     87921
video      46842
event       3417
music        278
note         261
Name: post_type, dtype: int64

In [8]:
responsesDf.head()

Unnamed: 0,op_id,op_gender,post_id,responder_id,response_text,op_name,op_category
0,57265377,M,0,Jerry,Protecting birth is not the same as protecting...,Roger Williams,Congress_Republican
1,57265377,M,0,Andrea,You need to protect children and leave my body...,Roger Williams,Congress_Republican
2,57265377,M,0,Sherry,Thank you,Roger Williams,Congress_Republican
3,57265377,M,0,Bob,Thank you Roger,Roger Williams,Congress_Republican
4,57265377,M,0,Joy,Unwanted pregnancy is a sad and unfortunate si...,Roger Williams,Congress_Republican


In [9]:
responsesDf.shape

(13866507, 7)

In [None]:
#number of unique congress members 
len(responsesDf['op_id'].unique())

In [14]:
# number of male and female congresspeople in the data set 
unique_congresspeople = responsesDf.drop_duplicates('op_name')
unique_congresspeople.op_gender.value_counts()

M    306
W     96
Name: op_gender, dtype: int64

In [None]:
# TODO: change this merge so that we ONLY get the names, not all the other stuff 
unique_congresspeople = unique_congresspeople.filter(items=['op_name', 'op_id'])
postsDf = postsDf.merge(unique_congresspeople, how='left', on='op_id')

In [None]:
grouped_posts = postsDf.groupby('op_id')

In [None]:
# gut check to be sure every op_id just lines up with one name
# it looks like none of them have more than 1 name, but 5 of them are missing a name?
# that seems like it needs to be cleaned up 
test = grouped_posts['op_name'].nunique()
print(max(test))
print(min(test))
print(len([x for x in test if x==0]))

In [None]:
# this shows us that there are 5 more congressperson groups than we had names 
len(grouped_posts)

### Cells for exploring unnamed congresspeople

In [None]:
postsDf[postsDf['op_name'].isnull()].op_id.value_counts()

In [None]:
# googling some of these and playing around helped me figure out that one of the blank ids is Scott Tipton
# we can probably use google to figure out the other ones too

print(postsDf[postsDf['op_id']==42721680]['post_text'][224602])
# according to Google, this was said by Scott Tipton

print("\n" + postsDf[postsDf['op_id']==44922372]['post_text'][508441])
# according to Google, this was said by Martin Heinrich

print("\n" + postsDf[postsDf['op_id']==54138093]['post_text'][260136])
# according to Google, this was said by Mac Thornberry

print("\n" + postsDf[postsDf['op_id']==96418867]['post_text'][511455])
# according to Google, this was said by Darren Soto

print("\n" + postsDf[postsDf['op_id']==58288189]['post_text'][466543])
# this one doesn't really seem like it was said by a real congressperson
# we should remove it from the set

In [None]:
# confirm that none of the congress people we just identified are already in the original set
all_congress_names = unique_congresspeople.op_name.values
no_matches_found = True
for name in ['Tipton', 'Thornberry', 'Heinrich', 'Soto']:
    for confirmed_name in all_congress_names:
        if name in confirmed_name:
            print("Match found: {} and {}".format(name, confirmed_name))
            no_matches_found = False
if no_matches_found:
    print("no matches found!")

In [None]:
# add in all the correct names
postsDf.loc[postsDf.op_id==42721680, 'op_name'] = 'Scott Tipton'
postsDf.loc[postsDf.op_id==44922372, 'op_name'] = 'Martin Heinrich'
postsDf.loc[postsDf.op_id==54138093, 'op_name'] = 'Mac Thornberry'
postsDf.loc[postsDf.op_id==96418867, 'op_name'] = 'Darren Soto'

In [None]:
# confirm - there should just be one now
postsDf[postsDf['op_name'].isnull()]

In [None]:
# drop the one
print(postsDf.shape)
postsDf = postsDf[~postsDf['op_name'].isnull()]
print(postsDf.shape)

In [None]:
# remake the grouped df with these changes 
# there should be 406 groups now 
grouped_posts = postsDf.groupby('op_id')
len(grouped_posts)

In [None]:
# make separate male and female grouped posts dfs
f_grouped_posts=postsDf[postsDf['op_gender']=='W'].groupby('op_id')
print(len(f_grouped_posts))
m_grouped_posts=postsDf[postsDf['op_gender']=='M'].groupby('op_id')
len(m_grouped_posts)

## Overall Distributions

### Number of Posts per Congress Member

In [None]:
# distribution of number of posts per congress person
plt.hist(grouped_posts.size(), bins=20)
plt.xlabel("Number of posts")
plt.ylabel("Number of congress members")
plt.title("Number of posts per Congress Member")
plt.show()

grouped_posts.size().describe()

In [None]:
plt.hist(m_grouped_posts.size(), bins=20, label='M')
plt.hist(f_grouped_posts.size(), bins=20, label='F')
plt.legend()
plt.xlabel("Number of posts")
plt.ylabel("Number of congress members")
plt.title("Number of posts per Congress Member")
plt.show()

In [None]:
print("Male:\n") 
print(m_grouped_posts.size().describe())
print("\nFemale:\n")
print(f_grouped_posts.size().describe())

**Finding**: While there are more male than female congresspeople, men and women have around the same number of posts on average, and the distributions of number of posts look pretty similar for men and women. 

### Number of Words Per Post

In [None]:
# distribution of word counts of congresspeople's posts
## TODO: make tokenization a little better here
def get_word_count(row):
    post = str(row['post_text'])
    return len(post.split())

postsDf['word_count'] = postsDf.apply(get_word_count, axis=1)

In [None]:
#various views of post distribution 
plt.hist(postsDf['word_count'], range=(0,500), bins=20)
plt.xlabel("Number of words")
plt.ylabel("Number of posts")
plt.title("Number of words per post")

postsDf['word_count'].describe()

In [None]:
# very skewed data set. even though max is 3000ish, 
# most of the data is in this plot 
plt.hist(postsDf['word_count'], range=(0,100), bins=20)
plt.xlabel("Number of words")
plt.ylabel("Number of posts")
plt.title("Number of words per post")

In [None]:
plt.hist(postsDf[postsDf.op_gender=='M']['word_count'], range=(0,100), bins=20, label='M')
plt.hist(postsDf[postsDf.op_gender=='W']['word_count'], range=(0,100), bins=20, label='F')
plt.legend()
plt.xlabel("Number of words")
plt.ylabel("Number of posts")
plt.title("Number of words per post")
plt.show()

In [None]:
print("Male:\n") 
print(postsDf[postsDf.op_gender=='M']['word_count'].describe())
print("\nFemale:\n")
print(postsDf[postsDf.op_gender=='W']['word_count'].describe())

**Finding**: While there are more male than female congresspeople, men and women have around the same average post word length, and the distributions of post word lengths look pretty similar for men and women. 

In [None]:
## number of posts with 1 word, and other common numbers of words
postsDf['word_count'].value_counts()[:10]

In [None]:
# exploration of the 1-word posts: many are NaN (presumably they only have a video, or some other media)
# some are hashtags or links 
postsDf[postsDf['word_count']==1]['post_text'].value_counts(dropna=False)

### Number of Responses Per Post

In [None]:
responses_grouped_by_post = responsesDf.groupby('post_id')

In [None]:
m_responses_grouped_by_post = responsesDf[responsesDf['op_gender']=='M'].groupby('post_id')
f_responses_grouped_by_post = responsesDf[responsesDf['op_gender']=='W'].groupby('post_id')

In [None]:
plt.hist(responses_grouped_by_post.size(), range=(0,100))
plt.xlabel("Number of responses")
plt.ylabel("Number of original posts")
plt.title("Number of responses per post")

In [None]:
# there are relatively few responses per post, with a long tail 
# (standard deviation is way higher than the mean )
responses_grouped_by_post.size().describe()

In [None]:
plt.hist(m_responses_grouped_by_post.size(), range=(0,100), bins=20, label='M')
plt.hist(f_responses_grouped_by_post.size(), range=(0,100), bins=20, label='F')
plt.legend()
plt.xlabel("Number of responses")
plt.ylabel("Number of original posts")
plt.title("Number of responses per post")
plt.show()

In [None]:
print("Male:\n")
print(m_responses_grouped_by_post.size().describe())
print("\nFemale:\n")
print(f_responses_grouped_by_post.size().describe())

**Findings**: Once again, remarkable parity between men and women here in terms of the distribution of number of responses per post. 

In [None]:
posts_w_number_responses = responses_grouped_by_post.size().reset_index()
posts_w_number_responses.columns = ['post_id', 'num_responses']
posts_w_number_responses = posts_w_number_responses.merge(postsDf, how='left', on='post_id')
posts_w_number_responses.head()

In [None]:
print("Total number of posts w/ any responses: {}".format(len(posts_w_number_responses)))

def print_posts_with_n_responses(n):
    posts_w_n_responses = posts_w_number_responses[posts_w_number_responses['num_responses'] > n]
    print("\nNumber of posts w/ > {} responses: {} ({:.4f})".format(n, len(posts_w_n_responses), len(posts_w_n_responses) / len(posts_w_number_responses)))
    print("Number of congress people with at least one post above > {} responses: {}".format(n, posts_w_n_responses.op_name.nunique()))
    print("Top 10:\n {}".format(posts_w_n_responses.op_name.value_counts()[:10]))
    return posts_w_n_responses
    
posts_w_100_r = print_posts_with_n_responses(100)
posts_w_1000_r = print_posts_with_n_responses(1000)
posts_w_10000_r = print_posts_with_n_responses(10000)
    

In [None]:
# the big-response posts are normally from well-known congress members, 
# plus they are talking about controversial topics (Trump, Betsy DeVos, Israel, Clinton, immigration)
# idk what to make of the #FridayPuppy stuff 

for i in range(29):
    print(posts_w_10000_r.iloc[i]['op_name'] + ": " + posts_w_10000_r.iloc[i]['post_text'] + "\n")

In [None]:
del responses_grouped_by_post
del m_responses_grouped_by_post
del f_responses_grouped_by_post

### Lengths of Responses

In [None]:
# distribution of word counts of congresspeople's posts
## TODO: make tokenization a little better here
def get_words(row):
    post = str(row['response_text'])
    return post.split()

def get_word_count(row):
    return len(row['words'])

In [None]:
del responsesDf

In [None]:
responses_dfs = []
chunk_size = 1000000

In [None]:
count = 0
for responsesDf in pd.read_csv(responses, chunksize=chunk_size):
    print("applying to df {}".format(count + 1))
    responsesDf['words'] = responsesDf.apply(get_words, axis=1)
    responsesDf = responsesDf.filter(items=['op_id', 'post_id', 'op_name', 'op_gender', 'words'])
    responsesDf['word_counts'] = responsesDf.apply(get_word_count, axis=1)
    responses_dfs.append(responsesDf)
    count += 1

In [None]:
lengths = []
m_lengths = []
f_lengths = []
for responseDf in responses_dfs:
    lengths.extend(responseDf.word_counts.values)
    m_lengths.extend(responseDf[responseDf['op_gender']=='M'].word_counts.values)
    f_lengths.extend(responseDf[responseDf['op_gender']=='W'].word_counts.values)

In [None]:
plt.hist(lengths, range=(0,100))

In [None]:
pd.set_option('display.float_format', lambda x: '%.5f' % x)
pd.Series(lengths).describe()

In [None]:
lengths = np.array(lengths)
print("Number of responses with length > 100: {}".format(sum(lengths > 100)))
print("Number of responses with length > 1000: {}".format(sum(lengths > 1000)))

In [None]:
plt.hist(m_lengths, range=(0,100), bins=20, label='M')
plt.hist(f_lengths, range=(0,100), bins=20, label='F')
plt.legend()
plt.show()

In [None]:
m_lengths = np.array(m_lengths)
f_lengths = np.array(f_lengths)

print(pd.Series(m_lengths).describe())
print()
print(pd.Series(f_lengths).describe())

In [None]:
# TODO: tf-idf

In [None]:
del responses_dfs

In [None]:
responsesDf = pd.read_csv(responses)

In [None]:
del responsesDf

In [None]:
corpus = corpus.astype('U')

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
vectorizer.fit_transform(corpus)

### References to known entities

In [51]:
responsesDf.shape

(13866507, 7)

In [16]:
unique_congresspeople = unique_congresspeople.op_name.values

In [52]:
full_name_to_tokens = {}

def get_tokens_from_full_name(full_name):
    all_tokens = []
    words_in_name = full_name.split()
    all_tokens.extend(words_in_name)
    all_tokens.extend([word.lower() for word in words_in_name])
    full_name_to_tokens[full_name] = all_tokens
    
for congress_member in unique_congresspeople:
    get_tokens_from_full_name(congress_member)

In [53]:
def find_full_name_occurrences_in_response(row):
    if row['op_name'].lower() in str(row['response_text']).lower():
        return 1
    else:
        return 0
    
responsesDf['full_name_in_response_indicator'] = responsesDf.apply(find_full_name_occurrences_in_response, axis=1)

In [54]:
def find_name_token_occurrences(row):
    if row['full_name_in_response_indicator'] != 1:
        tokens_to_check = full_name_to_tokens[row['op_name']]
        if any(t in str(row['response_text']) for t in tokens_to_check):
            return 1
        else:
            return 0 
    else:
        return 0 
    
responsesDf['name_token_in_response_indicator'] = responsesDf.apply(find_name_token_occurrences, axis=1)

In [55]:
def find_gendered_title(row):
    word_to_look_for = 'congressman'
    if row['op_gender']=='W':
        word_to_look_for = 'congresswoman'
    if word_to_look_for in str(row['response_text']).lower():
        return 1
    else:
        return 0 
    
responsesDf['gendered_title_indicator'] = responsesDf.apply(find_gendered_title, axis=1)

In [56]:
def has_at_least_one_name_indicator(row):
    all_inds = [row['gendered_title_indicator'], row['name_token_in_response_indicator'], row['full_name_in_response_indicator']]
    if any(ind==1 for ind in all_inds):
        return 1
    else:
        return 0
    
responsesDf['any_indicators'] = responsesDf.apply(has_at_least_one_name_indicator, axis=1)

In [59]:
print("Proportion of responses with poster full name: {:.3f}".format(responsesDf.full_name_in_response_indicator.value_counts()[1] / 
                                                                 len(responsesDf)))
print("Proportion of responses with part of poster name: {:.3f}".format(responsesDf.name_token_in_response_indicator.value_counts()[1] / 
                                                                 len(responsesDf)))
print("Proportion of responses with Congressman/woman: {:.3f}".format(responsesDf.gendered_title_indicator.value_counts()[1] / 
                                                                 len(responsesDf)))
print("Proportion of responses with at least one of the above: {:.3f}".format(responsesDf.any_indicators.value_counts()[1] / 
                                                                 len(responsesDf)))

Proportion of responses with poster full name: 0.039
Proportion of responses with part of poster name: 0.152
Proportion of responses with Congressman/woman: 0.017
Proportion of responses with at least one of the above: 0.200


## Annotations

In [None]:
annotations = "rtgender//annotations.csv"

In [None]:
annotationsDf = pd.read_csv(annotations)

In [None]:
annotationsDf = annotationsDf[annotationsDf['source']=='facebook_congress']

In [None]:
annotationsDf.head()

In [None]:
annotationsDf.sentiment.value_counts()

In [None]:
annotationsDf.relevance.value_counts()