# BLM Activity and Sentiment Analysis

Steps:

1. Divide each major period into 3 sub-periods using change point analysis. This was performed in `tweet_counts.ipynb`.
2. For each period...
   * Process all tweets 
      * Process each tweet through `TweetsManager.process_tweet()`
      * Call `TweetsManager.process_deferred_interactions()` to handle retweet/reply corner cases.
      * Call `TweetsManager.analyze_graph()` to detect communities.
   * Discard too-small communities.
   * Assign BLM stance to communities.
      * Generate reports on largest communities.
      * Manually assign stance to those communities.
      * Build/use prediction model for remaining communities.
   * Save everything.

In [5]:
from   collections import Counter
import json
from   os import path
import pandas as pd
import re
from   string import Template
import numpy as np

from elasticsearch import Elasticsearch as ES
from elasticsearch.helpers import scan

from blm_activity_db import BlmActivityDb
from community_classifier import get_blm_classifier
from community_report import generate_init_community_report
from tweet_mgr import TweetsManager, CommunityActivity, Stance
from tweet_sentiment import EmoScores, PronounCounts, SentimentAnalysis


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Configurable parameters
start_date = "2014-08-09"
end_date = "2014-12-26"
es_idx = 'tweets'
# Template for query_body
#query_body = {
#  "query": {
#      "range": {
#          "doc.created_at": {
#            "gte": "Sun Nov 23 00:00:00 +0000 2014",
#            "lt": "Wed Dec 24 00:00:00 +0000 2014"
#          }
#      }
#  }
#}

query_body = {
  "query": {
      "range": {
          "doc.created_at": {
              "lt": "Sat Nov 22 00:00:00 +0000 2014"
          }
      }
  }
}

period = 1
num_init_communities = 25
num_exemplars = 10



In [7]:
tm = TweetsManager()

es = ES(hosts=["localhost"])
scan_iter = scan(es, index=es_idx, query=query_body)
for result in scan_iter:
    tweet = result['_source']
    tm.process_tweet(tweet)

In [8]:
def get_tweet_text_by_id(id_):
    doc = es.get(index=es_idx, id=id_)
    return doc['_source']['doc']['text']

In [9]:
tm.process_deferred_interactions()
tm.analyze_graph(n_iterations=10)

In [10]:
community_size = Counter() # num_accounts -> num of communities having that size 
unique_tweets = Counter() # num_unique_tweets -> num of communities with that activity
for _, accounts in tm.community_user_map.items():
    community_size[len(accounts)] += 1
for c_activity in tm.community_activity_map.values():
    num_unique_tweets = c_activity.num_tweets - len(c_activity.retweet_sentiment_analyses)
    unique_tweets[num_unique_tweets] += 1


In [11]:
total_unique_tweets = {k: k*v for k, v in unique_tweets.items()}
for k, v in total_unique_tweets.items():
    print(k, ": ", v)

5708 :  5708
4524 :  4524
4096 :  4096
1342 :  1342
2272 :  2272
1652 :  1652
3337 :  3337
3632 :  3632
1120 :  1120
1353 :  1353
1841 :  1841
608 :  608
661 :  661
769 :  769
749 :  749
867 :  867
660 :  660
307 :  307
156 :  312
541 :  541
276 :  276
212 :  212
209 :  209
269 :  269
557 :  557
452 :  452
279 :  279
649 :  649
229 :  229
366 :  366
641 :  641
267 :  267
607 :  607
1123 :  1123
220 :  220
138 :  138
309 :  309
180 :  180
135 :  135
227 :  227
195 :  195
50 :  50
55 :  55
191 :  191
108 :  108
17 :  17
4 :  72
6 :  42
7 :  84
12 :  36
1 :  413
2 :  232
5 :  65
8 :  24
3 :  138
9 :  45
14 :  14
18 :  18
10 :  20


In [12]:
print("total tweets:", len(tm.tweets))
print("total unique tweets:", sum(v for v in total_unique_tweets.values()))
print(len(tm.user_community_map), "accounts in", len(tm.community_user_map), "communities")
print("If 25 used as threshold for unique tweets per community...")
num_communities = sum(v for k, v in unique_tweets.items() if k >= 25)
num_unique_tweets = sum(v for k, v in total_unique_tweets.items() if k >= 25)
print(num_communities, "communities and", num_unique_tweets, "unique tweets remain.")

total tweets: 89404
total unique tweets: 45515
27018 accounts in 687 communities
If 25 used as threshold for unique tweets per community...
46 communities and 44295 unique tweets remain.


In [13]:
tm.filter_low_activity_communities(unique_tweets_threshold=25)

In [14]:
# Get Initial Report
report_dir = f'../data/Reports/{period}/'
report_name = "Largest Communities Hashtags and Tweets"
report_path = report_dir + f"{report_name}.md"

report = f"# {report_name} in Period {period}\n\n"
# Add section for each of top num_init_communities by membership count
comm_user_counts = sorted(tm.community_user_map.items(), key=lambda x: len(x[1]), reverse=True)
for k, (comm_id, members) in enumerate(comm_user_counts):
    if k == num_init_communities:
        break
    num_members = len(members)
    ca: CommunityActivity = tm.community_activity_map[comm_id]
    num_tweets = ca.num_tweets
    num_retweets = sum(count for count in ca.retweet_counter.values())
    hashtags = []
    ht_counts = []
    meme_counts = sorted(ca.meme_counter.items(), key=lambda x:x[1], reverse=True)
    for i, (tag, count) in enumerate(meme_counts):
        if i == num_exemplars:
            break
        hashtags.append(tag)
        ht_counts.append(count)
    tweet_ids = []
    rt_counts = []
    retweet_counts = sorted(ca.retweet_counter.items(), key=lambda x:x[1], reverse=True)
    for i, (tweet_id, count) in enumerate(retweet_counts):
        if i == num_exemplars:
            break
        tweet_ids.append(tweet_id)
        rt_counts.append(count)
    rts = []
    for id_ in tweet_ids:
        if id_ in tm.tweets:
            rts.append(tm.tweets[id_])
        else:
            rts.append(get_tweet_text_by_id(id_))
    report += generate_init_community_report(
        comm_id,
        num_members,
        num_tweets,
        num_retweets,
        hashtags, 
        ht_counts, 
        rts, 
        rt_counts,
)
with open(report_path, 'w', encoding="utf-8") as f:
    f.write(report)

In [15]:
counter_comm_ids = [12]

In [16]:
blm_comm_ids = [i for i in range(num_init_communities) if not i in counter_comm_ids]

In [17]:
   
def get_tweet_texts(tweet_ids, tm):
    texts = []
    for id_ in tweet_ids:
        if id_ in tm.tweets:
            texts.append(tm.tweets[id_])
        else:
            texts.append(get_tweet_text_by_id(id_))
    return texts

blm_retweet_set = set()
counter_retweet_set = set()
for k, (comm_id, _ ) in enumerate(comm_user_counts):
    if k == num_init_communities:
        break
    num_exemplars = 288 if comm_id in counter_comm_ids else 18
    retweet_counts = sorted(
        tm.community_activity_map[comm_id].retweet_counter.items(), 
        key=lambda x:x[1], 
        reverse=True
    )
    for i, (tweet_id, _ ) in enumerate(retweet_counts):
        if i == num_exemplars:
            break
        if comm_id in counter_comm_ids:
            counter_retweet_set.add(tweet_id)
        else:
            blm_retweet_set.add(tweet_id)
blm_retweets = get_tweet_texts(blm_retweet_set, tm)
counter_retweets = get_tweet_texts(counter_retweet_set, tm)
blm_clf, cv_results = get_blm_classifier(blm_retweets, counter_retweets)    

In [18]:
results_df = pd.DataFrame(cv_results)
print(results_df)

   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       1.059491      0.071854         0.259808        0.016334   
1       1.091161      0.019412         0.269096        0.006451   
2       1.078376      0.012611         0.270996        0.006456   

  param_model__alpha param_model__fit_prior  \
0                  1                  False   
1                  1                  False   
2                  1                  False   

                               param_vec__stop_words  \
0          [rt, #blacklivesmatter, blacklivesmatter]   
1  [rt, blacklivesmatter, #blacklivesmatter, i, m...   
2  [i, me, my, myself, we, our, ours, ourselves, ...   

                                              params  split0_test_score  \
0  {'model__alpha': 1.0, 'model__fit_prior': Fals...           0.833333   
1  {'model__alpha': 1.0, 'model__fit_prior': Fals...           0.854167   
2  {'model__alpha': 1.0, 'model__fit_prior': Fals...           0.854167   

   split1_test_s

In [19]:
num_samples_per_comm = 25
unknown_comm_ids = []
for k, (comm_id, _ ) in enumerate(comm_user_counts):
    if k < num_init_communities:
        continue
    sample_tweet_ids = []
    counts = []
    retweet_counts = sorted(
        tm.community_activity_map[comm_id].retweet_counter.items(), 
        key=lambda x:x[1], 
        reverse=True
    )
    for i, (tweet_id, count) in enumerate(retweet_counts):
        if i == num_samples_per_comm:
            break
        sample_tweet_ids.append(tweet_id)
        counts.append(count)
    if sum(counts) < num_samples_per_comm:
        unknown_comm_ids.append(comm_id)
        continue
    sample_tweets = get_tweet_texts(sample_tweet_ids, tm)
    blm_predictions = blm_clf.predict(sample_tweets)
    weighted_sum = np.dot(np.array(counts), np.array(blm_predictions))
    blm_stance_probability = weighted_sum / sum(counts)
    if blm_stance_probability < 0.40:
        counter_comm_ids.append(comm_id)
    elif blm_stance_probability > 0.60:
        blm_comm_ids.append(comm_id)
    else:
        unknown_comm_ids.append(comm_id)

In [20]:
print(f"counter communities: {counter_comm_ids}")
print(f"{len(unknown_comm_ids)} unknown communities")
print(f"{len(blm_comm_ids)} BLM communities")

counter communities: [12]
0 unknown communities
45 BLM communities


In [21]:
for id_ in counter_comm_ids:
    tm.community_activity_map[id_].stance = Stance.CounterProtest
for id_ in unknown_comm_ids:
    tm.community_activity_map[id_].stance = Stance.Unknown
for id_ in blm_comm_ids:
    tm.community_activity_map[id_].stance = Stance.Protest

In [22]:
db = BlmActivityDb(initialize_db=True) #TODO - use default False on subsequent periods
db.save_tweets_mgr(tm, period)


In [28]:
movement_template = Template('''
## MOVEMENT $movement

Communities: $num_communities  
Members: $num_members  
Retweets: $num_retweets  
Tweets: $num_tweets

### Top Hashtags

| Count | Hashtag |
|------:|:------|
$hashtag_list

### Top Retweets

| Count | Tweet |
|------:|:------|
$retweet_list

### Sentiment

All Tweet Polarity = $at_polarity  
Retweet Polarity = $rt_polarity

### Emotions

| Emotion | All Tweets | Retweets |
|------:|:------:|:-------|
$emo_list

### Pronoun Usage

| Person | All Tweets | Retweets |
|------:|:------:|:-------|
| First Singular | $first_singular_at | $first_singular_rt |
| First Plural | $first_plural_at | $first_plural_rt |
| Second | $second_at | $second_rt |
| Third | $third_at | $third_rt |

''')

def store_movement_reports(movement, report_dir, comm_ids, tm):
    '''Write files with salient data on BLM or counter movement during a period
    
    Parameters:
    -----------
    movement : str
        "BLM" or "Counter"
    report_dir : str
        FS directory where reports are to be written
    comm_ids : list of int
        IDs of communities in movement
    tm : TweetManager instance
    '''
    # movement stats
    ## counts
    num_examples = 25
    num_communities = len(comm_ids)
    if num_communities == 0:
        return
    num_members = 0
    total_tweets = 0
    total_retweets = 0
    meme_counter = Counter()
    retweet_counter = Counter()
    retweet_pc, retweet_emo, retweet_sentiment = PronounCounts(), EmoScores(), 0.0
    all_tweet_pc, all_tweet_emo, all_tweet_sentiment = PronounCounts(), EmoScores(), 0.0
    for community_id in comm_ids:
        num_members += len(tm.community_user_map[community_id])
        c_activity = tm.community_activity_map[community_id]
        total_tweets += c_activity.num_tweets
        for tweet_id, count in c_activity.retweet_counter.items():
            total_retweets += count
            retweet_counter[tweet_id] += count
        for meme, count in c_activity.meme_counter.items():
            meme_counter[meme] += count
        rss = c_activity.retweet_sentiment_summary
        num_retweets = len(c_activity.retweet_sentiment_analyses)
        retweet_pc += rss.pronoun_counts * num_retweets
        retweet_emo += rss.emo_scores * num_retweets
        retweet_sentiment += rss.sentiment * num_retweets
        atss = c_activity.all_sentiment_summary
        all_tweet_pc += atss.pronoun_counts * c_activity.num_tweets
        all_tweet_emo += atss.emo_scores * c_activity.num_tweets
        all_tweet_sentiment += atss.sentiment * c_activity.num_tweets
    
    retweet_pc /= total_retweets
    retweet_emo /= total_retweets
    retweet_sentiment /= total_retweets
    all_tweet_pc /= total_tweets
    all_tweet_emo /= total_tweets
    all_tweet_sentiment /= total_tweets

    ## 25 most important hashtags             
    top_memes = sorted(meme_counter.items(), key = lambda x: x[1], reverse = True)
    hashtag_list = ""
    for i, (ht, count) in enumerate(top_memes):
        if i == num_examples:
            break
        hashtag_list += f"| {count} | {ht} |\n"
    
    ## 25 most retweeted
    top_retweets = sorted(retweet_counter.items(), key = lambda x: x[1], reverse = True)
    retweet_list = ""
    line_feeds = re.compile("[\r\n]")
    for i, (tweet_id, count) in enumerate(top_retweets):
        if i == num_examples:
            break
        tweet = get_tweet_text_by_id(tweet_id)
        tweet = line_feeds.sub('', tweet)
        retweet_list += f"| {count} | {tweet} |\n"
    
    ## emotions
    emo_list = ""
    emo_list += f"| trust | {round(all_tweet_emo.trust, 3)} | {round(retweet_emo.trust, 3)} |\n"
    emo_list += f"| anticipation | {round(all_tweet_emo.anticipation, 3)} | {round(retweet_emo.anticipation, 3)} |\n"
    emo_list += f"| joy | {round(all_tweet_emo.joy, 3)} | {round(retweet_emo.joy, 3)} |\n"
    emo_list += f"| surprise | {round(all_tweet_emo.surprise, 3)} | {round(retweet_emo.surprise, 3)} |\n"
    emo_list += f"| anger | {round(all_tweet_emo.anger, 3)} | {round(retweet_emo.anger, 3)} |\n"
    emo_list += f"| disgust | {round(all_tweet_emo.disgust, 3)} | {round(retweet_emo.disgust, 3)} |\n"
    emo_list += f"| fear | {round(all_tweet_emo.fear, 3)} | {round(retweet_emo.fear, 3)} |\n"
    emo_list += f"| sadness | {round(all_tweet_emo.sadness, 3)} | {round(retweet_emo.sadness, 3)} |\n"

    
    ## Write to file
    subs = {
        "movement": movement,
        "num_communities": num_communities,
        "num_members": num_members,
        "num_tweets": total_tweets,
        "num_retweets": total_retweets,
        "hashtag_list": hashtag_list,
        "retweet_list": retweet_list,
        "at_polarity": round(all_tweet_sentiment, 3),
        "rt_polarity": round(retweet_sentiment, 3),
        "emo_list": emo_list,
        "first_singular_at": round(all_tweet_pc.first_singular, 3),
        "first_singular_rt": round(retweet_pc.first_singular, 3),
        "first_plural_at": round(all_tweet_pc.first_plural, 3),
        "first_plural_rt": round(retweet_pc.first_plural, 3),
        "second_at": round(all_tweet_pc.second, 3),
        "second_rt": round(retweet_pc.second, 3),
        "third_at": round(all_tweet_pc.third, 3),
        "third_rt": round(retweet_pc.third, 3),
    }    
    movement_summary = movement_template.safe_substitute(subs)
    report_name = f"{movement}_summary.md"
    report_path = path.join(report_dir, report_name)
    with open(report_path, 'w', encoding="utf-8") as f:
        f.write(movement_summary)
        
store_movement_reports("Counter", report_dir, counter_comm_ids, tm)
store_movement_reports("BLM", report_dir, blm_comm_ids, tm)
store_movement_reports("Unknown", report_dir, unknown_comm_ids, tm)


In [31]:
# serialize the graph
graph_file_name = "graph.pkl"
dir_stem = "D:/BLM-db/graphs/" + f"{period}/"  
graph_file_path = path.join(dir_stem, graph_file_name)
tm.urg.g.write_pickle(graph_file_path, version = -1)

In [20]:
# overview report
overview_template = Template('''
## OVERVIEW of PERIOD $start_date to $end_date

| What  | How Many |
|:-------|--------:|
| Tweets | $num_tweets |
| Retweets | $num_retweets |  
| Communities | $num_communities |  
| Accounts | $num_accounts |
| Size of largest community | $largest_comm_size |

''')
num_tweets = sum(tm.user_tweet_counter.values())
num_retweets = sum(tm.user_retweeted_frequency.values())
num_communities = len(tm.community_user_map)
num_accounts = len(tm.user_tweet_counter)
largest_comm_size = len(tm.community_user_map[0])

subs = {
    'start_date': start_date,
    'end_date': end_date,
    'num_tweets': num_tweets,
    'num_retweets': num_retweets,
    'num_communities': num_communities,
    'num_accounts': num_accounts,
    'largest_comm_size': largest_comm_size,
}
overview_report_name = "OverviewReport.md"
overview_path = path.join(report_dir, overview_report_name)
overview = overview_template.safe_substitute(subs)
with open(overview_path, 'w', encoding="utf-8") as f:
    f.write(overview)


In [21]:
# inter-community dialog report
blm_member_df.columns

Index(['community_id', 'user_id', 'tweet_count'], dtype='object')

In [22]:
size_df = blm_member_df[["community_id", "user_id"]].rename(columns={'user_id':'size'}).groupby('community_id').count()
tweet_count_df = blm_member_df[["community_id", "tweet_count"]].groupby('community_id').sum()
blm_comm_df = size_df.merge(tweet_count_df, on="community_id")
blm_comm_df["avg_tweets"] = blm_comm_df["tweet_count"]/blm_comm_df["size"]
blm_comm_df["internal_retweets"] = list(map(lambda x: sum(tm.community_retweet_counter[x].values()), blm_comm_ids))
blm_comm_df["retweet_pct"] = blm_comm_df["internal_retweets"]/blm_comm_df["tweet_count"]
blm_comm_df

Unnamed: 0_level_0,size,tweet_count,avg_tweets,internal_retweets,retweet_pct
community_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,86219,620870,7.201081,370870,0.597339
2,7073,12668,1.791036,12422,0.980581
3,5433,8224,1.513712,5836,0.709630
4,3759,6291,1.673583,4058,0.645048
5,3758,5338,1.420436,3868,0.724616
...,...,...,...,...,...
91,170,530,3.117647,251,0.473585
92,145,260,1.793103,161,0.619231
142,17,162,9.529412,58,0.358025
770,5,66,13.200000,35,0.530303


In [23]:
reply_to_cp_counter = Counter()
reply_from_cp_counter = Counter()
for comm_pair in tm.inter_comm_reply_counter:
    replying_comm = comm_pair[0]
    reply_to_comm = comm_pair[1]
    if reply_to_comm in counter_comm_ids and replying_comm in counter_comm_ids:
        continue
    if reply_to_comm in counter_comm_ids:
        reply_to_cp_counter[replying_comm] += tm.inter_comm_reply_counter[comm_pair]
    if replying_comm in counter_comm_ids:
        reply_from_cp_counter[reply_to_comm] += tm.inter_comm_reply_counter[comm_pair]
        
blm_comm_df["replies_to_cp"] = list(map(lambda x: reply_to_cp_counter[x], blm_comm_ids))
blm_comm_df["replies_from_cp"] = list(map(lambda x: reply_from_cp_counter[x], blm_comm_ids))
blm_comm_df

Unnamed: 0_level_0,size,tweet_count,avg_tweets,internal_retweets,retweet_pct,replies_to_cp,replies_from_cp
community_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,86219,620870,7.201081,370870,0.597339,0,0
2,7073,12668,1.791036,12422,0.980581,0,2
3,5433,8224,1.513712,5836,0.709630,4,2
4,3759,6291,1.673583,4058,0.645048,17,3
5,3758,5338,1.420436,3868,0.724616,0,3
...,...,...,...,...,...,...,...
91,170,530,3.117647,251,0.473585,2,7
92,145,260,1.793103,161,0.619231,0,0
142,17,162,9.529412,58,0.358025,0,0
770,5,66,13.200000,35,0.530303,0,0


In [24]:
comm_retweeted_counter = Counter()
for comm_pair in tm.inter_comm_retweet_counter:
    retweeted_comm = comm_pair[1]
    comm_retweeted_counter[retweeted_comm] += tm.inter_comm_retweet_counter[comm_pair]
blm_comm_df["retweeted_external"] = list(map(lambda x: comm_retweeted_counter[x], blm_comm_ids))
blm_comm_df    

Unnamed: 0_level_0,size,tweet_count,avg_tweets,internal_retweets,retweet_pct,replies_to_cp,replies_from_cp,retweeted_external
community_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,86219,620870,7.201081,370870,0.597339,0,0,30154
2,7073,12668,1.791036,12422,0.980581,0,2,287
3,5433,8224,1.513712,5836,0.709630,4,2,2404
4,3759,6291,1.673583,4058,0.645048,17,3,2648
5,3758,5338,1.420436,3868,0.724616,0,3,2102
...,...,...,...,...,...,...,...,...
91,170,530,3.117647,251,0.473585,2,7,61
92,145,260,1.793103,161,0.619231,0,0,35
142,17,162,9.529412,58,0.358025,0,0,5
770,5,66,13.200000,35,0.530303,0,0,0


In [25]:
def is_blm_retweeting_cp(x):
    return x[0] in blm_comm_ids and x[1] in counter_comm_ids
communities_retweeting_cp = list(filter(is_blm_retweeting_cp, tm.inter_comm_retweet_counter.keys()))
cp_retweet_counts = Counter()
for pair in communities_retweeting_cp:
    retweet_comm = pair[0]
    cp_retweet_counts[retweet_comm] += tm.inter_comm_retweet_counter[pair]
blm_comm_df["cp_retweets"] = [cp_retweet_counts[c] for c in blm_comm_ids]
blm_comm_df["cp_retweet_pct"] = blm_comm_df["cp_retweets"] / (blm_comm_df["cp_retweets"] + blm_comm_df["internal_retweets"])
blm_comm_df

Unnamed: 0_level_0,size,tweet_count,avg_tweets,internal_retweets,retweet_pct,replies_to_cp,replies_from_cp,retweeted_external,cp_retweets,cp_retweet_pct
community_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,86219,620870,7.201081,370870,0.597339,0,0,30154,956,0.002571
2,7073,12668,1.791036,12422,0.980581,0,2,287,0,0.000000
3,5433,8224,1.513712,5836,0.709630,4,2,2404,5,0.000856
4,3759,6291,1.673583,4058,0.645048,17,3,2648,7,0.001722
5,3758,5338,1.420436,3868,0.724616,0,3,2102,6,0.001549
...,...,...,...,...,...,...,...,...,...,...
91,170,530,3.117647,251,0.473585,2,7,61,0,0.000000
92,145,260,1.793103,161,0.619231,0,0,35,0,0.000000
142,17,162,9.529412,58,0.358025,0,0,5,0,0.000000
770,5,66,13.200000,35,0.530303,0,0,0,0,0.000000


In [26]:
report_name = "BLM_Community_Summary.csv"
report_path = path.join(report_dir, report_name)
blm_comm_df.to_csv(report_path)

In [None]:
# Configurable parameters
start_date = "2014-11-23"
end_date = "2014-12-23"
es_idx = 'tweets'
query_body = {
  "query": {
      "range": {
          "doc.created_at": {
            "gte": "Sun Nov 23 00:00:00 +0000 2014",
            "lt": "Wed Dec 24 00:00:00 +0000 2014"
          }
      }
  }
}
period = 2
num_init_communities = 25
num_exemplars = 10



In [2]:
# Configurable parameters
start_date = "2012-08-20"
end_date = "2014-11-22"
es_idx = 'tweets'
query_body = {
  "query": {
      "range": {
          "doc.created_at": {
            "lt": "Sun Nov 23 00:00:00 +0000 2014"
          }
      }
  }
}
period = 1
num_init_communities = 25
num_exemplars = 10

