In [1]:
import pandas as pd
import sqlite3 as sql 

from blm_activity_db import BlmActivityDb

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [5]:
blm_retweeting_query = \
"""
WITH BlmCommunities as (
    SELECT CommunityId
    FROM Community
    WHERE PeriodId = ? and Stance = 1
), CounterCommunities as (
    SELECT CommunityId
    FROM Community
    WHERE PeriodId = ? and Stance = -1
)
SELECT
    PeriodId,
    RetweetingAccountId,
    RetweetedAccountId,
    RetweetingCommunityId,
    RetweetedCommunityId,
    NumRetweets
FROM 
    InterCommunityRetweet cr
WHERE 
    PeriodId = ?
AND
    RetweetingCommunityId in BlmCommunities
AND
    RetweetedCommunityId in CounterCommunities"""
counter_retweeting_query = \
"""
WITH BlmCommunities as (
    SELECT CommunityId
    FROM Community
    WHERE PeriodId = ? and Stance = 1
), CounterCommunities as (
    SELECT CommunityId
    FROM Community
    WHERE PeriodId = ? and Stance = -1
)
SELECT
    PeriodId,
    RetweetingAccountId,
    RetweetedAccountId,
    RetweetingCommunityId,
    RetweetedCommunityId,
    NumRetweets
FROM 
    InterCommunityRetweet
WHERE 
    PeriodId = ?
AND
    RetweetingCommunityId in CounterCommunities
AND
    RetweetedCommunityId in BlmCommunities"""
cols = ["PeriodId", "RetweetingAccountId", "RetweetedAccountId", "RetweetingCommunityId", "RetweetedCommunityId", "NumTweets"]
def get_interstance_retweets_for_period(period: int):
    db = BlmActivityDb()
    with db.conn as conn:
        cur = conn.cursor()
        cur.execute(blm_retweeting_query, (period, period, period))
        blm_retweets = cur.fetchall()
        cur.execute(counter_retweeting_query, (period, period, period))
        counter_retweets= cur.fetchall()
    activist_retweets = pd.DataFrame(blm_retweets, columns = cols) 
    counter_retweets = pd.DataFrame(counter_retweets, columns = cols)
    return activist_retweets, counter_retweets


In [6]:
activist_retweeting_df, counter_retweeting_df = None, None
first_period, last_period = 1, 6
for i in range(first_period, last_period + 1):
    activist_retweets, counter_retweets = get_interstance_retweets_for_period(i)
    if activist_retweeting_df is None:
        activist_retweeting_df = activist_retweets
        counter_retweeting_df = counter_retweets
    else:
        activist_retweeting_df = pd.concat([activist_retweeting_df, activist_retweets], ignore_index = True)
        counter_retweeting_df = pd.concat([counter_retweeting_df, counter_retweets], ignore_index = True)

In [8]:
counter_retweeting_df

Unnamed: 0,PeriodId,RetweetingAccountId,RetweetedAccountId,RetweetingCommunityId,RetweetedCommunityId,NumTweets
0,1,1067344104,265523983,12,7,1
1,1,172926910,101875332,12,1,1
2,1,172926910,88279736,12,10,1
3,1,190932580,63144098,12,2,1
4,1,2227552357,51173311,12,4,1
...,...,...,...,...,...,...
5771,6,996222800024227845,3721046544,0,77,1
5772,6,996906889584762880,373157754,0,6,1
5773,6,997168457857855488,18479513,0,29,1
5774,6,997168457857855488,958428211985166336,0,92,1


In [54]:
query = \
"""
WITH UnknownCommunities as (
    SELECT CommunityId
    FROM Community
    WHERE PeriodId = ? and Stance = 0
)
SELECT
    "Retweeting" as Direction,
    NumRetweets
FROM 
    InterCommunityRetweet
WHERE 
    PeriodId = ?
AND
    RetweetingCommunityId in UnknownCommunities
AND
    RetweetedCommunityId not in UnknownCommunities
UNION
SELECT
    "Retweeted" as Direction,
    NumRetweets
FROM 
    InterCommunityRetweet
WHERE 
    PeriodId = ?
AND
    RetweetedCommunityId in UnknownCommunities
AND
    RetweetingCommunityId not in UnknownCommunities
"""
db = BlmActivityDb()
unknown_retweets = []
for i in range(1, 7):
    with db.conn as conn:
        cur = conn.cursor()
        cur.execute(query, (i, i, i))
        unknown_retweets.extend(cur.fetchall())
unknown_retweets = pd.DataFrame(unknown_retweets, columns = ["Direction", "NumRetweets"]) 


In [55]:
len(unknown_retweets)

9

In [33]:
activist_retweeting_summary = activist_retweeting_df[["PeriodId", "RetweetingAccountId", "NumTweets"]]\
    .groupby(["PeriodId", "RetweetingAccountId"], as_index=False)\
    .sum()\
    .sort_values(by=["RetweetingAccountId", "PeriodId"])


In [32]:
activist_retweeting_summary

Unnamed: 0,PeriodId,RetweetingAccountId,NumTweets
3886,6,1000145114,1
1663,5,1000924337082388481,5
3887,6,1000924337082388481,7
1664,5,1000963649924665344,1
3888,6,100136328,1
...,...,...,...
6268,6,998677512103264256,1
6269,6,998998809815408641,1
6270,6,99989453,1
792,2,999906110,1


In [35]:
len(activist_retweeting_df), activist_retweeting_df.NumTweets.sum() 
# 7461. Therefore, number of accounts retweeting multiple counter-protest accounts during a period ~= 1190 (7461 - 6271)

(7461, 7461)

In [41]:
activist_periods_df = activist_retweeting_summary[["PeriodId", "RetweetingAccountId"]]\
    .groupby("RetweetingAccountId", as_index=False)\
    .count()
activist_periods_df

Unnamed: 0,RetweetingAccountId,PeriodId
0,1000145114,1
1,1000924337082388481,2
2,1000963649924665344,1
3,100136328,1
4,1001413907318296576,1
...,...,...
6147,998677512103264256,1
6148,998998809815408641,1
6149,99989453,1
6150,999906110,1


In [44]:
activist_periods_df.groupby(["PeriodId"], as_index=False).count()

Unnamed: 0,PeriodId,RetweetingAccountId
0,1,6034
1,2,117
2,3,1


In [45]:
counter_retweeting_summary = counter_retweeting_df[["PeriodId", "RetweetingAccountId", "NumTweets"]]\
    .groupby(["PeriodId", "RetweetingAccountId"], as_index=False)\
    .sum()\
    .sort_values(by=["RetweetingAccountId", "PeriodId"])


In [48]:
counter_retweeting_df.NumTweets.unique()

array([1], dtype=object)

In [50]:
len(counter_retweeting_df), len(counter_retweeting_summary)
# 5776 - 4446 ~= 1330 that retweeted two different accounts from activist community

(5776, 4446)

In [52]:
counter_periods_df = counter_retweeting_summary[["PeriodId", "RetweetingAccountId"]]\
    .groupby("RetweetingAccountId", as_index=False)\
    .count()
counter_periods_df.groupby(["PeriodId"], as_index=False).count()

Unnamed: 0,PeriodId,RetweetingAccountId
0,1,4080
1,2,177
2,3,4


In [75]:
# these queries would be bolloxed if any account retweeted cross-stance more than once in a period. But they never did.
blm_retweeting_query = \
"""
WITH BlmCommunities as (
    SELECT CommunityId
    FROM Community
    WHERE PeriodId = ? and Stance = 1
), CounterCommunities as (
    SELECT CommunityId
    FROM Community
    WHERE PeriodId = ? and Stance = -1
)
SELECT
    cr.PeriodId,
    RetweetingAccountId,
    RetweetedAccountId,
    RetweetingCommunityId,
    RetweetedCommunityId,
    NumTweets,
    a.NumRetweets,
    cr.NumRetweets as CrossStanceRetweets
FROM 
    InterCommunityRetweet cr
JOIN
    AccountActivity a
ON
    cr.RetweetingAccountId = a.AccountId
AND
    cr.PeriodId = a.PeriodId
WHERE 
    cr.PeriodId = ?
AND
    RetweetingCommunityId in BlmCommunities
AND
    RetweetedCommunityId in CounterCommunities"""
counter_retweeting_query = \
"""
WITH BlmCommunities as (
    SELECT CommunityId
    FROM Community
    WHERE PeriodId = ? and Stance = 1
), CounterCommunities as (
    SELECT CommunityId
    FROM Community
    WHERE PeriodId = ? and Stance = -1
)
SELECT
    cr.PeriodId,
    RetweetingAccountId,
    RetweetedAccountId,
    RetweetingCommunityId,
    RetweetedCommunityId,
    NumTweets,
    a.NumRetweets,
    cr.NumRetweets as CrossStanceRetweets
FROM 
    InterCommunityRetweet cr
JOIN
    AccountActivity a
ON
    cr.RetweetingAccountId = a.AccountId
AND
    cr.PeriodId = a.PeriodId
WHERE 
    cr.PeriodId = ?
AND
    RetweetingCommunityId in CounterCommunities
AND
    RetweetedCommunityId in BlmCommunities"""
cols = [
    "PeriodId", 
    "RetweetingAccountId", 
    "RetweetedAccountId", 
    "RetweetingCommunityId", 
    "RetweetedCommunityId", 
    "NumTweets", 
    "NumRetweets",
    "CrossStanceRetweets"
]
def get_interstance_retweets_for_period(period: int):
    db = BlmActivityDb()
    with db.conn as conn:
        cur = conn.cursor()
        cur.execute(blm_retweeting_query, (period, period, period))
        blm_retweets = cur.fetchall()
        cur.execute(counter_retweeting_query, (period, period, period))
        counter_retweets= cur.fetchall()
    a_retweets = pd.DataFrame(blm_retweets, columns = cols) 
    c_retweets = pd.DataFrame(counter_retweets, columns = cols)
    return a_retweets, c_retweets


In [76]:
a_retweets = None
c_retweets = None
for i in range(1, 7):
    a, c = get_interstance_retweets_for_period(i)
    if a_retweets is None:
        a_retweets = a
        c_retweets = c
    else:
        a_retweets = pd.concat([a_retweets, a], ignore_index = True)
        c_retweets = pd.concat([c_retweets, c], ignore_index = True)


In [77]:
len(a_retweets), len(c_retweets)

(7461, 5776)

In [91]:
a_retweets = a_retweets[['PeriodId', 'NumRetweets', 'CrossStanceRetweets']].groupby("PeriodId", as_index=False).sum()
a_retweets

Unnamed: 0,PeriodId,NumRetweets,CrossStanceRetweets
0,1,507,24
1,2,49315,844
2,3,194715,1124
3,5,10950,2286
4,6,75415,3183


In [92]:
c_retweets = c_retweets[['PeriodId', 'NumRetweets', 'CrossStanceRetweets']].groupby("PeriodId", as_index=False).sum()
c_retweets

Unnamed: 0,PeriodId,NumRetweets,CrossStanceRetweets
0,1,465,46
1,2,6173,861
2,3,11828,983
3,5,55548,1292
4,6,22901,2594


In [93]:
import numpy as np

# Add Total Retweets by Stance
a_retweets["TotalRetweets"] = np.array([36128, 421881, 341173, 1654295, 450234])
c_retweets["TotalRetweets"] = np.array([845, 15000, 27087, 88421, 326943])
a_retweets["PctCrossStanceIndividually"] = a_retweets.CrossStanceRetweets / a_retweets.NumRetweets
c_retweets["PctCrossStanceIndividually"] = c_retweets.CrossStanceRetweets / c_retweets.NumRetweets
a_retweets["PctCrossStanceGlobally"] = a_retweets.CrossStanceRetweets / a_retweets.TotalRetweets
c_retweets["PctCrossStanceGlobally"] = c_retweets.CrossStanceRetweets / c_retweets.TotalRetweets
a_retweets

Unnamed: 0,PeriodId,NumRetweets,CrossStanceRetweets,TotalRetweets,PctCrossStanceIndividually,PctCrossStanceGlobally
0,1,507,24,36128,0.047337,0.000664
1,2,49315,844,421881,0.017114,0.002001
2,3,194715,1124,341173,0.005773,0.003295
3,5,10950,2286,1654295,0.208767,0.001382
4,6,75415,3183,450234,0.042206,0.00707


In [94]:
c_retweets

Unnamed: 0,PeriodId,NumRetweets,CrossStanceRetweets,TotalRetweets,PctCrossStanceIndividually,PctCrossStanceGlobally
0,1,465,46,845,0.098925,0.054438
1,2,6173,861,15000,0.139478,0.0574
2,3,11828,983,27087,0.083108,0.03629
3,5,55548,1292,88421,0.023259,0.014612
4,6,22901,2594,326943,0.11327,0.007934
