In [1]:
%load_ext autoreload
%autoreload 2
import sys
from mongo_helper_functions import *
from encoding_cascade_functions import *
import twitter_cascade_reconstruction as pnnl
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
%matplotlib inline

# Twitter cascades are different from Reddit

On Reddit, we have 2 action types, separated in 2 distinct tables:
* post (submission)
* comment
The entire cascade is known and connected, from the comments leaves up to the root post.

On Twitter, we have 4 action types all mixed in a single table:
* tweet (root, equivalent to a post on Reddit): no root_id or parent_id
* reply (a direct comment to other tweet, reply, or quote, not necessarelly the root): has parent_id, but no root_id
* retweet of a tweet (an edge on the cascade structure, always pointing to the root; need additional algorithm to infere parent): root_id
* quote of a tweet (similar to retweet): root_id, while parent_id will be inferred
* quote of a reply (similar to a general reply): has parent_id, but no root_id
* quote of a retweet: NA, since user only see a reply or a tweet (even if it's a retweet)
* retweet of a quote or reply (similar to a general reply): has parent_id, but no root_id

PNNL code to reconstruct cascade reads the following format:

`{"rootID": "?", "actionType": "reply", "parentID": "A", "nodeTime": "2017-08-15T00:00:03Z", "nodeUserID": "d", "nodeID": "D"}`

In [2]:
nodeID_dict = {'?':'?'}

In [3]:
%%time
embeded_quote_df = query(
    connectMongo('twitter_cve'),
    project(
        match(
#             limit(1000),
            quoted_status={'$ne':None}
        ),
        is_reply = '$quoted_status.in_reply_to_status_id_str_h',
        created_at = '$quoted_status.created_at',
        nodeUserID = '$quoted_status.user.id_str_h',
        nodeID = '$quoted_status.id_str_h',
        text = '$quoted_status.text_m',
    )
)
embeded_quote_df = embeded_quote_df.drop_duplicates()
embeded_quote_df['is_quote']=False
embeded_quote_df['is_retweet']=False
embeded_quote_df['created_at'] = pd.to_datetime(embeded_quote_df.created_at)
embeded_quote_df['nodeTime'] = embeded_quote_df.created_at.apply(
    lambda x: int(x.timestamp()*1000)
)
# quoted_dict = {'?':'?'}
embeded_quote_df = pd.concat(
    [
        embeded_quote_df[['nodeID','nodeTime','nodeUserID','text','created_at']], 
        embeded_quote_df.apply(
            checkActionType, 
            axis=1, 
            data=embeded_quote_df,
            id_dict=nodeID_dict #quoted_dict
        )
    ], 
    axis=1
)
embeded_quote_df.info()

INFO:root:Query duration: 0:00:00.686481


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3877 entries, 0 to 5391
Data columns (total 9 columns):
nodeID             3877 non-null object
nodeTime           3877 non-null int64
nodeUserID         3877 non-null object
text               3877 non-null object
created_at         3877 non-null datetime64[ns]
actionType         3877 non-null object
parentID           3877 non-null object
rootID             3877 non-null object
provisoryParent    0 non-null object
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 302.9+ KB
CPU times: user 4.17 s, sys: 17 ms, total: 4.19 s
Wall time: 4.54 s


In [4]:
%%time
embeded_retweeted_quote_df = query(
    connectMongo('twitter_cve'),
    project(
        match(
#             limit(1000),
            **{'retweeted_status.quoted_status':{'$ne':None}}
        ),
        is_reply = '$retweeted_status.quoted_status.in_reply_to_status_id_str_h',
        created_at = '$retweeted_status.quoted_status.created_at',
        nodeUserID = '$retweeted_status.quoted_status.user.id_str_h',
        nodeID = '$retweeted_status.quoted_status.id_str_h',
        text = '$retweeted_status.quoted_status.text_m',
    )
)
embeded_retweeted_quote_df = embeded_retweeted_quote_df.drop_duplicates()
embeded_retweeted_quote_df['is_quote']=False
embeded_retweeted_quote_df['is_retweet']=False
embeded_retweeted_quote_df['created_at'] = pd.to_datetime(embeded_retweeted_quote_df.created_at)
embeded_retweeted_quote_df['nodeTime'] = embeded_retweeted_quote_df.created_at.apply(
    lambda x: int(x.timestamp()*1000)
)
# quoted_dict = {'?':'?'}
embeded_retweeted_quote_df = pd.concat(
    [
        embeded_retweeted_quote_df[['nodeID','nodeTime','nodeUserID','text','created_at']], 
        embeded_retweeted_quote_df.apply(
            checkActionType, 
            axis=1, 
            data=embeded_retweeted_quote_df,
            id_dict=nodeID_dict #quoted_dict
        )
    ], 
    axis=1
)
embeded_retweeted_quote_df.info()

INFO:root:Query duration: 0:00:00.147663


<class 'pandas.core.frame.DataFrame'>
Int64Index: 377 entries, 0 to 672
Data columns (total 9 columns):
nodeID             377 non-null object
nodeTime           377 non-null int64
nodeUserID         377 non-null object
text               377 non-null object
created_at         377 non-null datetime64[ns]
actionType         377 non-null object
parentID           377 non-null object
rootID             377 non-null object
provisoryParent    0 non-null object
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 29.5+ KB
CPU times: user 442 ms, sys: 3 ms, total: 445 ms
Wall time: 573 ms


In [5]:
%%time
embeded_retweet_df = query(
    connectMongo('twitter_cve'),
    project(
        match(
#             limit(1000),
            retweeted_status={'$ne':None}
        ),
        is_reply = '$retweeted_status.in_reply_to_status_id_str_h',
        is_quote = '$retweeted_status.quoted_status.id_str_h',
        is_quote_of_reply = '$retweeted_status.quoted_status.in_reply_to_status_id_str_h',
        is_quote_of_quote = '$retweeted_status.quoted_status.is_quote_status',
        created_at = '$retweeted_status.created_at',
        nodeUserID = '$retweeted_status.user.id_str_h',
        nodeID = '$retweeted_status.id_str_h',
        text = '$retweeted_status.text_m',
    )
)
embeded_retweet_df = embeded_retweet_df.drop_duplicates()
embeded_retweet_df['is_retweet']=False
embeded_retweet_df['created_at'] = pd.to_datetime(embeded_retweet_df.created_at)
embeded_retweet_df['nodeTime'] = embeded_retweet_df.created_at.apply(
    lambda x: int(x.timestamp()*1000)
)
embeded_retweet_df = embeded_retweet_df.replace(np.NaN,'')

# retweeted_dict = {'?':'?'}
embeded_retweet_df = pd.concat(
    [
        embeded_retweet_df[['nodeID','nodeTime','nodeUserID','text','created_at']], 
        embeded_retweet_df.apply(
            checkActionType, 
            axis=1, 
            data=embeded_retweet_df,
            id_dict=nodeID_dict #retweeted_dict
        )
    ], 
    axis=1
)
embeded_retweet_df.info()

INFO:root:Query duration: 0:00:01.432522


<class 'pandas.core.frame.DataFrame'>
Int64Index: 9644 entries, 0 to 19793
Data columns (total 9 columns):
nodeID             9644 non-null object
nodeTime           9644 non-null int64
nodeUserID         9644 non-null object
text               9644 non-null object
created_at         9644 non-null datetime64[ns]
actionType         9644 non-null object
parentID           9644 non-null object
rootID             9644 non-null object
provisoryParent    1 non-null object
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 753.4+ KB
CPU times: user 10.4 s, sys: 60 ms, total: 10.5 s
Wall time: 11 s


In [6]:
%%time
cascade_collection_df = query(
    connectMongo('twitter_cve'),
    project(
#         limit(10000),
        is_reply = '$in_reply_to_status_id_str_h',
        is_retweet = '$retweeted_status.id_str_h',
        is_retweet_of_reply = '$retweeted_status.in_reply_to_status_id_str_h',
        is_retweet_of_quote = '$retweeted_status.quoted_status.id_str_h',
        is_quote = '$quoted_status.id_str_h',
        is_quote_of_reply = '$quoted_status.in_reply_to_status_id_str_h',
        is_quote_of_quote = '$quoted_status.is_quote_status',
        nodeTime = '$timestamp_ms',
        created_at = 1,
        nodeUserID = '$user.id_str_h',
        nodeID = '$id_str_h',
        text = '$text_m',
    )
)
print('#unique users',cascade_collection_df.nodeUserID.nunique())
cascade_collection_df['created_at'] = pd.to_datetime(cascade_collection_df.created_at)
cascade_collection_df.nodeTime = pd.to_numeric(cascade_collection_df.nodeTime)
cascade_collection_df.info()
cascade_collection_df = cascade_collection_df.replace(np.NaN,'')

cascade_collection_df = pd.concat(
    [
        cascade_collection_df[['nodeID','nodeTime','nodeUserID','text','created_at']], 
        cascade_collection_df.apply(
            checkActionType, 
            axis=1, 
            data=cascade_collection_df,
            id_dict=nodeID_dict
        )
    ], 
    axis=1
)
cascade_collection_df.info()

INFO:root:Query duration: 0:00:03.427954


#unique users 5881
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74074 entries, 0 to 74073
Data columns (total 12 columns):
created_at             74074 non-null datetime64[ns]
is_quote               5400 non-null object
is_quote_of_quote      5400 non-null object
is_quote_of_reply      5400 non-null object
is_reply               74074 non-null object
is_retweet             19795 non-null object
is_retweet_of_quote    674 non-null object
is_retweet_of_reply    19795 non-null object
nodeID                 74074 non-null object
nodeTime               74074 non-null int64
nodeUserID             74074 non-null object
text                   74074 non-null object
dtypes: datetime64[ns](1), int64(1), object(10)
memory usage: 6.8+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74074 entries, 0 to 74073
Data columns (total 9 columns):
nodeID             74074 non-null object
nodeTime           74074 non-null int64
nodeUserID         74074 non-null object
text               74074 non-n

In [7]:
%%time
# join embeded objects in original tweet collection
cascade_collection_df = pd.concat(
    [
        cascade_collection_df, # 1st level tweets retrieved from json
        embeded_retweet_df,
        embeded_quote_df,
        embeded_retweeted_quote_df,
    ]
).drop_duplicates(subset='nodeID')
cascade_collection_df.info()
cascade_collection_df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74756 entries, 0 to 249
Data columns (total 9 columns):
nodeID             74756 non-null object
nodeTime           74756 non-null int64
nodeUserID         74756 non-null object
text               74756 non-null object
created_at         74756 non-null datetime64[ns]
actionType         74756 non-null object
parentID           74756 non-null object
rootID             74756 non-null object
provisoryParent    733 non-null object
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 5.7+ MB
CPU times: user 167 ms, sys: 6 ms, total: 173 ms
Wall time: 170 ms


In [8]:
# adding prefixes to ids
for id_col in ['nodeID','rootID','parentID','provisoryParent']:
    cascade_collection_df[id_col] = cascade_collection_df[id_col].apply(nodeID_dict.get)
cascade_collection_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74756 entries, 0 to 249
Data columns (total 9 columns):
nodeID             74756 non-null object
nodeTime           74756 non-null int64
nodeUserID         74756 non-null object
text               74756 non-null object
created_at         74756 non-null datetime64[ns]
actionType         74756 non-null object
parentID           74722 non-null object
rootID             74756 non-null object
provisoryParent    733 non-null object
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 5.7+ MB


In [9]:
cascade_collection_df.actionType.value_counts()

tweet                        50180
retweet                      19066
quote                         4731
retweet_of_quote               637
retweet_of_reply                55
reply                           45
retweet_of_quote_of_reply       37
quote_of_reply                   3
reply_of_quote                   1
quote_of_quote                   1
Name: actionType, dtype: int64

In [10]:
# missing parentID on database
cascade_collection_df[cascade_collection_df.parentID.isnull()].actionType.value_counts()

reply             33
reply_of_quote     1
Name: actionType, dtype: int64

In [11]:
# rootID profiles
cascade_collection_df.rootID.apply(lambda x: x[:3]).value_counts()

t3_    73977
?        779
Name: rootID, dtype: int64

# Use PNNL code to reconstruct cascades

In [12]:
#create followers dictionary with user IDs as keys and list of followers as values
followers = pd.read_json('data/cve_tng_follower_by_id_updated.json',lines=True)
followers['followers_id_h'] = followers.followers_id_h.apply(lambda x: set(x))
followers['num_followers'] = followers.followers_id_h.apply(lambda x: len(x))
followers = followers[followers.num_followers>0].groupby('user_id_h').max()
followers.info()
followers = followers.followers_id_h.to_dict()

<class 'pandas.core.frame.DataFrame'>
Index: 4194 entries, --ym14WNicu5losAWv2NPg to zzsDQYK7ZgF8z9Lb0KMGtQ
Data columns (total 2 columns):
followers_id_h    4194 non-null object
num_followers     4194 non-null int64
dtypes: int64(1), object(1)
memory usage: 98.3+ KB


In [13]:
# originaly, only replies, but other compound actions too
cascades_missing_root = cascade_collection_df[
    (cascade_collection_df.rootID=='?')
    & (cascade_collection_df.provisoryParent.isnull())
]
cascades_missing_root.actionType.unique()

array(['reply', 'reply_of_quote'], dtype=object)

In [14]:
#get the user who posted the provisory parent tweet for each compound action
provisory_users = cascade_collection_df[['nodeID','nodeUserID','nodeTime']]
provisory_users.columns = ['provisoryParent','provisoryUserID','provisoryTime']
cascade_collection_df = cascade_collection_df.merge(provisory_users,on='provisoryParent',how='left')

In [15]:
# intermediary parents for compound actions (retweets and quotes)
# limit data to events where the provisory parent is also contained in the data
cascades_provisory_parent = cascade_collection_df[
    (~cascade_collection_df.provisoryParent.isnull())
    & (cascade_collection_df.provisoryParent.isin(cascade_collection_df['nodeID']))
]
cascades_provisory_parent.actionType.unique()

array(['retweet_of_reply', 'retweet_of_quote',
       'retweet_of_quote_of_reply', 'quote_of_quote', 'quote_of_reply'],
      dtype=object)

In [16]:
#get parent IDs for provisory parents
pia = pnnl.ParentIDApproximation(
    followers, 
    cascades_provisory_parent,
    rootID_col_name= 'provisoryParent', 
    root_userID_col_name= 'provisoryUserID', 
    root_nodeTime_col_name= 'provisoryTime'
)
parent_ids = pia.get_approximate_parentids()

In [17]:
# if the approximation algorithm couldn't find a better parent, use the provisory
parent_ids = parent_ids.merge(
    cascades_provisory_parent[
        ['nodeID','provisoryParent']
    ], 
    on='nodeID',
    how='outer'
).replace(np.nan,'').apply(
    lambda x:
        pd.Series({
            'nodeID': x.nodeID,
            'parentID': x.parentID if x.parentID else x.provisoryParent,
        }),
        axis=1
)

In [18]:
cascades_provisory_parent['parentID'] = cascades_provisory_parent['nodeID'].map(dict(zip(parent_ids.nodeID,parent_ids.parentID)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [19]:
#limit data to events where the rootID is also contained in the data
print('#cascades before prune: ',len(cascade_collection_df))
cascade_collection_df = cascade_collection_df[cascade_collection_df['rootID'].isin(cascade_collection_df['nodeID'])]
print('#cascades after prune: ',len(cascade_collection_df))
cascade_collection_df.actionType.unique()

#cascades before prune:  74756
#cascades after prune:  73977


array(['tweet', 'retweet', 'quote'], dtype=object)

In [20]:
#get the user who posted the root tweet for each retweet
root_users = cascade_collection_df[['nodeID','nodeUserID','nodeTime']]
root_users.columns = ['rootID','rootUserID','rootTime']
cascade_collection_df = cascade_collection_df.merge(root_users,on='rootID',how='left')

#store original tweets for later
original_tweets = cascade_collection_df[cascade_collection_df['actionType'] == 'tweet']

In [21]:
#subset on only retweets and quotes
cascade_collection_df = cascade_collection_df[cascade_collection_df['actionType'].isin(['retweet','quote'])]
cascade_collection_df_retweets = cascade_collection_df[['nodeID','nodeUserID','nodeTime','rootID','rootUserID','rootTime']]

In [22]:
#get parent IDs for retweets and quotes
pia = pnnl.ParentIDApproximation(followers, cascade_collection_df_retweets)
parent_ids = pia.get_approximate_parentids()

In [23]:
cascade_collection_df['parentID'] = cascade_collection_df['nodeID'].map(dict(zip(parent_ids.nodeID,parent_ids.parentID)))

In [24]:
print(cascade_collection_df['parentID'].isna().sum())
cascade_collection_df.loc[
    cascade_collection_df['parentID'].isna(),
    'parentID'
] = cascade_collection_df.loc[cascade_collection_df['parentID'].isna(),'rootID']
print(cascade_collection_df['parentID'].isna().sum())

16252
0


In [25]:
#rejoin with replies and original tweets
cascade_collection_df = pd.concat(
    [
        cascade_collection_df,
        cascades_provisory_parent,
        cascades_missing_root,
        original_tweets
    ],
    sort=False,
    axis=0
).sort_values('nodeTime')
cascade_collection_df = cascade_collection_df.drop([
    'rootUserID',
    'rootTime',
    'provisoryParent',
    'provisoryUserID',
    'provisoryTime'
],axis=1)

In [26]:
#follow cascade chain to get root node for reply tweets
cascade_collection_df = pnnl.get_reply_cascade_root_tweet(cascade_collection_df)

In [27]:
cascade_collection_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74756 entries, 73430 to 31330
Data columns (total 8 columns):
nodeID        74756 non-null object
nodeTime      74756 non-null int64
nodeUserID    74756 non-null object
text          74756 non-null object
created_at    74756 non-null datetime64[ns]
actionType    74756 non-null object
parentID      74722 non-null object
rootID        74639 non-null object
dtypes: datetime64[ns](1), int64(1), object(6)
memory usage: 5.1+ MB


In [28]:
cascade_collection_df.actionType.value_counts()

tweet                        50180
retweet                      19066
quote                         4731
retweet_of_quote               637
retweet_of_reply                55
reply                           45
retweet_of_quote_of_reply       37
quote_of_reply                   3
reply_of_quote                   1
quote_of_quote                   1
Name: actionType, dtype: int64

In [29]:
cascade_collection_df.dropna().actionType.value_counts()

tweet               50180
retweet             19066
quote                4731
retweet_of_quote      637
reply                  12
retweet_of_reply       12
quote_of_quote          1
Name: actionType, dtype: int64

In [30]:
cascade_collection_df.groupby('rootID').size().value_counts()

1      38102
2       7329
3       2768
4        979
5        414
6        189
7        112
8         68
9         42
10        26
12        12
15        12
13        12
11        11
14         9
20         9
22         7
18         6
27         5
16         5
17         4
23         4
19         4
51         3
44         3
26         3
28         2
34         2
25         2
56         2
29         2
35         2
21         2
36         2
39         2
37         1
67         1
70         1
166        1
65         1
71         1
30         1
167        1
212        1
60         1
187        1
59         1
57         1
55         1
54         1
49         1
40         1
48         1
79         1
78         1
41         1
104        1
72         1
32         1
dtype: int64

In [31]:
tmp = cascade_collection_df.groupby('rootID').agg({
    'parentID': {'parent_unique':'nunique','parent_count':'count'},
    'actionType': {'actionType_unique':'unique','actionType_nunique':'nunique'},
#     'nodeTime': 'count'
})
tmp.columns = tmp.columns.droplevel()
tmp.sort_values(['actionType_nunique','parent_unique'],ascending=False).head()

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


Unnamed: 0_level_0,parent_unique,parent_count,actionType_unique,actionType_nunique
rootID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
t3_omU5q33R1zMMr6nsR27m0A,2,6,"[tweet, quote, retweet, reply, retweet_of_reply]",5
t3_lOeBONFGyIHdU45SEnm37g,51,166,"[tweet, retweet, quote, retweet_of_quote]",4
t3_5XUi3MUBbp35TAOoK2mRzg,16,70,"[tweet, retweet, quote, retweet_of_quote]",4
t3_Yefp6lGP9m76lqxf-yJcdg,16,60,"[tweet, retweet, quote, retweet_of_quote]",4
t3_LKiYvNCItB6OTjqeS9T2WA,7,18,"[tweet, retweet, quote, retweet_of_quote]",4
t3_XT0VywYhMXDD1MzvyrIMDg,6,9,"[tweet, retweet, quote, retweet_of_quote]",4
t3_2W0O8QP6R8Q5zzS9BDHsyQ,4,16,"[tweet, retweet, reply, retweet_of_reply]",4
t3_0nThPbEqugQnxgBgNAUGqA,3,15,"[tweet, retweet, quote, retweet_of_quote]",4
t3_AKVkIfWNHWqpo-hZSLc7Bg,3,6,"[tweet, retweet, quote, retweet_of_quote]",4
t3_E2T0fENdQGJRptgWW8kKrQ,3,16,"[tweet, retweet, quote, retweet_of_quote]",4


In [33]:
# missing parentID on database
print(
    'missing parentID on database\n',
    cascade_collection_df[cascade_collection_df.parentID.isnull()].actionType.value_counts()
)
# rootID profiles
print(
    'missing rootID on database\n',
    cascade_collection_df[cascade_collection_df.rootID.isnull()].actionType.value_counts()
)

missing parentID on database
 reply             33
reply_of_quote     1
Name: actionType, dtype: int64
missing rootID on database
 retweet_of_reply             43
retweet_of_quote_of_reply    37
reply                        33
quote_of_reply                3
reply_of_quote                1
Name: actionType, dtype: int64


# Creating Cascade Summary

In [34]:
cascade_collection_df.info()
cascade_collection_df.head(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74756 entries, 73430 to 31330
Data columns (total 8 columns):
nodeID        74756 non-null object
nodeTime      74756 non-null int64
nodeUserID    74756 non-null object
text          74756 non-null object
created_at    74756 non-null datetime64[ns]
actionType    74756 non-null object
parentID      74722 non-null object
rootID        74639 non-null object
dtypes: datetime64[ns](1), int64(1), object(6)
memory usage: 7.6+ MB


Unnamed: 0,nodeID,nodeTime,nodeUserID,text,created_at,actionType,parentID,rootID
73430,t3_WizZXDbXaswt1t7rZQhg0g,1342618567000,xpeDfmot9WC72azxyT4Qow,We have full code execution against win7/IE8 w...,2012-07-18 13:36:07,tweet,t3_WizZXDbXaswt1t7rZQhg0g,t3_WizZXDbXaswt1t7rZQhg0g
73503,t3_PDqJ_QcF53HvKLrUVGmtMg,1378813871000,qpx6JAKu6ybBHvZknyQVWw,Neutrino exploit kit now also serves @un: _G5F...,2013-09-10 11:51:11,tweet,t3_PDqJ_QcF53HvKLrUVGmtMg,t3_PDqJ_QcF53HvKLrUVGmtMg
73442,t3_YmhVXpn31xwOz9feSVie9w,1414102755000,0F5i4lFEd4ffX1fgE4CsjQ,CVE-2014-6352 OLE Remote Code Execution Vulner...,2014-10-23 22:19:15,tweet,t3_YmhVXpn31xwOz9feSVie9w,t3_YmhVXpn31xwOz9feSVie9w


In [35]:
cascade_collection_df.rootID.nunique() #number of cascades

50180

In [36]:
%time cascade_summary = cascade_collection_df.groupby(\
    ['rootID']\
).apply(generateCascadeSummary)
cascade_summary = cascade_summary.reset_index(drop=True).sort_values('root_time')
cascade_summary['root_id'] = cascade_summary.root_id.apply(lambda x: x[3:])
cascade_summary.info()
cascade_summary.head()

CPU times: user 1min 30s, sys: 217 ms, total: 1min 30s
Wall time: 1min 30s
<class 'pandas.core.frame.DataFrame'>
Int64Index: 50180 entries, 26235 to 28222
Data columns (total 13 columns):
root_id             50180 non-null object
root_time           50180 non-null int64
root_user           50180 non-null object
num_comments        50180 non-null int64
num_users           50180 non-null int64
thread_node_id      50180 non-null object
thread_parent       50180 non-null object
thread_user         50180 non-null object
thread_time         50180 non-null object
thread_time_diff    50180 non-null object
title               50180 non-null object
all_text            50180 non-null object
thread_action       50180 non-null object
dtypes: int64(3), object(10)
memory usage: 5.4+ MB


Unnamed: 0,root_id,root_time,root_user,num_comments,num_users,thread_node_id,thread_parent,thread_user,thread_time,thread_time_diff,title,all_text,thread_action
26235,WizZXDbXaswt1t7rZQhg0g,1342618567000,xpeDfmot9WC72azxyT4Qow,1,1,[t1_Q5EY25Hj8dWPd4prWDCvaw],[t3_WizZXDbXaswt1t7rZQhg0g],[q5gVSIW-a4E4rCcQ9lUbMw],[1494799356391],[152180789391],We have full code execution against win7/IE8 w...,We have full code execution against win7/IE8 w...,[retweet]
20405,PDqJ_QcF53HvKLrUVGmtMg,1378813871000,qpx6JAKu6ybBHvZknyQVWw,1,1,[t1_bo4OS8GaoEJMUSbPOUUSCQ],[t3_PDqJ_QcF53HvKLrUVGmtMg],[CJw2OGi_QSNPldndF4iGiA],[1476056502763],[97242631763],Neutrino exploit kit now also serves @un: _G5F...,Neutrino exploit kit now also serves @un: _G5F...,[retweet]
27871,YmhVXpn31xwOz9feSVie9w,1414102755000,0F5i4lFEd4ffX1fgE4CsjQ,1,1,[t1_nHaIFd-kWHwtmZSocO2oMQ],[t3_YmhVXpn31xwOz9feSVie9w],[Ohdx-KNa7tL5Jp9SIHlioQ],[1500593057867],[86490302867],CVE-2014-6352 OLE Remote Code Execution Vulner...,CVE-2014-6352 OLE Remote Code Execution Vulner...,[retweet]
13159,FygzkwPiSdbyVmfzmoPCfw,1418719690000,zOfdbmzmQ1fYGRWTuAFPZg,1,1,[t1_qyUCTiXdaM0xPLhXJog3Pw],[t3_FygzkwPiSdbyVmfzmoPCfw],[v28KY0hVCH9s7yc3Jy7L5w],[1469320546507],[50600856507],#cybersecurity Vuln: Ruby on Rails CVE-2013-02...,#cybersecurity Vuln: Ruby on Rails CVE-2013-02...,[retweet]
30407,asQO5pX8o2O4OYRa7wXKfA,1419080520000,ZXXfm6kCwwAorO7HQjE4MA,1,1,[t1_5IIZTa33Nz9nIfLWV3ii2g],[t3_asQO5pX8o2O4OYRa7wXKfA],[cmByCa0pJmYlkcuuxR4q7A],[1479743272938],[60662752938],CVE-2014-9390 - url: https://t.co/KWGO4M_35Rk...,CVE-2014-9390 - url: https://t.co/KWGO4M_35Rk...,[retweet]


In [37]:
%%time
new_collection = 'twitter_cve_cascade'
# inserting encoded cascades in MongoDB
res = connectMongo(
    new_collection
).insert_many(
    json.loads(
        cascade_summary.T.to_json()
    ).values()
)
connectMongo(new_collection).create_index('root_id', unique=True)

CPU times: user 2.73 s, sys: 102 ms, total: 2.84 s
Wall time: 3.98 s


## Adding network properties

In [38]:
df = cascade_summary[cascade_summary.num_comments>0].apply(
    get_cascade_edge_df,
    axis=1
)

In [39]:
df.head()

Unnamed: 0,root_id,nodes,parents,edges
26235,WizZXDbXaswt1t7rZQhg0g,[t1_Q5EY25Hj8dWPd4prWDCvaw],[t3_WizZXDbXaswt1t7rZQhg0g],"[(t1_Q5EY25Hj8dWPd4prWDCvaw, t3_WizZXDbXaswt1t..."
20405,PDqJ_QcF53HvKLrUVGmtMg,[t1_bo4OS8GaoEJMUSbPOUUSCQ],[t3_PDqJ_QcF53HvKLrUVGmtMg],"[(t1_bo4OS8GaoEJMUSbPOUUSCQ, t3_PDqJ_QcF53HvKL..."
27871,YmhVXpn31xwOz9feSVie9w,[t1_nHaIFd-kWHwtmZSocO2oMQ],[t3_YmhVXpn31xwOz9feSVie9w],"[(t1_nHaIFd-kWHwtmZSocO2oMQ, t3_YmhVXpn31xwOz9..."
13159,FygzkwPiSdbyVmfzmoPCfw,[t1_qyUCTiXdaM0xPLhXJog3Pw],[t3_FygzkwPiSdbyVmfzmoPCfw],"[(t1_qyUCTiXdaM0xPLhXJog3Pw, t3_FygzkwPiSdbyVm..."
30407,asQO5pX8o2O4OYRa7wXKfA,[t1_5IIZTa33Nz9nIfLWV3ii2g],[t3_asQO5pX8o2O4OYRa7wXKfA],"[(t1_5IIZTa33Nz9nIfLWV3ii2g, t3_asQO5pX8o2O4OY..."


In [40]:
%%time
df = pd.concat([df, df.apply(extractFeatures,axis=1)], axis=1)

CPU times: user 8.28 s, sys: 18 ms, total: 8.3 s
Wall time: 8.28 s


In [41]:
tmp = df.apply(convertNodeIds, axis=1)
del df['edges']
%time df = pd.concat([df, tmp], axis=1)
del tmp
df.head()

  np.subtract(source,1)


CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 4.03 ms


Unnamed: 0,root_id,nodes,parents,depth,depth_max,breadth,breadth_max,edges,edges_dist,no_time_problem
26235,WizZXDbXaswt1t7rZQhg0g,[t1_Q5EY25Hj8dWPd4prWDCvaw],[t3_WizZXDbXaswt1t7rZQhg0g],[1],1,[1],1,"[[1, 0]]",[nan],True
20405,PDqJ_QcF53HvKLrUVGmtMg,[t1_bo4OS8GaoEJMUSbPOUUSCQ],[t3_PDqJ_QcF53HvKLrUVGmtMg],[1],1,[1],1,"[[1, 0]]",[nan],True
27871,YmhVXpn31xwOz9feSVie9w,[t1_nHaIFd-kWHwtmZSocO2oMQ],[t3_YmhVXpn31xwOz9feSVie9w],[1],1,[1],1,"[[1, 0]]",[nan],True
13159,FygzkwPiSdbyVmfzmoPCfw,[t1_qyUCTiXdaM0xPLhXJog3Pw],[t3_FygzkwPiSdbyVmfzmoPCfw],[1],1,[1],1,"[[1, 0]]",[nan],True
30407,asQO5pX8o2O4OYRa7wXKfA,[t1_5IIZTa33Nz9nIfLWV3ii2g],[t3_asQO5pX8o2O4OYRa7wXKfA],[1],1,[1],1,"[[1, 0]]",[nan],True


In [42]:
res=updateCollectionFromDataFrame(
    collection=connectMongo(new_collection), 
    df=df, 
    bulk_func=prepareBulkUpdate, 
    find_field='root_id', 
    update_fields=['depth','depth_max','breadth','breadth_max','edges','edges_dist','no_time_problem'], 
    upsert=False
)
res.bulk_api_result

INFO:root:Update duration: 0:00:03.958879


{'writeErrors': [],
 'writeConcernErrors': [],
 'nInserted': 0,
 'nUpserted': 0,
 'nMatched': 12078,
 'nModified': 12078,
 'nRemoved': 0,
 'upserted': []}

## Adding user (submitter) features

In [43]:
df = query(
    connectMongo(new_collection), 
    project(
#         limit(500),
        root_id=1,
        root_user=1,
        first_level_comments={'$arrayElemAt': ['$breadth', 0]},
        breadth_max=1,
        depth_max=1,
        edges_dist=1,
        subreddit=1,
        size='$num_comments',
        lifetime={'$arrayElemAt': ['$thread_time_diff', -1]}
    )
)
df = df[
    (df.root_user != '[Deleted]') & 
    (~df.root_user.isna())
]
df = pd.concat(
    [
        df,
        df.edges_dist.apply(calculate_recent_root_distances).fillna(0)
    ],
    axis=1
)
del df['edges_dist']
df.info()
df.head()

INFO:root:Query duration: 0:00:02.460087


<class 'pandas.core.frame.DataFrame'>
Int64Index: 50180 entries, 0 to 50179
Data columns (total 9 columns):
breadth_max             12078 non-null float64
depth_max               12078 non-null float64
first_level_comments    12078 non-null float64
lifetime                12078 non-null float64
root_id                 50180 non-null object
root_user               50180 non-null object
size                    50180 non-null int64
recent_edge             50180 non-null float64
root_edge               50180 non-null float64
dtypes: float64(6), int64(1), object(2)
memory usage: 3.8+ MB


Unnamed: 0,breadth_max,depth_max,first_level_comments,lifetime,root_id,root_user,size,recent_edge,root_edge
0,1.0,1.0,1.0,152180800000.0,WizZXDbXaswt1t7rZQhg0g,xpeDfmot9WC72azxyT4Qow,1,0.0,0.0
1,1.0,1.0,1.0,97242630000.0,PDqJ_QcF53HvKLrUVGmtMg,qpx6JAKu6ybBHvZknyQVWw,1,0.0,0.0
2,1.0,1.0,1.0,86490300000.0,YmhVXpn31xwOz9feSVie9w,0F5i4lFEd4ffX1fgE4CsjQ,1,0.0,0.0
3,1.0,1.0,1.0,50600860000.0,FygzkwPiSdbyVmfzmoPCfw,zOfdbmzmQ1fYGRWTuAFPZg,1,0.0,0.0
4,1.0,1.0,1.0,60662750000.0,asQO5pX8o2O4OYRa7wXKfA,ZXXfm6kCwwAorO7HQjE4MA,1,0.0,0.0


In [44]:
basic_func = ['mean','median']

user_stats=df.groupby('root_user').agg({
    'breadth_max': basic_func,
    'depth_max': basic_func,
    'first_level_comments': basic_func,
    'lifetime': basic_func,
    'size': basic_func + ['sum'],
    'recent_edge': basic_func,
    'root_edge': basic_func,
    'root_id': ['count']
})

user_stats.columns = [
    'uf_{}_{}'.format(
        user_stats.columns.levels[0][user_stats.columns.labels[0][i]], 
        user_stats.columns.levels[1][user_stats.columns.labels[1][i]]
    ) 
    for i in range(len(user_stats.columns.labels[0]))
]
user_stats.head()

Unnamed: 0_level_0,uf_breadth_max_mean,uf_breadth_max_median,uf_depth_max_mean,uf_depth_max_median,uf_first_level_comments_mean,uf_first_level_comments_median,uf_lifetime_mean,uf_lifetime_median,uf_size_mean,uf_size_median,uf_size_sum,uf_recent_edge_mean,uf_recent_edge_median,uf_root_edge_mean,uf_root_edge_median,uf_root_id_count
root_user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
-3ThtBx1z_-yTFXq75H9jA,,,,,,,,,0.0,0.0,0,0.0,0.0,0.0,0.0,1
-3w5gVQi0Oq0stmTh2M_NQ,1.0,1.0,1.0,1.0,1.0,1.0,36246715.0,36246715.0,1.0,1.0,1,0.0,0.0,0.0,0.0,1
-A7E9AvwmOT_hnI87XaAQg,,,,,,,,,0.0,0.0,0,0.0,0.0,0.0,0.0,1
-DA0vRp-G5j5GGahtU4mKQ,2.0,2.0,1.0,1.0,2.0,2.0,196207.0,196207.0,2.0,2.0,2,0.0,0.0,1.0,1.0,1
-F2tFjvpoM3U0cmzp5XhuQ,,,,,,,,,0.0,0.0,0,0.0,0.0,0.0,0.0,1


In [45]:
print(len(df))
df = pd.concat(
    [
        df[['root_user','root_id']],
        df.transform(
            lambda x: pd.Series(user_stats.loc[x.root_user,:]), 
            axis=1
        )
    ],
    axis=1
)
df.info()
df.head()

50180
<class 'pandas.core.frame.DataFrame'>
Int64Index: 50180 entries, 0 to 50179
Data columns (total 18 columns):
root_user                         50180 non-null object
root_id                           50180 non-null object
uf_breadth_max_mean               48603 non-null float64
uf_breadth_max_median             48603 non-null float64
uf_depth_max_mean                 48603 non-null float64
uf_depth_max_median               48603 non-null float64
uf_first_level_comments_mean      48603 non-null float64
uf_first_level_comments_median    48603 non-null float64
uf_lifetime_mean                  48603 non-null float64
uf_lifetime_median                48603 non-null float64
uf_size_mean                      50180 non-null float64
uf_size_median                    50180 non-null float64
uf_size_sum                       50180 non-null float64
uf_recent_edge_mean               50180 non-null float64
uf_recent_edge_median             50180 non-null float64
uf_root_edge_mean               

Unnamed: 0,root_user,root_id,uf_breadth_max_mean,uf_breadth_max_median,uf_depth_max_mean,uf_depth_max_median,uf_first_level_comments_mean,uf_first_level_comments_median,uf_lifetime_mean,uf_lifetime_median,uf_size_mean,uf_size_median,uf_size_sum,uf_recent_edge_mean,uf_recent_edge_median,uf_root_edge_mean,uf_root_edge_median,uf_root_id_count
0,xpeDfmot9WC72azxyT4Qow,WizZXDbXaswt1t7rZQhg0g,1.0,1.0,1.0,1.0,1.0,1.0,152180800000.0,152180800000.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
1,qpx6JAKu6ybBHvZknyQVWw,PDqJ_QcF53HvKLrUVGmtMg,1.0,1.0,1.0,1.0,1.0,1.0,97242630000.0,97242630000.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
2,0F5i4lFEd4ffX1fgE4CsjQ,YmhVXpn31xwOz9feSVie9w,1.0,1.0,1.0,1.0,1.0,1.0,86490300000.0,86490300000.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
3,zOfdbmzmQ1fYGRWTuAFPZg,FygzkwPiSdbyVmfzmoPCfw,1.648674,1.0,1.06177,1.0,1.639399,1.0,24794870.0,394295.0,1.009148,1.0,9266.0,0.019919,0.0,0.230438,0.0,9182.0
4,ZXXfm6kCwwAorO7HQjE4MA,asQO5pX8o2O4OYRa7wXKfA,1.0,1.0,1.0,1.0,1.0,1.0,60662750000.0,60662750000.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0


In [46]:
updateCollectionFromDataFrame(
    collection=connectMongo(new_collection), 
    df=df, 
    bulk_func=prepareBulkUpdate, 
    find_field='root_id', 
    update_fields= [
        c for c in df.columns if c not in 
        ['root_user','root_id']
    ], 
    upsert=True
).bulk_api_result

INFO:root:Update duration: 0:00:22.578080


{'writeErrors': [],
 'writeConcernErrors': [],
 'nInserted': 0,
 'nUpserted': 0,
 'nMatched': 50180,
 'nModified': 50180,
 'nRemoved': 0,
 'upserted': []}

## Fixing time problems with node referencing nodes in the future

In [47]:
df = query(
    connectMongo(new_collection),
    project(
        match(
            no_time_problem=False
        ),
        root_id=1,
        thread_node_id=1,
        thread_parent=1,
        thread_user=1,
        edges=1,
    )
)
df.info()
df.head()

INFO:root:Query duration: 0:00:00.061652


<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Empty DataFrame

In [48]:
if len(df):
    df.apply(reverseInconsistentNodes, axis=1)
    res=updateCollectionFromDataFrame(
        collection=connectMongo(new_collection), 
        df=df, 
        bulk_func=prepareBulkUpdate, 
        find_field='root_id', 
        update_fields=fields_to_change, 
        upsert=False
    ).bulk_api_result