# Keyword searches in tweet content

Using the original (filtered) tweet databases collected during livestreamed BTS concerts in 2021, this notebook shows the search criteria and results for mentions of themes like covid. 

This notebook documents searches on data that cannot be shared openly. Reduced, depersonalised datasets are published elsewhere.

In [1]:
import sys
import os
import time
import datetime as dt
import math
import numpy as np 
import scipy as sp
import pandas as pd

In [2]:
# Enable IPython's autoreload extension. Mode 1 re-imports only the modules
# registered with %aimport (here the local twt.py) before each cell runs,
# so edits to that file are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 1
%aimport twt

In [3]:
# Column -> dtype mapping for the tweet tables, in the form accepted by
# DataFrame.astype / read_csv(dtype=...):
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.astype.html
# 'Int64' is pandas' nullable integer dtype; text columns use the builtin str.
dtype_map = {
    # tweet
    'id': 'Int64',
    'created_at': str,
    'tweet': str,
    'source': str,
    'language': str,
    # posting account
    'user_id': 'Int64',
    'user_screen_name': str,
    'user_name': str,
    'user_description': str,
    'user_language': str,
    'user_location': str,
    'user_created_at': str,
    'user_followers_count': 'Int64',
    'user_friends_count': 'Int64',
    'user_statuses_count': 'Int64',
    'user_favorites_count': 'Int64',
    'user_verified': str,
    # reply target
    'in_reply_to_status_id': 'Int64',
    'in_reply_to_user_id': 'Int64',
    'in_reply_to_user_screen_name': str,
    # retweeted status
    'retweeted_status_id': 'Int64',
    'retweeted_status_user_id': 'Int64',
    'retweeted_status_user_screen_name': str,
    'retweeted_status_user_name': str,
    'retweeted_status_user_description': str,
    'retweeted_status_user_friends_count': 'Int64',
    'retweeted_status_user_statuses_count': 'Int64',
    'retweeted_status_user_followers_count': 'Int64',
    'retweeted_status_retweet_count': 'Int64',
    'retweeted_status_favorite_count': 'Int64',
    'retweeted_status_reply_count': 'Int64',
    # quoted status
    'quoted_status_id': 'Int64',
    'quoted_status_user_id': 'Int64',
    'quoted_status_user_screen_name': str,
    'quoted_status_user_name': str,
    'quoted_status_user_description': str,
    'quoted_status_user_friends_count': 'Int64',
    'quoted_status_user_statuses_count': 'Int64',
    'quoted_status_user_followers_count': 'Int64',
    'quoted_status_retweet_count': 'Int64',
    'quoted_status_favorite_count': 'Int64',
    'quoted_status_reply_count': 'Int64',
}

In [4]:
# Root folder of the raw (unshareable) tweet exports. Set the BTS_TWT_RAW_DIR
# environment variable to point somewhere else; the default is the original
# author's local path. os.path.join(..., '') guarantees a trailing separator
# because later cells build paths by plain string concatenation.
raw_dir = os.path.join(
    os.environ.get('BTS_TWT_RAW_DIR',
                   '/Users/finn/Desktop/Current_Projects/BTS_twitter/twt_Analysis/'),
    '')

# One record per sampled tweet stream. The *_loc fields are folders and the
# *_db fields are file names inside them; later cells concatenate the two.
concert_columns = ['tag','raw_loc','fullfeilds_loc','dep_loc',
                   'raw_twt_db','full_twt_db','fan_twt_db','dep_twt_db',
                   'event_file','event_offset','event_reduction','Long_name','sampling','Program']
concert_records = [
    {'tag': 'SWZ_D1','raw_loc':'data/','fullfeilds_loc': '../StreamData/', 'dep_loc':'./data/',
     'raw_twt_db':'Fan_tweets_H_Sowoozoo_D1.csv','full_twt_db':'All_Tweets_SWZ_D1.csv',
     'fan_twt_db':'fan_Tweets_SWZ_D1.csv','dep_twt_db':'fan_Tweets_SWZ_D1_reduced.csv',
     'event_file':'Setlists_sowoozoo_D1.csv',
     'event_offset':'6MIN','event_reduction':[1,2,3,6,8,9,10,12,13,15,16,19,20,21,22,23,25,26,27,28],
     'Long_name':'Sowoozoo Concert Day 1','sampling':'#SOWOOZOO','Program':'SWZ'},
    {'tag': 'SWZ_D2','raw_loc':'data/','fullfeilds_loc': '../StreamData/', 'dep_loc':'./data/',
     'raw_twt_db':'Fan_tweets_H_Sowoozoo_D2.csv','full_twt_db':'All_Tweets_SWZ_D2.csv',
     'fan_twt_db':'fan_Tweets_SWZ_D2.csv','dep_twt_db':'fan_Tweets_SWZ_D2_reduced.csv',
     'event_file':'Setlists_sowoozoo_D2.csv',
     'event_offset':'108S','event_reduction':[1,2,3,6,8,9,10,12,13,15,16,19,20,21,22,23,25,26,27,28],
     'Long_name':'Sowoozoo Concert Day 2','sampling':'#SOWOOZOO','Program':'SWZ'},
    {'tag': 'PTD_ON','raw_loc':'data/PTD/','fullfeilds_loc': '../StreamData/', 'dep_loc':'./data/',
     'raw_twt_db':'FullPTD_Fan_tweets_PTD_ON_STAGE.csv','full_twt_db':'All_Tweets_PTD_ON.csv',
     'fan_twt_db':'fan_Tweets_PTD_ON.csv','dep_twt_db':'fan_Tweets_PTD_ON_reduced.csv',
     'event_file':'Setlists_PTD_ON.csv',
     'event_offset':'25S','event_reduction':[1,2,3,6,7,8,9,11,12,14,15,17,18,21,22,28,29,32,33,34,36,37,38,39],
     'Long_name':'Permission to Dance on Stage','sampling':'Kpop Stream','Program':'PTD'},
    {'tag': 'PTD_LA4','raw_loc':'data/PTD/','fullfeilds_loc': '../StreamData/', 'dep_loc':'./data/',
     'raw_twt_db':'PTD_LA4_Fan_tweets_FULLSTREAM.csv','full_twt_db':'All_Tweets_PTD_LA4.csv',
     'fan_twt_db':'fan_Tweets_PTD_LA4.csv','dep_twt_db':'fan_Tweets_PTD_LA4_reduced.csv',
     'event_file':'Setlists_PTD_LA4.csv',
     'event_offset':'40S','event_reduction':[1,2,3,6,7,8,9,11,12,14,15,17,18,21,22,28,29,32,33,34,36,37,38,39],
     'Long_name':'Permission to Dance LA Day 4','sampling':'Kpop Stream','Program':'PTD'},
    # The two Alt streams have no setlist, so the event fields are left empty.
    {'tag': 'PTD_ON_Alt1','raw_loc':'data/PTD/','fullfeilds_loc': '../StreamData/', 'dep_loc':'./data/',
     'raw_twt_db':'Alt1PTD_Fan_tweets_PTD_ON_STAGE.csv','full_twt_db':'All_Tweets_PTD_ON_Alt1.csv',
     'fan_twt_db':'fan_Tweets_PTD_ON_Alt1.csv','dep_twt_db':'fan_Tweets_PTD_ON_Alt1_reduced.csv',
     'event_file':'','event_offset':'','event_reduction':[],
     'Long_name':'Week prior to PTD On Stage','sampling':'Kpop Stream','Program':''},
    {'tag': 'PTD_ON_Alt2','raw_loc':'data/PTD/','fullfeilds_loc': '../StreamData/', 'dep_loc':'./data/',
     'raw_twt_db':'Alt2PTD_Fan_tweets_PTD_ON_STAGE.csv','full_twt_db':'All_Tweets_PTD_ON_Alt2.csv',
     'fan_twt_db':'fan_Tweets_PTD_ON_Alt2.csv','dep_twt_db':'fan_Tweets_PTD_ON_Alt2_reduced.csv',
     'event_file':'','event_offset':'','event_reduction':[],
     'Long_name':'Week following PTD On Stage','sampling':'Kpop Stream','Program':''},
]

# Build the table in one step instead of enlarging an empty frame row by row
# with dict assignment; rows are indexed 0..5 in the order listed above.
Concerts = pd.DataFrame(concert_records, columns=concert_columns)
Concerts

Unnamed: 0,tag,raw_loc,fullfeilds_loc,dep_loc,raw_twt_db,full_twt_db,fan_twt_db,dep_twt_db,event_file,event_offset,event_reduction,Long_name,sampling,Program
0,SWZ_D1,data/,../StreamData/,./data/,Fan_tweets_H_Sowoozoo_D1.csv,All_Tweets_SWZ_D1.csv,fan_Tweets_SWZ_D1.csv,fan_Tweets_SWZ_D1_reduced.csv,Setlists_sowoozoo_D1.csv,6MIN,"[1, 2, 3, 6, 8, 9, 10, 12, 13, 15, 16, 19, 20,...",Sowoozoo Concert Day 1,#SOWOOZOO,SWZ
1,SWZ_D2,data/,../StreamData/,./data/,Fan_tweets_H_Sowoozoo_D2.csv,All_Tweets_SWZ_D2.csv,fan_Tweets_SWZ_D2.csv,fan_Tweets_SWZ_D2_reduced.csv,Setlists_sowoozoo_D2.csv,108S,"[1, 2, 3, 6, 8, 9, 10, 12, 13, 15, 16, 19, 20,...",Sowoozoo Concert Day 2,#SOWOOZOO,SWZ
2,PTD_ON,data/PTD/,../StreamData/,./data/,FullPTD_Fan_tweets_PTD_ON_STAGE.csv,All_Tweets_PTD_ON.csv,fan_Tweets_PTD_ON.csv,fan_Tweets_PTD_ON_reduced.csv,Setlists_PTD_ON.csv,25S,"[1, 2, 3, 6, 7, 8, 9, 11, 12, 14, 15, 17, 18, ...",Permission to Dance on Stage,Kpop Stream,PTD
3,PTD_LA4,data/PTD/,../StreamData/,./data/,PTD_LA4_Fan_tweets_FULLSTREAM.csv,All_Tweets_PTD_LA4.csv,fan_Tweets_PTD_LA4.csv,fan_Tweets_PTD_LA4_reduced.csv,Setlists_PTD_LA4.csv,40S,"[1, 2, 3, 6, 7, 8, 9, 11, 12, 14, 15, 17, 18, ...",Permission to Dance LA Day 4,Kpop Stream,PTD
4,PTD_ON_Alt1,data/PTD/,../StreamData/,./data/,Alt1PTD_Fan_tweets_PTD_ON_STAGE.csv,All_Tweets_PTD_ON_Alt1.csv,fan_Tweets_PTD_ON_Alt1.csv,fan_Tweets_PTD_ON_Alt1_reduced.csv,,,[],Week prior to PTD On Stage,Kpop Stream,
5,PTD_ON_Alt2,data/PTD/,../StreamData/,./data/,Alt2PTD_Fan_tweets_PTD_ON_STAGE.csv,All_Tweets_PTD_ON_Alt2.csv,fan_Tweets_PTD_ON_Alt2.csv,fan_Tweets_PTD_ON_Alt2_reduced.csv,,,[],Week following PTD On Stage,Kpop Stream,


# Objections to being cited/used for research

The data collected was only from accounts that were public at the time of sampling, and under Twitter's terms of service and the Twitter API license used by the university, it was legal to collect these status updates and statistics. However, there have been times when Twitter users have been very concerned about the way their data is collected and converted into products for media and research, both commercial and academic. 

Out of respect for these concerns, we have taken measures to protect the users whose data we have studied, taking many steps to mitigate the risks of harmful visibility via the published datasets and the reporting on this research. 

One way in which Twitter users have tried to discourage (legal but unethical) citation and data collection is to state their objection in their Twitter bios. This is a field that we did not use in our research, but it is collected by the Twitter API, so we search for such statements in the material we have. 

In [17]:
# Load the raw tweet table for the first dataset (SWZ_D1) and collect the
# distinct non-empty account bios.
i = 0
raw_csv = raw_dir + Concerts.loc[i, 'raw_loc'] + Concerts.loc[i, 'raw_twt_db']
df_alltwt = pd.read_csv(raw_csv, lineterminator='\n', low_memory=False)

has_bio = df_alltwt['user_description'].notna()
df_bios = df_alltwt[has_bio].copy()
twts = df_bios['user_description'].unique()  # array of unique bio strings

# [all rows, rows with a bio, distinct bios]
print([len(df_alltwt), len(df_bios), len(twts)])

twts[:5]

[225993, 189371, 90454]


array(['I purple you 💜💜',
       '99line/BTS/🐤ペン/推しカプ→→🐰🐥/🦄🐭/🐨🐹/🐯🐣/rps/20↑📛 交換垢→@miichim_koukan',
       'เราจะรวยเมื่อไหร่คะ #อิงอิงจะเเจก',
       'Association BTS France 🇫🇷 Fanbase française dédiée à @BTS_twt et au fandom ARMY 🐋💜 Back Up @ProjetBtsFrance | Site http://www.btsfrance.fr/',
       '❥\n𝐼𝑛𝑠𝑡𝑎𝑔𝑟𝑎𝑚 : 𝑏𝑏𝑏𝑏𝑏𝑏𝑜𝑤_𝑏\n💜💙❤🧡💚💛🖤'], dtype=object)

In [18]:
# Example search: case-insensitive, literal (non-regex) substring match over the bios.
# The phrase is Korean, roughly "no unauthorised use or appropriation".
kw = '무단사용 및 도용금지'
twts = pd.Series(twts)
is_match = twts.str.contains(kw, case=False, regex=False)
sub_twt = twts[is_match]
for bio in sub_twt:
    print(bio + '\n')
    

7명 모두가 소듕한 아미입니다💜                                                                    주로 #어썸_작업 | 좋은날에 #어썸_이벵 |
후기는 #어썸한후기 | 무단사용 및 도용금지🙅‍♀️ | 나눔물품 금전거래 금지🚫



The above bio message is an example of usage restrictions expressed by twitter users in addition to the legal access granted by the API licensing agreement. In this case, the user requests that people do not repost the art they share as images in tweets. The API does not collect embedded images, only hyperlinks back to image locations, so our research use of this user's tweets is not in conflict with their expressed restrictions. 

A similar search was used to identify the 12 users who had made explicit requests in their bios not to have any tweet content used without explicit permission, with text strings like: "🚫 please do not cite my tweets w/o my express consent", "💥This acct DOES NOT consent to being used for research purposes 💥", and "DON'T USE MY Tweets/screenshot them without MY express consent!"

The ids of users identified were retained in the (unpublished) Exclude_accounts.csv file and filtered from the full tweet datasets in Depersonalising_data.ipynb 

# Covid traces

As covid and related themes did not show up prominently in the content analysis of tweet subsets, a keyword search was conducted across the fan tweet datasets to assess if these were only accidentally excluded. 


In [19]:
# Load the fan tweet table for the first dataset (SWZ_D1) and keep the tweet text column.
i = 0
fan_csv = Concerts.loc[i, 'fullfeilds_loc'] + Concerts.loc[i, 'fan_twt_db']
df_alltwt = pd.read_csv(fan_csv)
print(df_alltwt.columns)
twts = df_alltwt['tweet']
twts.head()


Index(['Unnamed: 0', 'id', 'created_at', 'tweet', 'source', 'language',
       'user_id', 'user_screen_name', 'user_name', 'user_description',
       'user_language', 'user_location', 'user_created_at',
       'user_followers_count', 'user_friends_count', 'user_statuses_count',
       'user_favorites_count', 'user_verified', 'in_reply_to_status_id',
       'in_reply_to_user_id', 'in_reply_to_user_screen_name',
       'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_user_screen_name', 'retweeted_status_user_name',
       'retweeted_status_user_description',
       'retweeted_status_user_friends_count',
       'retweeted_status_user_statuses_count',
       'retweeted_status_user_followers_count',
       'retweeted_status_retweet_count', 'retweeted_status_favorite_count',
       'retweeted_status_reply_count', 'quoted_status_id',
       'quoted_status_user_id', 'quoted_status_user_screen_name',
       'quoted_status_user_name', 'quoted_status_user_description',


0    RT @MygNamm01: 🎉แจกคอน Muster​ BTS2021​\n✨เพื่...
1    RT @97mmari: ยิ้มกวนมากโอ้ย55555555555\n#SOWOO...
2    RT @sawok_2minn: จีมินมาในแชท 😭 #SOWOOZOO \n🐥ผ...
3    RT @mxnt_JK: แจกอีกค่ะ แจก 50 ฿ \n\n ⊹🐻. 𖧷 🧺 ꕀ...
4    Happy 8th Anniversary @BTS_twt! \n사랑해요 💜 #BTS8...
Name: tweet, dtype: object

In [20]:
# Keywords that would indicate tweets about pandemic conditions.
kws = ['remote', 'virtual', 'pandemic', 'covid', 'lockdown', 'restriction']
kw = kws[0]  # change the index to inspect another keyword
print(kw)
twts = pd.Series(twts)
# Case-insensitive literal substring match. na=False treats tweets with missing
# text as non-matches; without it the mask would contain NaN, which .loc rejects.
sub_twt = twts.loc[twts.str.contains(kw, case=False, regex=False, na=False)]
print(len(sub_twt))   # matching tweets, duplicates included
sub_twt = sub_twt.unique()
print(len(sub_twt))   # distinct matching texts

remote
1
1


In [21]:
# Print every matching tweet, each followed by a blank line.
for tweet_text in sub_twt:
    print(tweet_text + '\n')

Comedians may lose their career #SOWOOZOO #BTSFESTA2021 #BTSMusterSoWooZoo2021 #8YearsToInfinityWithBTS #BTS @BTS_twt QT @mandakkoo: btw this i canttjsj when they asked army to turn on the phone flashlight then open the window at home to wave it outside, 
JM: *opened the window* oh! are u army too?
RM: no im searching for my remote—
JIN: who search remote outside by opening the window?! https://t.co/UpcmGklnRf



# Hashtag use

In [17]:
# Load the fan tweet table for dataset 3 (PTD_LA4) and keep the tweet text column.
i = 3
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, matching how the raw table is read earlier.
# NOTE(review): the saved output of this cell looks like a mixed-dtype warning,
# which is what this option addresses; confirm on a re-run.
df_alltwt = pd.read_csv(Concerts.loc[i,'fullfeilds_loc'] + Concerts.loc[i,'fan_twt_db'],
                        low_memory=False)
print(len(df_alltwt))
print(df_alltwt.columns)
twts = df_alltwt['tweet']
twts[:5]

116323
Index(['Unnamed: 0', 'id', 'created_at', 'tweet', 'source', 'language',
       'user_id', 'user_screen_name', 'user_name', 'user_description',
       'user_language', 'user_location', 'user_created_at',
       'user_followers_count', 'user_friends_count', 'user_statuses_count',
       'user_favorites_count', 'user_verified', 'in_reply_to_status_id',
       'in_reply_to_user_id', 'in_reply_to_user_screen_name',
       'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_user_screen_name', 'retweeted_status_user_name',
       'retweeted_status_user_description',
       'retweeted_status_user_friends_count',
       'retweeted_status_user_statuses_count',
       'retweeted_status_user_followers_count',
       'retweeted_status_retweet_count', 'retweeted_status_favorite_count',
       'retweeted_status_reply_count', 'quoted_status_id',
       'quoted_status_user_id', 'quoted_status_user_screen_name',
       'quoted_status_user_name', 'quoted_status_user_descrip

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


0    RT @btsvotingorg: 🔔 ARMY NOTICE!\n\nPrepare fo...
1    RT @seganewsnavi: セガ ラッキーくじオンライン\n🍨 TinyTAN  S...
2    「BTS JAPAN OFFICIAL SHOP」カスタマーセンター電話番号変更のお知らせ\...
3    RT @WFVOTING: ‼️ MUTIRÃO GLOBAL MAMA\n\nB-ARMY...
4    RT @btsvotingorg: THIS IS INSANE! KEEP YOUR VO...
Name: tweet, dtype: object

In [22]:
# Hashtag variants to search for in the tweets loaded above.
# Alternative list, presumably the one used for the PTD On Stage stream:
#   ['#PTD_On_Stage','#PTDonStage','#PermissiontoDance','#PermissionToDanceOnStage','#PermissiontoDance_Onstage']
kws = ['#PTD','#PTD_ON_STAGE_LA','#Permission']
kw = kws[2]  # change the index to count another hashtag
print(kw)
twts = pd.Series(twts)
# Case-insensitive literal substring match. na=False treats tweets with missing
# text as non-matches; without it the mask would contain NaN, which .loc rejects.
sub_twt = twts.loc[twts.str.contains(kw, case=False, regex=False, na=False)]
# The matches are not de-duplicated here, so repeated texts are counted each
# time, and sub_twt stays a Series for positional access in the next cell.
print(len(sub_twt))

#Permission
177


In [19]:
# Show the 6th to 10th matches as a sample. Slicing prints fewer rows instead
# of raising IndexError when there are fewer than ten matches, and it no
# longer overwrites the dataset index `i` used by the loading cells.
for tweet_text in sub_twt.iloc[5:10]:
    print(tweet_text)

RT @josie_huang: I talked to folks at KTown's Ahgassi Gopchang &amp; other LA spots about the boost BTS fans are giving businesses at a challenging time. #PTD_ON_STAGE_LA https://t.co/Mh9ug6KDfe
RT @josie_huang: I talked to folks at KTown's Ahgassi Gopchang &amp; other LA spots about the boost BTS fans are giving businesses at a challenging time. #PTD_ON_STAGE_LA https://t.co/Mh9ug6KDfe
RT @jeoniies: drinks with @rubberdeokies ☺☺ 
#PTD_ON_STAGE_LA https://t.co/pzvYcF2OI2
RT @bulletproofkp: the night of the 27th I watched this army cry her eyes out on zoom and honestly same 😂😂 @BTS_twt #PTD_ON_STAGE_LA #bts https://t.co/mqtx47lLIZ
RT @josie_huang: I talked to folks at KTown's Ahgassi Gopchang &amp; other LA spots about the boost BTS fans are giving businesses at a challenging time. #PTD_ON_STAGE_LA https://t.co/Mh9ug6KDfe


In [None]:
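# Recorded results for the PTD_LA4 dataset: number of tweets matching each
# hashtag search (case-insensitive substring match, so '#PTD' also counts
# longer tags that start with it).
#
#   #PTD              8637
#   #PTD_ON_STAGE_LA  7917
#   #Permission        177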


In [None]:
# Recorded results for PTD on stage: number of tweets matching each hashtag search.
#
#   #permission                 6362
#   #PTD_On_Stage              46321
#   #PTDonStage                 2631
#   #PermissiontoDance          6357
#   #PermissionToDanceOnStage   5959
#   #PermissiontoDance_Onstage   142

# Language stats

In [33]:
# Load the fan tweet table for the first dataset (SWZ_D1) and keep the
# per-tweet language code column.
i = 0
fan_csv = Concerts.loc[i, 'fullfeilds_loc'] + Concerts.loc[i, 'fan_twt_db']
df_alltwt = pd.read_csv(fan_csv)
print(len(df_alltwt))
print(df_alltwt.columns)
twts = df_alltwt['language']
twts.head()

224733
Index(['Unnamed: 0', 'id', 'created_at', 'tweet', 'source', 'language',
       'user_id', 'user_screen_name', 'user_name', 'user_description',
       'user_language', 'user_location', 'user_created_at',
       'user_followers_count', 'user_friends_count', 'user_statuses_count',
       'user_favorites_count', 'user_verified', 'in_reply_to_status_id',
       'in_reply_to_user_id', 'in_reply_to_user_screen_name',
       'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_user_screen_name', 'retweeted_status_user_name',
       'retweeted_status_user_description',
       'retweeted_status_user_friends_count',
       'retweeted_status_user_statuses_count',
       'retweeted_status_user_followers_count',
       'retweeted_status_retweet_count', 'retweeted_status_favorite_count',
       'retweeted_status_reply_count', 'quoted_status_id',
       'quoted_status_user_id', 'quoted_status_user_screen_name',
       'quoted_status_user_name', 'quoted_status_user_descrip

0    th
1    ja
2    th
3    fr
4    th
Name: language, dtype: object

In [34]:
# Number of distinct language codes (a missing value counts as one),
# then the tweet count per code, most frequent first.
print(twts.nunique(dropna=False))
twts.value_counts()

43


en     99819
th     55457
ko     31434
ja     11769
es      6817
in      4930
und     4628
pt      4177
tl      1009
tr       665
fr       599
ru       538
vi       442
et       424
ar       401
it       292
de       229
pl       152
ht       107
fa       105
cy        92
nl        83
zh        80
hu        70
ro        59
lt        53
da        52
ca        32
fi        28
uk        28
eu        26
no        25
sv        24
hi        21
cs        16
is        14
sl        11
lv        11
el         4
bg         4
lo         3
ckb        2
sr         1
Name: language, dtype: int64

In [35]:
# Load the fan tweet table for dataset 2 (PTD_ON) and keep the per-tweet
# language code column.
i = 2
fan_csv = Concerts.loc[i, 'fullfeilds_loc'] + Concerts.loc[i, 'fan_twt_db']
df_alltwt = pd.read_csv(fan_csv)
print(len(df_alltwt))
print(df_alltwt.columns)
twts = df_alltwt['language']
twts.head()

228798
Index(['Unnamed: 0', 'Unnamed: 0.1', 'id', 'created_at', 'tweet', 'source',
       'language', 'user_id', 'user_screen_name', 'user_name',
       'user_description', 'user_language', 'user_location', 'user_created_at',
       'user_followers_count', 'user_friends_count', 'user_statuses_count',
       'user_favorites_count', 'user_verified', 'in_reply_to_status_id',
       'in_reply_to_user_id', 'in_reply_to_user_screen_name',
       'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_user_screen_name', 'retweeted_status_user_name',
       'retweeted_status_user_description',
       'retweeted_status_user_friends_count',
       'retweeted_status_user_statuses_count',
       'retweeted_status_user_followers_count',
       'retweeted_status_retweet_count', 'retweeted_status_favorite_count',
       'retweeted_status_reply_count', 'quoted_status_id',
       'quoted_status_user_id', 'quoted_status_user_screen_name',
       'quoted_status_user_name', 'quoted_sta

0    en
1    en
2    en
3    en
4    en
Name: language, dtype: object

In [36]:
# Number of distinct language codes (a missing value counts as one),
# then the tweet count per code, most frequent first.
print(twts.nunique(dropna=False))
twts.value_counts()

41


en       171654
ko        25498
und       14095
th         7775
in         1800
es         1672
pt         1668
tl         1323
nl          661
tr          522
ja          350
et          241
ar          235
ht          184
fr          178
vi          111
de           97
da           91
it           82
ru           62
pl           59
no           57
lt           44
hu           42
lv           41
zh           37
hi           30
sl           28
fi           26
sv           21
ro           21
cy           21
eu           20
ca           19
cs           16
is            8
f             4
uk            2
fa            1
44077         1
bg            1
Name: language, dtype: int64

In [37]:
# Share of PTD_ON fan tweets tagged 'en': the 'en' count divided by the total
# number of rows, both copied from the outputs above.
171654/228798

0.750242572050455

In [38]:
# Share of SWZ_D1 fan tweets tagged 'en': the 'en' count divided by the total
# number of rows, both copied from the outputs above.
99819/224733

0.4441670782662092

# Check Users overlaps

Extract user ids from each full-field database and check the ratio of overlap in unique users.

Maybe treat posting ids separately from retweeted ids.

In [60]:
# For every dataset, count tweets per posting account (posterids) and per
# retweeted account (rtedids), keyed by the dataset tag.
# Only the two screen-name columns are read, and they are read as strings, so
# handles are never type-inferred chunk by chunk and compare consistently
# across datasets. retweeted_status_user_id would be an alternative key.
user_cols = ['user_screen_name', 'retweeted_status_user_screen_name']
posterids = {}
rtedids = {}
for i in range(len(Concerts)):
    tag = Concerts.loc[i,'tag']
    df_users = pd.read_csv(Concerts.loc[i,'fullfeilds_loc'] + Concerts.loc[i,'fan_twt_db'],
                           usecols=user_cols, dtype=str)
    posterids[tag] = df_users['user_screen_name'].value_counts()
    # value_counts() skips missing values, so tweets that are not retweets drop out here.
    rtedids[tag] = df_users['retweeted_status_user_screen_name'].value_counts()


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [64]:
# Pairwise overlap of posting accounts between datasets (row = t1, column = t2).
#   A[t1, t2]: fraction of t1's distinct posting accounts that also posted in t2.
#   B[t1, t2]: fraction of t1's tweets that were posted by those shared accounts.
tags = list(posterids)
A = pd.DataFrame(index=tags, columns=tags, dtype=float)
B = A.copy()
for t1 in tags:
    s1 = posterids[t1]  # tweets per account in t1
    for t2 in tags:
        s2 = posterids[t2]
        # Boolean mask over t1's accounts that also occur in t2. The mask replaces
        # indexing the Series with a Python set, which pandas 2.0 no longer accepts.
        shared = s1.index.isin(s2.index)
        A.loc[t1,t2] = shared.sum()/len(s1)
        B.loc[t1,t2] = s1[shared].sum()/s1.sum()
A

Unnamed: 0,SWZ_D1,SWZ_D2,PTD_ON,PTD_LA4,PTD_ON_Alt1,PTD_ON_Alt2
SWZ_D1,1.0,0.187532,0.150691,0.063265,0.013398,0.032064
SWZ_D2,0.386687,1.0,0.143823,0.06559,0.015901,0.034406
PTD_ON,0.200845,0.092965,1.0,0.158206,0.03851,0.087929
PTD_LA4,0.168192,0.084567,0.315568,1.0,0.053706,0.124846
PTD_ON_Alt1,0.205518,0.118295,0.443221,0.309879,1.0,0.281176
PTD_ON_Alt2,0.159016,0.082751,0.327177,0.232893,0.090905,1.0


In [65]:
# Tweet-weighted overlap: fraction of the row dataset's tweets that were posted
# by accounts which also posted in the column dataset.
B

Unnamed: 0,SWZ_D1,SWZ_D2,PTD_ON,PTD_LA4,PTD_ON_Alt1,PTD_ON_Alt2
SWZ_D1,1.0,0.329689,0.175769,0.076624,0.016104,0.039936
SWZ_D2,0.537415,1.0,0.171883,0.082133,0.022773,0.044142
PTD_ON,0.295213,0.155638,1.0,0.312001,0.124505,0.215622
PTD_LA4,0.210869,0.119276,0.4778,1.0,0.150466,0.244053
PTD_ON_Alt1,0.237898,0.157408,0.556778,0.407075,1.0,0.39979
PTD_ON_Alt2,0.176102,0.100667,0.40006,0.291531,0.160186,1.0


In [66]:
# Same pairwise comparison of posting accounts as raw counts (row = t1, column = t2).
#   A[t1, t2]: number of distinct accounts that posted in both t1 and t2.
#   B[t1, t2]: number of t1's tweets that were posted by those shared accounts.
tags = list(posterids)
A = pd.DataFrame(0, index=tags, columns=tags)
B = A.copy()
for t1 in tags:
    s1 = posterids[t1]  # tweets per account in t1
    for t2 in tags:
        s2 = posterids[t2]
        # Boolean mask over t1's accounts that also occur in t2. The mask replaces
        # indexing the Series with a Python set, which pandas 2.0 no longer accepts.
        shared = s1.index.isin(s2.index)
        A.loc[t1,t2] = shared.sum()
        B.loc[t1,t2] = s1[shared].sum()

A

Unnamed: 0,SWZ_D1,SWZ_D2,PTD_ON,PTD_LA4,PTD_ON_Alt1,PTD_ON_Alt2
SWZ_D1,110093,20646,16590,6965,1475,3530
SWZ_D2,20646,53392,7679,3502,849,1837
PTD_ON,16590,7679,82601,13068,3181,7263
PTD_LA4,6965,3502,13068,41411,2224,5170
PTD_ON_Alt1,1475,849,3181,2224,7177,2018
PTD_ON_Alt2,3530,1837,7263,5170,2018,22199


In [67]:
# Number of the row dataset's tweets that were posted by accounts which also
# posted in the column dataset.
B

Unnamed: 0,SWZ_D1,SWZ_D2,PTD_ON,PTD_LA4,PTD_ON_Alt1,PTD_ON_Alt2
SWZ_D1,224733,74092,39501,17220,3619,8975
SWZ_D2,59753,111186,19111,9132,2532,4908
PTD_ON,67543,35609,228794,71384,28486,49333
PTD_LA4,24535,13878,55593,116352,17507,28396
PTD_ON_Alt1,3396,2247,7948,5811,14275,5707
PTD_ON_Alt2,8243,4712,18726,13646,7498,46808


In [68]:
# Pairwise overlap of retweeted accounts between datasets (row = t1, column = t2).
#   A[t1, t2]: fraction of the accounts retweeted in t1 that were also retweeted in t2.
#   B[t1, t2]: fraction of t1's retweets that are of those shared accounts.
tags = list(rtedids)
A = pd.DataFrame(index=tags, columns=tags, dtype=float)
B = A.copy()
for t1 in tags:
    s1 = rtedids[t1]  # retweets per retweeted account in t1
    for t2 in tags:
        s2 = rtedids[t2]
        # Boolean mask over t1's accounts that also occur in t2. The mask replaces
        # indexing the Series with a Python set, which pandas 2.0 no longer accepts.
        shared = s1.index.isin(s2.index)
        A.loc[t1,t2] = shared.sum()/len(s1)
        B.loc[t1,t2] = s1[shared].sum()/s1.sum()

A

Unnamed: 0,SWZ_D1,SWZ_D2,PTD_ON,PTD_LA4,PTD_ON_Alt1,PTD_ON_Alt2
SWZ_D1,1.0,0.231513,0.044575,0.033172,0.013822,0.016586
SWZ_D2,0.263987,1.0,0.06186,0.043735,0.019701,0.023641
PTD_ON,0.039534,0.048115,1.0,0.291143,0.131474,0.142813
PTD_LA4,0.031631,0.036573,0.313015,1.0,0.124217,0.137397
PTD_ON_Alt1,0.028674,0.035842,0.307527,0.270251,1.0,0.216487
PTD_ON_Alt2,0.031352,0.03919,0.304376,0.272371,0.197257,1.0


In [69]:
# Retweet-weighted overlap: fraction of the row dataset's retweets whose
# retweeted account was also retweeted in the column dataset.
B

Unnamed: 0,SWZ_D1,SWZ_D2,PTD_ON,PTD_LA4,PTD_ON_Alt1,PTD_ON_Alt2
SWZ_D1,1.0,0.907821,0.349339,0.300364,0.109645,0.131148
SWZ_D2,0.527515,1.0,0.492507,0.467672,0.2,0.221514
PTD_ON,0.605449,0.636276,1.0,0.935319,0.80834,0.880152
PTD_LA4,0.165668,0.180223,0.850364,1.0,0.783273,0.807572
PTD_ON_Alt1,0.069106,0.144259,0.837443,0.766531,1.0,0.806659
PTD_ON_Alt2,0.175638,0.249426,0.942492,0.940712,0.922956,1.0


In [70]:
# Same pairwise comparison of retweeted accounts as raw counts (row = t1, column = t2).
#   A[t1, t2]: number of accounts retweeted in both t1 and t2.
#   B[t1, t2]: number of t1's retweets that are of those shared accounts.
tags = list(rtedids)
A = pd.DataFrame(0, index=tags, columns=tags)
# B is created here as well, so this cell no longer depends on a B left over
# from an earlier cell.
B = A.copy()
for t1 in tags:
    s1 = rtedids[t1]  # retweets per retweeted account in t1
    for t2 in tags:
        s2 = rtedids[t2]
        # Boolean mask over t1's accounts that also occur in t2. The mask replaces
        # indexing the Series with a Python set, which pandas 2.0 no longer accepts.
        shared = s1.index.isin(s2.index)
        A.loc[t1,t2] = shared.sum()
        B.loc[t1,t2] = s1[shared].sum()

A

Unnamed: 0,SWZ_D1,SWZ_D2,PTD_ON,PTD_LA4,PTD_ON_Alt1,PTD_ON_Alt2
SWZ_D1,2894,670,129,96,40,48
SWZ_D2,670,2538,157,111,50,60
PTD_ON,129,157,3263,950,429,466
PTD_LA4,96,111,950,3035,377,417
PTD_ON_Alt1,40,50,429,377,1395,302
PTD_ON_Alt2,48,60,466,417,302,1531


In [71]:
# Number of the row dataset's retweets whose retweeted account was also
# retweeted in the column dataset.
B

Unnamed: 0,SWZ_D1,SWZ_D2,PTD_ON,PTD_LA4,PTD_ON_Alt1,PTD_ON_Alt2
SWZ_D1,190998,173392,66723,57369,20942,25049
SWZ_D2,47028,89150,43907,41693,17830,19748
PTD_ON,129821,136431,214421,200552,173325,188723
PTD_LA4,17836,19403,91551,107661,84328,86944
PTD_ON_Alt1,880,1837,10664,9761,12734,10272
PTD_ON_Alt2,7498,10648,40235,40159,39401,42690
