# Keyword searches in tweet content

Using the original (filtered) tweet databases collected during livestreamed BTS concerts in 2021, this notebook shows the search criteria and results for mentions of themes like covid. 

This notebook documents searches on data that cannot be shared openly. Reduced, depersonalised datasets are published elsewhere.

In [11]:
import sys
import os
import time
import datetime as dt
import math
import numpy as np 
import scipy as sp
import pandas as pd

In [12]:
# Import the local helper module twt (twt.py) through IPython's autoreload extension.
# Mode 1 reloads only the modules registered with %aimport, before each cell is run,
# so edits to twt.py are picked up without restarting the kernel.
# NOTE(review): twt is not referenced in the cells visible here — confirm it is still needed.
%load_ext autoreload
%autoreload 1
%aimport twt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
# Column-name -> dtype mapping for the tweet tables.
# dtype notation follows https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.astype.html
# NOTE(review): dtype_map is not used in the cells visible here; presumably intended
# for read_csv(dtype=...) / astype — confirm.

# Every column, in table order.
_tweet_columns = (
    'id', 'created_at', 'tweet', 'source', 'language',
    'user_id', 'user_screen_name', 'user_name', 'user_description',
    'user_language', 'user_location', 'user_created_at',
    'user_followers_count', 'user_friends_count',
    'user_statuses_count', 'user_favorites_count', 'user_verified',
    'in_reply_to_status_id', 'in_reply_to_user_id', 'in_reply_to_user_screen_name',
    'retweeted_status_id', 'retweeted_status_user_id',
    'retweeted_status_user_screen_name', 'retweeted_status_user_name',
    'retweeted_status_user_description',
    'retweeted_status_user_friends_count',
    'retweeted_status_user_statuses_count',
    'retweeted_status_user_followers_count',
    'retweeted_status_retweet_count', 'retweeted_status_favorite_count',
    'retweeted_status_reply_count',
    'quoted_status_id', 'quoted_status_user_id',
    'quoted_status_user_screen_name', 'quoted_status_user_name',
    'quoted_status_user_description',
    'quoted_status_user_friends_count', 'quoted_status_user_statuses_count',
    'quoted_status_user_followers_count', 'quoted_status_retweet_count',
    'quoted_status_favorite_count', 'quoted_status_reply_count',
)

# Identifier and counter columns ('id', '*_id', '*_count') get the nullable
# 'Int64' dtype; every other column is kept as str.
dtype_map = {
    col: 'Int64' if (col == 'id' or col.endswith(('_id', '_count'))) else str
    for col in _tweet_columns
}

In [15]:
# Root folder holding the raw tweet exports. Defaults to the original author's
# local path; set the BTS_TWT_RAW_DIR environment variable to run the notebook
# on another machine without editing this cell.
raw_dir = os.environ.get('BTS_TWT_RAW_DIR',
                         '/Users/finn/Desktop/Current_Projects/BTS_twitter/twt_Analysis/')

# One record per tweet collection. Keys:
#   tag             short identifier of the collection
#   raw_loc         sub-folder (relative to raw_dir) of the raw tweet file
#   fullfeilds_loc  folder of the full / fan tweet tables
#   dep_loc         folder of the reduced ("dep") tables
#   *_twt_db        file names of the raw, full, fan and reduced tweet tables
#   event_file, event_offset, event_reduction
#                   setlist file, offset string and list of integers
#                   (empty for the two non-concert comparison weeks)
#   Long_name, sampling, Program
#                   display name, sampling label and program code
concert_columns = ['tag', 'raw_loc', 'fullfeilds_loc', 'dep_loc',
                   'raw_twt_db', 'full_twt_db', 'fan_twt_db', 'dep_twt_db',
                   'event_file', 'event_offset', 'event_reduction',
                   'Long_name', 'sampling', 'Program']

concert_records = [
    {'tag': 'SWZ_D1', 'raw_loc': 'data/', 'fullfeilds_loc': '../StreamData/', 'dep_loc': './data/',
     'raw_twt_db': 'Fan_tweets_H_Sowoozoo_D1.csv', 'full_twt_db': 'All_Tweets_SWZ_D1.csv',
     'fan_twt_db': 'fan_Tweets_SWZ_D1.csv', 'dep_twt_db': 'fan_Tweets_SWZ_D1_reduced.csv',
     'event_file': 'Setlists_sowoozoo_D1.csv',
     'event_offset': '6MIN',
     'event_reduction': [1, 2, 3, 6, 8, 9, 10, 12, 13, 15, 16, 19, 20, 21, 22, 23, 25, 26, 27, 28],
     'Long_name': 'Sowoozoo Concert Day 1', 'sampling': '#SOWOOZOO', 'Program': 'SWZ'},
    {'tag': 'SWZ_D2', 'raw_loc': 'data/', 'fullfeilds_loc': '../StreamData/', 'dep_loc': './data/',
     'raw_twt_db': 'Fan_tweets_H_Sowoozoo_D2.csv', 'full_twt_db': 'All_Tweets_SWZ_D2.csv',
     'fan_twt_db': 'fan_Tweets_SWZ_D2.csv', 'dep_twt_db': 'fan_Tweets_SWZ_D2_reduced.csv',
     'event_file': 'Setlists_sowoozoo_D2.csv',
     'event_offset': '108S',
     'event_reduction': [1, 2, 3, 6, 8, 9, 10, 12, 13, 15, 16, 19, 20, 21, 22, 23, 25, 26, 27, 28],
     'Long_name': 'Sowoozoo Concert Day 2', 'sampling': '#SOWOOZOO', 'Program': 'SWZ'},
    {'tag': 'PTD_ON', 'raw_loc': 'data/PTD/', 'fullfeilds_loc': '../StreamData/', 'dep_loc': './data/',
     'raw_twt_db': 'FullPTD_Fan_tweets_PTD_ON_STAGE.csv', 'full_twt_db': 'All_Tweets_PTD_ON.csv',
     'fan_twt_db': 'fan_Tweets_PTD_ON.csv', 'dep_twt_db': 'fan_Tweets_PTD_ON_reduced.csv',
     'event_file': 'Setlists_PTD_ON.csv', 'event_offset': '25S',
     'event_reduction': [1, 2, 3, 6, 7, 8, 9, 11, 12, 14, 15, 17, 18, 21, 22, 28, 29, 32, 33, 34, 36, 37, 38, 39],
     'Long_name': 'Permission to Dance on Stage', 'sampling': 'Kpop Stream', 'Program': 'PTD'},
    {'tag': 'PTD_LA4', 'raw_loc': 'data/PTD/', 'fullfeilds_loc': '../StreamData/', 'dep_loc': './data/',
     'raw_twt_db': 'PTD_LA4_Fan_tweets_FULLSTREAM.csv', 'full_twt_db': 'All_Tweets_PTD_LA4.csv',
     'fan_twt_db': 'fan_Tweets_PTD_LA4.csv', 'dep_twt_db': 'fan_Tweets_PTD_LA4_reduced.csv',
     'event_file': 'Setlists_PTD_LA4.csv', 'event_offset': '40S',
     'event_reduction': [1, 2, 3, 6, 7, 8, 9, 11, 12, 14, 15, 17, 18, 21, 22, 28, 29, 32, 33, 34, 36, 37, 38, 39],
     'Long_name': 'Permission to Dance LA Day 4', 'sampling': 'Kpop Stream', 'Program': 'PTD'},
    {'tag': 'PTD_ON_Alt1', 'raw_loc': 'data/PTD/', 'fullfeilds_loc': '../StreamData/', 'dep_loc': './data/',
     'raw_twt_db': 'Alt1PTD_Fan_tweets_PTD_ON_STAGE.csv', 'full_twt_db': 'All_Tweets_PTD_ON_Alt1.csv',
     'fan_twt_db': 'fan_Tweets_PTD_ON_Alt1.csv', 'dep_twt_db': 'fan_Tweets_PTD_ON_Alt1_reduced.csv',
     'event_file': '', 'event_offset': '', 'event_reduction': [],
     'Long_name': 'Week prior to PTD On Stage', 'sampling': 'Kpop Stream', 'Program': ''},
    {'tag': 'PTD_ON_Alt2', 'raw_loc': 'data/PTD/', 'fullfeilds_loc': '../StreamData/', 'dep_loc': './data/',
     'raw_twt_db': 'Alt2PTD_Fan_tweets_PTD_ON_STAGE.csv', 'full_twt_db': 'All_Tweets_PTD_ON_Alt2.csv',
     'fan_twt_db': 'fan_Tweets_PTD_ON_Alt2.csv', 'dep_twt_db': 'fan_Tweets_PTD_ON_Alt2_reduced.csv',
     'event_file': '', 'event_offset': '', 'event_reduction': [],
     'Long_name': 'Week following PTD On Stage', 'sampling': 'Kpop Stream', 'Program': ''},
]

# Build the table in one step instead of enlarging an empty frame row by row;
# rows get the default integer index 0..5, so Concerts.loc[i, column] still works.
Concerts = pd.DataFrame(concert_records, columns=concert_columns)
Concerts

Unnamed: 0,tag,raw_loc,fullfeilds_loc,dep_loc,raw_twt_db,full_twt_db,fan_twt_db,dep_twt_db,event_file,event_offset,event_reduction,Long_name,sampling,Program
0,SWZ_D1,data/,../StreamData/,./data/,Fan_tweets_H_Sowoozoo_D1.csv,All_Tweets_SWZ_D1.csv,fan_Tweets_SWZ_D1.csv,fan_Tweets_SWZ_D1_reduced.csv,Setlists_sowoozoo_D1.csv,6MIN,"[1, 2, 3, 6, 8, 9, 10, 12, 13, 15, 16, 19, 20,...",Sowoozoo Concert Day 1,#SOWOOZOO,SWZ
1,SWZ_D2,data/,../StreamData/,./data/,Fan_tweets_H_Sowoozoo_D2.csv,All_Tweets_SWZ_D2.csv,fan_Tweets_SWZ_D2.csv,fan_Tweets_SWZ_D2_reduced.csv,Setlists_sowoozoo_D2.csv,108S,"[1, 2, 3, 6, 8, 9, 10, 12, 13, 15, 16, 19, 20,...",Sowoozoo Concert Day 2,#SOWOOZOO,SWZ
2,PTD_ON,data/PTD/,../StreamData/,./data/,FullPTD_Fan_tweets_PTD_ON_STAGE.csv,All_Tweets_PTD_ON.csv,fan_Tweets_PTD_ON.csv,fan_Tweets_PTD_ON_reduced.csv,Setlists_PTD_ON.csv,25S,"[1, 2, 3, 6, 7, 8, 9, 11, 12, 14, 15, 17, 18, ...",Permission to Dance on Stage,Kpop Stream,PTD
3,PTD_LA4,data/PTD/,../StreamData/,./data/,PTD_LA4_Fan_tweets_FULLSTREAM.csv,All_Tweets_PTD_LA4.csv,fan_Tweets_PTD_LA4.csv,fan_Tweets_PTD_LA4_reduced.csv,Setlists_PTD_LA4.csv,40S,"[1, 2, 3, 6, 7, 8, 9, 11, 12, 14, 15, 17, 18, ...",Permission to Dance LA Day 4,Kpop Stream,PTD
4,PTD_ON_Alt1,data/PTD/,../StreamData/,./data/,Alt1PTD_Fan_tweets_PTD_ON_STAGE.csv,All_Tweets_PTD_ON_Alt1.csv,fan_Tweets_PTD_ON_Alt1.csv,fan_Tweets_PTD_ON_Alt1_reduced.csv,,,[],Week prior to PTD On Stage,Kpop Stream,
5,PTD_ON_Alt2,data/PTD/,../StreamData/,./data/,Alt2PTD_Fan_tweets_PTD_ON_STAGE.csv,All_Tweets_PTD_ON_Alt2.csv,fan_Tweets_PTD_ON_Alt2.csv,fan_Tweets_PTD_ON_Alt2_reduced.csv,,,[],Week following PTD On Stage,Kpop Stream,


# Objections to being cited/used for research

The data collected was only of accounts that were public at the time of sampling, and by Twitter's terms of service and the Twitter API license used by the university, it was legal to collect these status updates and statistics. However, there have been times when Twitter users have been very concerned about the way their data is collected and converted into products for media and research, both commercial and academic. 

Out of respect for these concerns, we have taken measures to protect the users whose data we have studied, taking many steps to mitigate the risks of harmful visibility via the published datasets and the reporting on this research. 

One way in which Twitter users have tried to discourage (legal but unethical) citation and data collection is to mark their resistance in their Twitter bios. This is a field that we did not use in our research, but it is collected by the Twitter API, so we search the material we have for instances of such statements. 

In [17]:
# Read the raw tweet export for concert row i, then collect the distinct
# non-missing user bios (the 'user_description' field).
i = 0
raw_twt_path = raw_dir + Concerts.loc[i, 'raw_loc'] + Concerts.loc[i, 'raw_twt_db']
df_alltwt = pd.read_csv(raw_twt_path, lineterminator='\n', low_memory=False)

has_bio = df_alltwt['user_description'].notna()
df_bios = df_alltwt.loc[has_bio, :].copy()
twts = df_bios['user_description'].unique()
# Counts: all rows, rows with a bio, distinct bios.
print([len(df_alltwt), len(df_bios), len(twts)])

twts[:5]

[225993, 189371, 90454]


array(['I purple you 💜💜',
       '99line/BTS/🐤ペン/推しカプ→→🐰🐥/🦄🐭/🐨🐹/🐯🐣/rps/20↑📛 交換垢→@miichim_koukan',
       'เราจะรวยเมื่อไหร่คะ #อิงอิงจะเเจก',
       'Association BTS France 🇫🇷 Fanbase française dédiée à @BTS_twt et au fandom ARMY 🐋💜 Back Up @ProjetBtsFrance | Site http://www.btsfrance.fr/',
       '❥\n𝐼𝑛𝑠𝑡𝑎𝑔𝑟𝑎𝑚 : 𝑏𝑏𝑏𝑏𝑏𝑏𝑜𝑤_𝑏\n💜💙❤🧡💚💛🖤'], dtype=object)

In [18]:
# Example search: print every entry of twts (the unique bios from the previous
# cell) that contains a fixed phrase.
kw = '무단사용 및 도용금지'
twts = pd.Series(twts)
# Literal (non-regex), case-insensitive substring match.
is_match = twts.str.contains(kw, case=False, regex=False)
sub_twt = twts[is_match]
for bio in sub_twt:
    print(bio, end='\n\n')
    

7명 모두가 소듕한 아미입니다💜                                                                    주로 #어썸_작업 | 좋은날에 #어썸_이벵 |
후기는 #어썸한후기 | 무단사용 및 도용금지🙅‍♀️ | 나눔물품 금전거래 금지🚫



The above bio message is an example of usage restrictions expressed by twitter users in addition to the legal access granted by the API licensing agreement. In this case, the user requests that people do not repost the art they share as images in tweets. The API does not collect embedded images, only hyperlinks back to image locations, so our research use of this user's tweets is not in conflict with their expressed restrictions. 

A similar search was used to identify the 12 users who had made explicit requests in their bios not to have any tweet content used without explicit permission, with text strings like: "🚫 please do not cite my tweets w/o my express consent", "💥This acct DOES NOT consent to being used for research purposes 💥", and "DON'T USE MY Tweets/screenshot them without MY express consent!"

The ids of users identified were retained in the (unpublished) Exclude_accounts.csv file and filtered from the full tweet datasets in Depersonalising_data.ipynb 

# Covid traces

As covid and related themes did not show up prominently in the content analysis of tweet subsets, a keyword search was conducted across the fan tweet datasets to assess if these were only accidentally excluded. 


In [19]:
# Read the fan tweet table for concert row i and keep its tweet text column.
i = 0
fan_twt_path = Concerts.loc[i, 'fullfeilds_loc'] + Concerts.loc[i, 'fan_twt_db']
df_alltwt = pd.read_csv(fan_twt_path)
print(df_alltwt.columns)
twts = df_alltwt['tweet']
twts.head(5)


Index(['Unnamed: 0', 'id', 'created_at', 'tweet', 'source', 'language',
       'user_id', 'user_screen_name', 'user_name', 'user_description',
       'user_language', 'user_location', 'user_created_at',
       'user_followers_count', 'user_friends_count', 'user_statuses_count',
       'user_favorites_count', 'user_verified', 'in_reply_to_status_id',
       'in_reply_to_user_id', 'in_reply_to_user_screen_name',
       'retweeted_status_id', 'retweeted_status_user_id',
       'retweeted_status_user_screen_name', 'retweeted_status_user_name',
       'retweeted_status_user_description',
       'retweeted_status_user_friends_count',
       'retweeted_status_user_statuses_count',
       'retweeted_status_user_followers_count',
       'retweeted_status_retweet_count', 'retweeted_status_favorite_count',
       'retweeted_status_reply_count', 'quoted_status_id',
       'quoted_status_user_id', 'quoted_status_user_screen_name',
       'quoted_status_user_name', 'quoted_status_user_description',


0    RT @MygNamm01: 🎉แจกคอน Muster​ BTS2021​\n✨เพื่...
1    RT @97mmari: ยิ้มกวนมากโอ้ย55555555555\n#SOWOO...
2    RT @sawok_2minn: จีมินมาในแชท 😭 #SOWOOZOO \n🐥ผ...
3    RT @mxnt_JK: แจกอีกค่ะ แจก 50 ฿ \n\n ⊹🐻. 𖧷 🧺 ꕀ...
4    Happy 8th Anniversary @BTS_twt! \n사랑해요 💜 #BTS8...
Name: tweet, dtype: object

In [20]:
# Candidate keywords for covid-related themes in the tweet text.
kws = ['remote', 'virtual', 'pandemic', 'covid', 'lockdown', 'restriction']
# Only one keyword is searched per run of this cell; change the index to check the others.
kw = kws[0]
print(kw)
twts = pd.Series(twts)
# Literal, case-insensitive substring match. na=False makes rows with missing
# text count as non-matches; without it the mask contains NaN and .loc raises.
sub_twt = twts.loc[twts.str.contains(kw, case=False, regex=False, na=False)]
print(len(sub_twt))   # number of matching tweets (retweets of the same text included)
sub_twt = sub_twt.unique()
print(len(sub_twt))   # number of distinct matching texts

remote
1
1


In [21]:
# Show each distinct matching tweet, followed by a blank line.
for tweet_text in sub_twt:
    print(tweet_text, end='\n\n')

Comedians may lose their career #SOWOOZOO #BTSFESTA2021 #BTSMusterSoWooZoo2021 #8YearsToInfinityWithBTS #BTS @BTS_twt QT @mandakkoo: btw this i canttjsj when they asked army to turn on the phone flashlight then open the window at home to wave it outside, 
JM: *opened the window* oh! are u army too?
RM: no im searching for my remote—
JIN: who search remote outside by opening the window?! https://t.co/UpcmGklnRf

