### User input

In [None]:
# Mount Google Drive and point `drive` at the folder that holds the data files.
# The Colab module is imported under an alias so that the `drive` path string
# assigned below does not overwrite (shadow) the module object.
from google.colab import drive as colab_drive

# Mount at the absolute location the data path below is rooted in, so mount
# point and path agree regardless of the current working directory.
colab_drive.mount('/content/drive')
# select the appropriate google drive path for your account:
# drive = 'drive/My Drive/Spring 2021/Stringer/Data/'
drive = '/content/drive/MyDrive/Stringer/Data/'

Mounted at drive


In [None]:
# Ingest the user's query and turn it into a clean list of search terms.
# Terms are comma-separated. Surrounding whitespace is ignored (so both
# "a, b" and "a,b" work), empty entries are dropped, and every term is
# lower-cased because the hashtags it is matched against are lower-cased too.
rawQuery = input("Please enter a list of query terms\n")
searchTerm = [term.strip().lower() for term in rawQuery.split(',') if term.strip()]
print("\nInput Values: ", searchTerm)

# for testing
# searchTerm = ['vaccine', 'covid']

In [None]:
# --- run configuration ---
# minimum Jaccard score (inclusive) a hashtag needs to become a new search term
thresh = 0.03
# CSV with previously collected data that is searched for hashtags to expand by
priorData = 'search15000_covid.csv'
# file name the Jaccard scores are written to
jaccardOutput = 'Jaccard.csv'

### Setup

In [None]:
import pandas as pd
from collections import defaultdict

In [None]:
# Read the prior data and turn the string-encoded `hashtags` column into real
# Python lists of lower-cased hashtags, keeping only rows that have hashtags.
from ast import literal_eval

data = pd.read_csv(drive + priorData)
print(data.shape)

# Keep rows that actually carry hashtags. Missing values are excluded as well,
# because literal_eval cannot parse NaN.
has_hashtags = data['hashtags'].notna() & (data['hashtags'] != '[]')
# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of the original (SettingWithCopyWarning).
data = data[has_hashtags].copy()

# clean hashtags: parse the list literal first, then lower-case each tag, so
# escape sequences inside the serialized text are never altered before parsing
data['hashtags'] = data['hashtags'].apply(literal_eval).apply(
    lambda tags: [tag.lower() for tag in tags]
)
print(data.shape)
data.head(2)

(15000, 12)
(2235, 12)


Unnamed: 0,tweet_id,user,created_on,tweet_text,fav_count,user_description,user_verified,user_follower_count,is_retweet,hashtags,is_quote,quote_text
7,1380080898230587392,Twilogsteps,2021-04-08 08:51:41,The latest Twilog steps! https://t.co/67dLDCXV...,0,█►Για όλα και όχι για όλους◄█ 🧭️,False,336,1,"[digital, covid_19]",0,
10,1380080897865744385,PaulineLeeson,2021-04-08 08:51:41,"From 12 noon today, people aged 40-44 will be ...",0,Chief Executive of Children in Northern Irelan...,False,1044,1,[covid19],0,


In [None]:
# per-row hashtag lists taken from the prior data
lst = data['hashtags'].tolist()
# merge all of them into a single set, which drops repeated hashtags
related_hashtags = set().union(*lst)
len(related_hashtags)

1475

### Jaccard Similarity

In [None]:
# inverted index: for each hashtag, the index labels of the rows it appears in
seeds = related_hashtags
tokenOccurences = defaultdict(list)

for row_label, row_tags in data['hashtags'].items():
  for tag in row_tags:
    tokenOccurences[tag].append(row_label)

In [None]:
# how often was each hashtag used: number of distinct rows that contain it
hashtag_popularity = pd.DataFrame(
    [{'hashtag': seed, 'popularity': len(set(tokenOccurences[seed]))} for seed in seeds],
    # explicit columns keep the frame sortable even when there are no seeds
    columns=['hashtag', 'popularity'],
)
hashtag_popularity = hashtag_popularity.sort_values(by='popularity', ascending=False)
# The filter is strictly greater than 10, so the label states exactly that
# (the previous label '10+ times' suggested that 10 itself was included).
print('more than 10 times:', len(hashtag_popularity[hashtag_popularity['popularity'] > 10]))
print('number of hashtags:', len(hashtag_popularity))
hashtag_popularity.head(5)

10+ times: 58
number of hashtags: 1475


Unnamed: 0,hashtag,popularity
1116,covid19,578
661,coronavirus,242
539,vaccine,197
904,covid,192
443,maharashtra,104


In [None]:
# Jaccard similarity between every seed hashtag and each hashtag it co-occurs
# with. Result: `tokenJaccard`, a list of {'seed', 'hashtag', 'jaccard'} records.
tokenJaccard = []
cooccuredTokens = set()

# row label -> hashtag list of that row, so the co-occurring hashtags of a seed
# can be read from the rows recorded in tokenOccurences instead of rescanning
# the whole frame for every seed.
# NOTE(review): assumes index labels are unique, as tokenOccurences already does.
hashtagsByRow = data['hashtags'].to_dict()

for seed in seeds:
  # rows in which the seed appears
  sents = set(tokenOccurences[seed])

  # Hashtags that share at least one row with this seed (the seed itself is
  # included, which yields the seed/seed pair with a score of 1.0).
  # The set is rebuilt for every seed: one set shared by all iterations keeps
  # growing and pairs a seed with hashtags that only co-occurred with seeds
  # processed earlier, adding meaningless 0.0 rows.
  cooccuredTokens = set()
  for rowLabel in sents:
    cooccuredTokens.update(hashtagsByRow[rowLabel])

  # sorted() only fixes the order of the emitted records between runs
  for tok in sorted(cooccuredTokens):
    # index of docs that the co-occurring token appears in
    sents1 = set(tokenOccurences[tok])
    # jaccard: |d(seed) intersection d(token)| / |d(seed) union d(token)|
    jaccard = len(sents & sents1) / float(len(sents | sents1))
    tokenJaccard.append({'seed': seed, 'hashtag': tok, 'jaccard': jaccard})

### Relevant Jaccard

In [None]:
# turn the collected records into a table, highest Jaccard score first
tokenJaccard = pd.DataFrame(tokenJaccard).sort_values(by='jaccard', ascending=False)

# keep only the rows whose seed is one of the user's search terms
seedIsSearchTerm = tokenJaccard['seed'].isin(searchTerm)
tokenJaccardSearch = tokenJaccard.loc[seedIsSearchTerm]
print(len(tokenJaccardSearch))
tokenJaccardSearch.head(20)

2421


Unnamed: 0,seed,hashtag,jaccard
390119,vaccine,vaccine,1.0
835745,covid,covid,1.0
389795,vaccine,maharashtra,0.308696
389835,vaccine,astrazeneca,0.134783
389632,vaccine,covid19,0.129738
390432,vaccine,vaccination,0.118483
390346,vaccine,ecogiftsday,0.111675
390534,vaccine,snow,0.111675
390625,vaccine,gift,0.111675
390447,vaccine,thursdayvibes,0.111675


In [None]:
# Write the Jaccard scores for the search terms to the output file in the data
# folder. index=False is the documented way to leave the row index out.
tokenJaccardSearch.to_csv(drive + jaccardOutput, index=False)

In [None]:
# keep hashtags with a score at or above the threshold
tokenJaccardThreshold = tokenJaccardSearch[tokenJaccardSearch['jaccard'] >= thresh]
# A hashtag can pass the threshold for more than one search term and would then
# be listed several times. drop_duplicates() keeps the first occurrence of each
# hashtag and preserves the existing row order.
extended_hashtag = tokenJaccardThreshold['hashtag'].drop_duplicates().tolist()
extended_hashtag

['vaccine',
 'covid',
 'maharashtra',
 'astrazeneca',
 'covid19',
 'vaccination',
 'ecogiftsday',
 'snow',
 'gift',
 'thursdayvibes',
 'giveaway',
 'ukgiftam',
 'thursdaymorning',
 'ukgifthour',
 'sales',
 'ukweather',
 'summerof2021',
 'covidー19',
 'maskup',
 'covid_19',
 'corona',
 'coronavirus',
 'corona',
 'india',
 'school',
 'nightcurfew',
 'tamilnadu',
 'covidvaccine',
 'kgmu',
 'iitroorkee',
 'lucknow',
 'vaccineforyoungistaan',
 'mumbai',
 'coronavirus',
 'tngovt',
 'pandemic']