# RESEARCH QUESTION 2 - OUTPUT DIVERSIFICATION

In [1]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import csv
from collections import Counter
from config import *
import math

## 1. LOAD DATA

In [2]:
# Load tweets with cluster information already assigned

tweets = pd.read_csv("tweets_cluster.csv")
tweets.drop(columns = ["Unnamed: 0"], inplace = True)
tweets.head()

Unnamed: 0,created_at,id,id_str,text,source,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,...,filter_level,lang,timestamp_ms,display_text_range,extended_tweet,extended_entities,possibly_sensitive,withheld_in_countries,scopes,cluster
0,Sun Nov 22 10:54:27 +0000 2020,1330464657086275585,1330464657086275585,Respect @NIkosdN,"<a href=""https://mobile.twitter.com"" rel=""nofo...",False,,,,,...,low,en,1606042467744,,,,,,,1.0
1,Sun Nov 22 10:54:27 +0000 2020,1330464657069531137,1330464657069531137,@TheJasonPugh Putting out the message that 🇨🇦 ...,"<a href=""http://twitter.com/download/android"" ...",True,1.330229e+18,1.330229e+18,457136900.0,457136900.0,...,low,en,1606042467740,"[14, 140]","{'full_text': ""@TheJasonPugh Putting out the m...",,,,,0.0
2,Sun Nov 22 10:54:28 +0000 2020,1330464658650750978,1330464658650750978,@EleriTudor Boris Johnson gets to dictate how ...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",True,1.330464e+18,1.330464e+18,1618683000.0,1618683000.0,...,low,en,1606042468117,"[12, 140]","{'full_text': ""@EleriTudor Boris Johnson gets ...",,,,,1.0
3,Sun Nov 22 10:54:28 +0000 2020,1330464658235518977,1330464658235518977,#G20 #COVID19 is hitting every country in the ...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",True,,,,,...,low,en,1606042468018,"[0, 140]",{'full_text': '#G20 #COVID19 is hitting every ...,,False,,,2.0
4,Sun Nov 22 10:54:28 +0000 2020,1330464658940112896,1330464658940112896,"500,000 Nigerians to benefit from the Payroll ...","<a href=""https://mobile.twitter.com"" rel=""nofo...",True,,,,,...,low,en,1606042468186,,"{'full_text': '500,000 Nigerians to benefit fr...",,False,,,0.0


## 2. PREPROCESS TEXT

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\99per\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from collections import defaultdict
import re
from array import array
from numpy import linalg as la

In [5]:
def getTerms(text):
    """
    Preprocess the tweet text removing stop words, stemming, transforming to lowercase and return the tokens of 
    the text.
    
    Argument:
    text -- string (tweet) to be preprocessed
    
    Returns:
    text - a list of tokens (words) corresponding to the input tweet after the preprocessing
    """
    
    ## Remove Emojis
    text = text.encode('ascii', 'ignore').decode('ascii')

    ## Remove "RT"
    text = text.replace("RT ", "")
     
    ## Remove URLs, webpages
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', '', text)
    
    ## Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    
    ## Transform in lowercase
    text =  text.lower()

    ## Tokenize the text to get a list of terms
    text = text.split() 

    ## Remove stopwords
    stops = set(stopwords.words("english"))    
    text = [word for word in text if not word in stops] 
    
    ## Perform stemming
    stemming = PorterStemmer()
    text = [stemming.stem(word) for word in text] 
    
    return text

## 3. RANKING SCORE TF-IDF

Search engine based on the tweet text data.

In [6]:
def create_index_tfidf(ids, tweets, data, numDocuments):
    """
    Implement the inverted index for the tweet text and compute the tf, df and idf scores
    
    Argument:
    ids -- tweet ids of the corresponding tweets
    tweets -- collection of tweets
    numDocuments -- total number of tweets
    
    Returns:
    index - the inverted index (implemented through a python dictionary) containing terms as keys and the corresponding 
    list of document these keys appears in (and the positions) as values.
    
    tf - normalized term frequency for each term in each document
    
    df - number of documents each term appear in
    
    idf - inverse document frequency of each term
    """
        
    index = defaultdict(list)
    tf = defaultdict(list)        # term frequencies of terms in documents 
    df = defaultdict(int)         #document frequencies of terms in the corpus
    idf = defaultdict(float)
    tweets_terms = []
    
    for i in range(0, len(tweets)): 
        ## ===============================================================        
        ## create the index for the current doc and store it in termdictPage
        ## termdictPage ==> { ‘term1’: [currentdoc, [list of positions]], ..., ‘termn’: 
        ##                    [currentdoc, [list of positions]]}
        ## ===============================================================
        
        tweet = tweets[i]
        page_id = ids[i]
        
        termdictPage = {}
        
        terms = getTerms(tweet)
        data.loc[i, 'terms'] = " ".join(terms)
        
        ## Iterate over all terms in the tweet
        for position, term in enumerate(terms): 
            try:
                # if the term is already in the index for the current page (termdictPage)
                # append the position to the corresponding list
                termdictPage[term][1].append(position)  
            except:
                # Add the new term as dict key and initialize the array of positions and add the position
                termdictPage[term] = [page_id, array('I', [position])]
            
        ## Merge the current page index with the main index
        for termpage, postingpage in termdictPage.items():
            index[termpage].append(postingpage[0])
        
        # normalize term frequencies
        # Compute the denominator to normalize term frequencies
        # norm is the same for all terms of a document.
        norm = 0
        
        for term, posting in termdictPage.items(): 
            # posting is a list containing doc_id and the list of positions for current term in current document: 
            # posting ==> [currentdoc, [list of positions]] 
            norm += len(posting[1])**2
            
        norm = math.sqrt(norm)

        # calculate the tf(dividing the term frequency by the above computed norm) and df weights
        for term, posting in termdictPage.items():     
            
            # append the tf for current term (tf = term frequency in current doc/norm)
            tf[term].append(np.round(len(posting[1])/norm, 4))  
            
            # increment the document frequency of current term (number of documents containing the current term)
            df[term] += 1  # increment df for current term
            
    # Compute idf 
    for term in df:
        idf[term] = np.round(np.log(float(numDocuments/df[term])), 4)
            
    return index, tf, df, idf, data

In [7]:
%%time
numDocuments = len(tweets)
tweets['terms'] = ['']*numDocuments
index, tf, df, idf, tweets = create_index_tfidf(tweets['id'], tweets['text'], tweets, numDocuments)

Wall time: 39.9 s


In [8]:
tweets.head()

Unnamed: 0,created_at,id,id_str,text,source,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,...,lang,timestamp_ms,display_text_range,extended_tweet,extended_entities,possibly_sensitive,withheld_in_countries,scopes,cluster,terms
0,Sun Nov 22 10:54:27 +0000 2020,1330464657086275585,1330464657086275585,Respect @NIkosdN,"<a href=""https://mobile.twitter.com"" rel=""nofo...",False,,,,,...,en,1606042467744,,,,,,,1.0,respect nikosdn
1,Sun Nov 22 10:54:27 +0000 2020,1330464657069531137,1330464657069531137,@TheJasonPugh Putting out the message that 🇨🇦 ...,"<a href=""http://twitter.com/download/android"" ...",True,1.330229e+18,1.330229e+18,457136900.0,457136900.0,...,en,1606042467740,"[14, 140]","{'full_text': ""@TheJasonPugh Putting out the m...",,,,,0.0,thejasonpugh put messag need plan lockdown alm...
2,Sun Nov 22 10:54:28 +0000 2020,1330464658650750978,1330464658650750978,@EleriTudor Boris Johnson gets to dictate how ...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",True,1.330464e+18,1.330464e+18,1618683000.0,1618683000.0,...,en,1606042468117,"[12, 140]","{'full_text': ""@EleriTudor Boris Johnson gets ...",,,,,1.0,eleritudor bori johnson get dictat mani human ...
3,Sun Nov 22 10:54:28 +0000 2020,1330464658235518977,1330464658235518977,#G20 #COVID19 is hitting every country in the ...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",True,,,,,...,en,1606042468018,"[0, 140]",{'full_text': '#G20 #COVID19 is hitting every ...,,False,,,2.0,g20 covid19 hit everi countri world caus live ...
4,Sun Nov 22 10:54:28 +0000 2020,1330464658940112896,1330464658940112896,"500,000 Nigerians to benefit from the Payroll ...","<a href=""https://mobile.twitter.com"" rel=""nofo...",True,,,,,...,en,1606042468186,,"{'full_text': '500,000 Nigerians to benefit fr...",,False,,,0.0,500000 nigerian benefit payrol support scheme ...


### RANK DOCUMENTS ACCORDING TO TF-IDF

In [9]:
def rankDocuments(terms, docs, index, idf, tf):
    """
    Perform the ranking of the results of a search based on the tf-idf weights
    
    Argument:
    terms -- list of query terms
    docs -- list of documents, to rank, matching the query
    index -- inverted index data structure
    idf -- inverted document frequencies
    tf -- term frequencies
    
    Returns:
    Print the list of ranked documents
    """
        
    # I'm interested only on the element of the docVector corresponding to the query terms 
    # The remaing elements would became 0 when multiplied to the queryVector
    docVectors = defaultdict(lambda: [0]*len(terms)) 
    queryVector = [0]*len(terms)    

    # compute the norm for the query tf
    query_terms_count = Counter(terms) # get the frequency of each term in the query. 
    
    query_norm = la.norm(list(query_terms_count.values()))
    
    for termIndex, term in enumerate(terms): #termIndex is the index of the term in the query
        if term not in index:
            continue
                    
        ## Compute tf*idf(normalize tf as done with documents)
        queryVector[termIndex] = query_terms_count[term]/query_norm * idf[term]

        # Generate docVectors for matching docs
        for docIndex, doc in enumerate(index[term]):    
            if doc in docs:
                docVectors[doc][termIndex] = tf[term][docIndex] * idf[term]

    # calculate the score of each doc
    # compute the cosine similarity between queyVector and each docVector:
    docScores = [[np.dot(curDocVec, queryVector), doc] for doc, curDocVec in docVectors.items()]
    docScores.sort(reverse = True)
    resultDocs = [x[1] for x in docScores]
    
    #print document titles instead of document id's
    #resultDocs = [ titleIndex[x] for x in resultDocs ]
    if len(resultDocs) == 0:
        print("No results found, try again")
        print("Query: ", query)
        #query = input()
        #docs = search_tf_idf(query, index)    
    
    return resultDocs

In [10]:
def search_tf_idf(query, index):
    '''
    Outputs the list of documents that contain all of the query terms. 
    
    Argument:
    query -- list of query terms
    index -- inverted index data structure
    
    Returns:
    Return the list of top 10 documents
    
    '''
    query = getTerms(query)
    docs = set()
    
    # Return docs that contain all query terms
    for term in query:
        try:
            # store in termDocs the ids of the docs that contain "term"                        
            termDocs = [posting for posting in index[term]]

            # No documents had been stored yet
            if len(docs) == 0:
                docs = docs.union(termDocs)
            
            # Store only documents that are in both sets, meaning that they contained previous terms and current term
            else:
                docs = docs.intersection(termDocs)

        except:
            #term is not in index
            pass
    docs = list(docs)
    ranked_docs = rankDocuments(query, docs, index, idf, tf)   
    return ranked_docs

## 4. RERANKING APPROACH

In [11]:
def compute_similarity(d1_words, d2_words):
    d1_words = set(d1_words.split())
    d2_words = set(d2_words.split())
    sim = len(d1_words.intersection(d2_words))/len(d1_words.union(d2_words))
    return sim

In [12]:
## Compute similarity between tweets
M = np.zeros((len(tweets), len(tweets)))
print(M.shape)

for i in range(0, len(tweets)):

    for j in range(i + 1, len(tweets)):
        sim = compute_similarity(tweets.loc[i, 'terms'], tweets.loc[j, 'terms'])
        M[i][j] = sim
        M[j][i] = sim

(10000, 10000)
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900


In [13]:
sums = M.sum(axis = 0, keepdims = 1); 
sums[sums == 0] = 1
M_norm = M/sums

**Compute topic richness**

The higher the score a tweet's neighbour obtains, the higher its score. Similar to PageRank.

In [14]:
c = 0.99
mat = c*M_norm + ((1 - c)/numDocuments)*np.ones((numDocuments, numDocuments))

# Topic Richness is eigenvector with highest eigenvalue
eigvals, eigvecs = la.eigh(mat)

In [15]:
idx = eigvals.argsort()[::-1]   
eigvals = eigvals[idx]
eigvecs = eigvecs[:,idx]

In [16]:
tweets['topic_richness'] = eigvecs[0]

**Ranking strategy**

In [67]:
import ast

def add_query_results(tweets, docs, query):
    i = 1
    
    result = pd.DataFrame(columns = ['Query', 'Result', 'Tweet', 'Id', 'User', 'Date', 'Hashtags', 'Cluster', 
                                     'TopicRichness'])
    
    for doc_id in docs:
        tweet = tweets[tweets['id'] == doc_id]
        tid = tweets[tweets['id'] == doc_id].index.item()
        user = tweet['user'].item()
        user = ast.literal_eval(user)
        entities = tweet['entities'].item()
        entities = ast.literal_eval(entities)
        hashtags = [hashtag['text'] for hashtag in entities['hashtags']]
        
        result = result.append({'Query': query, 'Result': i, 'Tweet': tweet['text'].item(), 'Id': tid, 
                                'User': user['screen_name'], 'Date': tweet['created_at'].item(), 'Hashtags': hashtags, 
                                'Cluster': tweet['cluster'].item(), 'TopicRichness': tweet['topic_richness'].item(),
                                'Words': tweet['terms'].item()}, 
                               ignore_index = True)
        i = i + 1
        
    return result

In [68]:
queries = ["test", "coronavirus", "realdonaldtrump"]
results_ranking = pd.DataFrame()
top = 100

for query in queries:
    ranked_docs = search_tf_idf(query, index)
    results_ranking = results_ranking.append(add_query_results(tweets, ranked_docs[:top], query), ignore_index = True)

In [69]:
results_ranking.head()

Unnamed: 0,Query,Result,Tweet,Id,User,Date,Hashtags,Cluster,TopicRichness,Words
0,test,1,"""As the @WHO's Dr. Tedros said very emphatical...",5710,manigreeva,Sun Nov 22 11:12:43 +0000 2020,[],0.0,9.5e-05,who dr tedro said emphat outset test test test...
1,test,2,Loeffler tests positive for Covid but undergoi...,9774,bote930,Sun Nov 22 11:26:46 +0000 2020,[],0.0,0.001846,loeffler test posit covid undergo test
2,test,3,@mel_faith1 The thing is is there is NO test f...,6685,susanlee52,Sun Nov 22 11:16:15 +0000 2020,[],0.0,-0.000322,mel_faith1 thing test covid test viru
3,test,4,Testing. Testing. Covid/TheDanes: 'They kept i...,3034,Fuerza_Mundial,Sun Nov 22 11:04:34 +0000 2020,[],0.0,0.000736,test test covidthedan kept month18 mask wearer...
4,test,5,Loeffler tests positive for Covid but undergoi...,472,DeeFonta,Sun Nov 22 10:56:01 +0000 2020,[],0.0,0.005545,loeffler test posit covid undergo test


**Reranking strategy**

In [70]:
def add_result(result, i):
    return {'Query': result['Query'], 'RankRes': result['Result'], 'RerankRes': i, 'Tweet': result['Tweet'], 
            'User': result['User'], 'Date': result['Date'], 'Hashtags': result['Hashtags'], 
            'Cluster': result['Cluster'], 'TopicRichness': result['TopicRichness'], 'Words': result['Words']}

In [71]:
def reranking(results_ranking, M, top):

    result = pd.DataFrame(columns = ['Query', 'RankRes', 'RerankRes', 'Tweet', 'User', 'Date', 'Hashtags', 'Cluster', 
                                     'TopicRichness', 'Words'])
    
    max_ = len(results_ranking) if len(results_ranking) < top else top
    for i in range(0, max_):
        # Get the result with highest Topic Richness
        results_ranking = results_ranking.sort_values(by = ["TopicRichness"], ascending = False)
        results_ranking = results_ranking.reset_index()
        results_ranking.drop(columns = ['index'], inplace = True)
        
        #print(results_ranking)
        d_id = results_ranking.loc[0, 'Id']
        tr = results_ranking.loc[0, 'TopicRichness']
        
        result = result.append(add_result(results_ranking.iloc[0], i + 1), ignore_index = True)
        #result = result.append(results_ranking.iloc[0])
        results_ranking.drop(0, inplace = True)
        
        # Recompute Topic Richness
        for j in range(1, len(results_ranking)):
            n_id = results_ranking.loc[j, 'Id']
            results_ranking.loc[j, 'TopicRichness'] = results_ranking.loc[j, 'TopicRichness'] - M[n_id][d_id]*tr
            
    return result

In [72]:
results_reranking = pd.DataFrame()

for query in results_ranking['Query'].unique():
    results = results_ranking[results_ranking['Query'] == query]

    results_reranking = results_reranking.append(reranking(results, M_norm, 20))

In [73]:
results_ranking = results_ranking[results_ranking['Result'].isin(range(1, 21))]

**Diversity score**

In [74]:
def compute_words(texts_):
    
    words_docs = {}
    for _, row in texts_.iteritems():
        for word in row.split():
            
            if word not in words_docs.keys():
                words_docs[word] = 0
            
            words_docs[word] += 1
            
    return words_docs

In [75]:
DS = {}

for query in results_reranking['Query'].unique():
    DS_query = 0
    results = results_reranking[results_reranking['Query'] == query]
    
    df_results = compute_words(results['Words'])
    for _, row in results.iterrows():
        inter = 0
        for word in row['Words'].split():
            inter += 1/df_results[word]
        if len(row['Words'].split()) != 0:
            DS_query += inter/len(row['Words'].split())
    DS[query] = DS_query

In [76]:
DS

{'test': 12.579391605126897,
 'coronavirus': 13.735857142857142,
 'realdonaldtrump': 15.072341269841267}

## 5. RANKING DIFFERENCE AND COVERAGE

In [77]:
from scipy.stats import spearmanr

for query in results_ranking['Query'].unique():
    results_rank = results_ranking[results_ranking['Query'] == query]['Result']
    results_rerank = results_reranking[results_reranking['Query'] == query]['RankRes']
    
    corr, p = spearmanr(results_rank, results_rerank)
    print(query, corr)

test 0.13984962406015036
coronavirus 0.018045112781954885
realdonaldtrump 0.03909774436090225


In [78]:
for query in results_ranking['Query'].unique():
    print(query)
    print('\t Ranking coverage: ' + str(results_ranking[results_ranking['Query'] == query]['Cluster'].nunique()))
    print('\t Re-ranking coverage: ' + str(results_reranking[results_reranking['Query'] == query]['Cluster'].nunique()))
    print()

test
	 Ranking coverage: 1
	 Re-ranking coverage: 2

coronavirus
	 Ranking coverage: 1
	 Re-ranking coverage: 2

realdonaldtrump
	 Ranking coverage: 2
	 Re-ranking coverage: 3



In [79]:
def print_results_info(results, rerank = True):
    
    for _, row in results.iterrows():
        if rerank:
            print(str(row['RerankRes']) + '. \tTweet: ' + row['Tweet'] + '\t User: ' + row['User'] + 
                  '\t Date: ' + row['Date'] + '\t Hashtags: ' + str(row['Hashtags']) + '\t Cluster: ' + 
                  str(row['Cluster']) + '\t TopicRichness ' + str(row['TopicRichness']))
        else:
            print(str(row['Result']) + '. \tTweet: ' + row['Tweet'] + '\t User: ' + row['User'] + 
                  '\t Date: ' + row['Date'] + '\t Hashtags: ' + str(row['Hashtags']) + '\t Cluster: ' + 
                  str(row['Cluster']) + '\t TopicRichness ' + str(row['TopicRichness']))
            

In [80]:
# Ranking Results for test
print_results_info(results_ranking[results_ranking['Query'] == "test"], rerank = False)

1. 	Tweet: "As the @WHO's Dr. Tedros said very emphatically at the very outset: “Test, test, test”. As a physician, I wish he’… https://t.co/n3Dn2NwnlE	 User: manigreeva	 Date: Sun Nov 22 11:12:43 +0000 2020	 Hashtags: []	 Cluster: 0.0	 TopicRichness 9.512778224714583e-05
2. 	Tweet: Loeffler tests positive for Covid but undergoing further testing
https://t.co/5ulPDNmhne	 User: bote930	 Date: Sun Nov 22 11:26:46 +0000 2020	 Hashtags: []	 Cluster: 0.0	 TopicRichness 0.0018463648458519586
3. 	Tweet: @mel_faith1 The thing is is there is NO test for Covid! The test is for a virus only	 User: susanlee52	 Date: Sun Nov 22 11:16:15 +0000 2020	 Hashtags: []	 Cluster: 0.0	 TopicRichness -0.00032220929354842965
4. 	Tweet: Testing. Testing. Covid/TheDanes: 'They kept it up for a month.1.8% of mask wearers tested positive &amp; 2.1% of the u… https://t.co/Oc5tGJDuuH	 User: Fuerza_Mundial	 Date: Sun Nov 22 11:04:34 +0000 2020	 Hashtags: []	 Cluster: 0.0	 TopicRichness 0.0007356799139667318
5. 	Tweet

In [81]:
# Reranking Results for test
print_results_info(results_reranking[results_reranking['Query'] == "test"], rerank = True)

1. 	Tweet: Loeffler tests positive for Covid but undergoing further testing
https://t.co/c06EbCUbZA	 User: DeeFonta	 Date: Sun Nov 22 10:56:01 +0000 2020	 Hashtags: []	 Cluster: 0.0	 TopicRichness 0.005544894615846776
2. 	Tweet: Jesus H Christ it's worse than I thought! Never mind a covid test, this guy needs to have an IQ test twice a week.	 User: PrimLikeARose	 Date: Sun Nov 22 10:54:42 +0000 2020	 Hashtags: []	 Cluster: 0.0	 TopicRichness 0.005485703194656347
3. 	Tweet: Breaking: Loeffler self-isolating after mixed coronavirus test results https://t.co/x2YfWTfJb5	 User: JClickbaited	 Date: Sun Nov 22 10:57:43 +0000 2020	 Hashtags: []	 Cluster: 0.0	 TopicRichness 0.0023553949569966384
4. 	Tweet: Loeffler tests positive for Covid but undergoing further testing
https://t.co/5ulPDNmhne	 User: bote930	 Date: Sun Nov 22 11:26:46 +0000 2020	 Hashtags: []	 Cluster: 0.0	 TopicRichness 0.0018099505249776567
5. 	Tweet: COVID testing workers needed for mass departure testing programme.
Find out