# uSearch 
This search engine will allow inquiring minds to enter a query that will be matched against our Twitter corpus. This will also feature a query option of allowing the user to search for tweets from a specified user. 

# Imports
All needed libraries for this search engine to be functional.

In [24]:
import re
import csv
import math
import collections
import pandas as pd
import bokeh

# Erase Link
This function is part of the normalization effort. A full length tweet will be inserted, and a tweet without a link will be output. 

In [10]:
def erase_link(tweet):
    inx = tweet.find('http')
    #Check to see if tweet contains a link
    if inx != -1:
        #Find white space
        sp_inx = tweet.find(" ", inx)
        #Check if there is a white space after the link
        if sp_inx != -1:
            new_tweet = tweet[0:inx] + tweet[sp_inx + 1:]
        else:
            new_tweet = tweet[0:inx]
        return new_tweet
    return tweet

# Normalize
This function will normalize a tweet passed in. The output will be the tweet without any special characters, except a '@,' in all lowercase.

In [11]:
def normalize(tweet):
    #erase link
    normalizedTweet = erase_link(tweet)
    #print normalizedTweet
    normalizedTweet = re.compile('[^a-zA-Z@]').sub(' ', normalizedTweet)
    normalizedTweet = normalizedTweet.lower()
    normalizedTweet = " ".join(normalizedTweet.split())
    return normalizedTweet


This portion of the program will open a csv file containing all tweets that will comprise our corpus. Three lists will be created: one for the dates, one for the usernames, and one for the tweets. These three lists are parallel to each other.

In [12]:
with open("twitter-data/testdata.manual.2009.06.14.csv", 'rb') as csvfile:
    r = csv.reader(csvfile, delimiter=',', quotechar='"')
    docs = [ (x[2], x[4], x[5])  for x in r]
    
tweet_date = [ d[0] for d in docs ] #Tweet dates
tweet_user = [ d[1] for d in docs ] #Tweet users
tweet_text = [ d[2] for d in docs ] #Tweet text

#Normalize each tweet 
tweet_text = map(normalize, tweet_text)


# Create Corpus 
This function will generate a corpus as a dictionary. The key value is a the document id, while the value is the date, username, and text

In [13]:
def create_corpus():
    corpus = {}
    for x in range(0,len(tweet_text)):
        corpus[x] = [tweet_date[x], tweet_user[x],tweet_text[x]]
        #print corpus[x]



# Inverted Indexs

For the twitter data, there will be two inverted indexes. The first inverted index will have the username as the key and the tweets that correspond to that username as the values. The second inverted index will be the word index for our tweets; the key will be a word and the values will be the id of the documents that contain that word.

# Word Dictionary Reversed Index

The reversed index for the words in the corpus will be created by going through each word in each document. The reveresed index will allow for a faster retrieval of relevant documents. 

In [14]:
def create_inverted_index(corpus):
    idx = {}
    
    for i, doc in enumerate(corpus):
        # Iterate through each word in the document
        for word in doc.split():
            if word in idx:
                # Update the document's term frequency
                if i in idx[word]:
                    idx[word][i] += 1
                # Add the document to the word index
                else:
                    idx[word][i] = 1;
            # Add the word to the reversed index
            else:
                idx[word] = {i:1}
    
    return idx


In [15]:
'''
test_users = ["vcu451", "chadfu", "SIX15"]
test_corpus = ["reading my kindle2  love it lee childs is good read", 
               "ok, first assesment of the kindle2 ...it fucking rocks", 
               "fuck this economy I hate aig and their non loan given asses"]
'''

idx = create_inverted_index(tweet_text)


# User Tweet Index

The user tweet index will allow for the all tweets(documents) that belong to a username to be retrieved quickly.

In [16]:
def create_user_index(users):
    
    user_tweets = {}
    
    # Go through each of the tweets in the corpus
    for i in range( len(users) ):
        # When user already exists, add the document id to the existing user tweet list
        if users[i] in user_tweets:
            user_tweets[users[i] ].append(i)
        # Otherwise, creat a new list with the document id
        else:
            user_tweets[users[i] ] = [i]
            
    return user_tweets

In [17]:
user_tweet_index = create_user_index(tweet_user)

#user_tweet_index

# Document Ranking
To rank the documents, we will be implementing two different ranking algorithms: TF-IDF and BM25. 

# TF-IDF
The TF-IDF (term frequency–inverse document frequency) ranking algorithm ranks documents based on the term frequency of the words in the query in relation to the words in the documents.

In [18]:
def print_results(results, n, head=True):
    ''' Helper function to print results
    '''
    if head:
        print('\nTop %d from recall set of %d items:' % (n, len(results) ) )
        for r in results[:n]:
            print('\t%0.2f - %s' % (r[0], r[2],tweet_text[r[1]]))
    else:
        print('\nTop %d from recall set of %d items:' % (n, len(results) ) )
        for r in results[:n]:
            print('\t%0.2f - %s' % (r[0],r[2], tweet_text[r[1]]))

In [19]:
def idf(term, idx, n):
    # term - the term that is being scored
    # idx - the reversed index on the terms in the corpus
    # n - the number of docments the term appears in
    return math.log(float(n) / (1 + len(idx[term])))
    
print(idf('how', idx, len(tweet_user)))
print(idf('hate', idx, len(tweet_user)))
print(idf('sleep', idx, len(tweet_user)))
print(idf('whoopi', idx, len(tweet_user)))
print(idf('monkeys', idx, len(tweet_user)))

3.81270480423
3.32022831913
4.60116216459
5.51745289646
5.51745289646


In [21]:
def get_results_tfidf(qry, idx, n):
    score = collections.Counter()
    for term in qry.split():
        if term in idx:
            i = idf(term, idx, n)
            for doc in idx[term]:
                score[doc] += idx[term][doc] * i
    results=[]
    for x in [[r[0],r[1]] for r in zip(score.keys(), score.values())]:
        if x[1] > 0:
            results.append([x[1],x[0]])
    sorted_results= sorted(results, key=lambda t : t[0] * -1)
    return sorted_results
#results = get_results_tfidf('monkeys', idx, len(tweet_user))
#results = get_results_tfidf('hate', idx, len(tweet_user))
#results = get_results_tfidf('sleep', idx, len(tweet_user))
results = get_results_tfidf('hate', idx, len(tweet_user))

#print_results(results, 10)

# BM25
Implement the BM25 ranking algorithm to rank our results. This ranking algorithm is the one we will be using for the final version of our search engine.

In [26]:
def get_results_bm25(qry, corpus, k1=1.5, b=0.75):
    idx = create_inverted_index(corpus)
    
    # n - the length of the corpus
    n = len(corpus)
    
    # d - list with elements corresponding to the length of each document
    d = [len(x.split()) for x in corpus]
    
    # d_avg - the average document length of the docuemnts in the corpus
    d_avg = float(sum(d) / len(d))
    score = collections.Counter()
    for term in qry.split():
        if term in idx:
            i = idf(term, idx, n)
            for doc in idx[term]:
                # f - the number of times the term appears in the document
                f = float(idx[term][doc])
                # s - the BM25 score for this (term, docuemnt) pair
                s = i * ( (f * (k1 + 1) ) / (f + k1 * (1 - b + (b * (float(d[doc] ) / d_avg) ) ) ) )
                score[doc] += s
                
    results = []
    for x in [ [r[0], r[1], tweet_user[r[0]] ] for r in zip(score.keys(), score.values() )]:
        if x[1] > 0:
            results.append([ x[1], x[0], x[2]])
            
    sorted_results = sorted(results, key=lambda t: t[0] * -1)
    return sorted_results

results = get_results_bm25('hate', tweet_text, k1=1.5, b=0.75)
results
#print_results(results, 25)

[[4.884069946101312, 494, 'sam33r'],
 [4.591805122198973, 50, 'Trazor1'],
 [4.591805122198973, 406, 'cassieeeelove'],
 [4.382027223215264, 272, 'Timl9068'],
 [4.190579431909742, 408, 'b_bassi'],
 [4.015159827783288, 409, 'cmg11'],
 [3.7049758067528193, 93, 'jonwolpert'],
 [3.567187450303335, 105, 'jos897'],
 [3.4392803305713424, 6, 'Seth937'],
 [3.0078723448550764, 422, 'fugface85'],
 [2.830358567125925, 296, 'MrsGinobili'],
 [2.7492336400426973, 143, 'NDEddieMac'],
 [2.672629606728814, 97, 'enriquenieto'],
 [2.4046221809844206, 210, 'davepurcell'],
 [2.2898126338817164, 149, 'JasonNegron'],
 [2.1854667417048277, 170, 'CWilliams_Rltr'],
 [2.136780601419324, 162, 'sardonnica']]

# Visualize the Effectiveness of BM25

In [29]:
from bokeh.plotting import output_notebook, show
from bokeh.charts import Scatter

In [58]:
results = get_results_bm25('i hate tomatos', tweet_text, k1=5.0, b=0.0)

# Plot score vs item length
df = pd.DataFrame({'score':[float(x[0]) for x in results],
                   'length':[len(tweet_text[x[1]].split()) for x in results]})
output_notebook()
p = Scatter(df, x='score', y='length')
show(p)

<bokeh.io._CommsHandle at 0x7f6df5bf09d0>

# Get content based on username
This function will retrieve tweets in the corpus that belong to the specified user. These tweets will be ordered by date. 

In [None]:
def print_results_username(username_index, user_name):
    if(user_name in username_index):
        # All of the documents that correspond to the username 
        docs = username_index[user_name]
        # For every doc, print date, username, and tweet 
        for doc in docs:
            print tweet_date[doc] + " " + tweet_user[doc] + " " + tweet_text[doc]
    else:
        print "Username does not exist"
    
results = create_user_index(tweet_user)
print_results_username(results, 'SimpleManJess')

# Determine User Query
This will take in the user query and determine whether or not the user is wishin to search all tweet content or a specific user's content.

In [None]:
def determine_query(query):
    if(query[0] == '@' and len(query.split()) == 1):
        user_choice = raw_input(("Select your query preference",
                                "\n1. All content",
                                "\n2. User content"))
        return user_choice
    return '1'
    

In [None]:
def u_search(user_choice, query):
    if(user_choice == '1'):
        results = get_results_bm25(query, tweet_text)
        print_results(results,25)
    if(user_choice == '2'):
        results = create_user_index(tweet_user)
        print_results_username(results, query)
    

In [None]:
query = normalize("@kirstiealley")
#choice = determine_query(query)
u_search('1',query)