In [83]:
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
import collections
%matplotlib inline

#Getting the data
con = sqlite3.connect('tweetsSpring.db')
data = pd.read_sql("SELECT * from tweets",con)

In [85]:
#Progress
from sys import stdout
def progress(i, n):
    stdout.write("\r%f%%" % (i*100/float(n)))
    stdout.flush()
    if i == n-1:
        stdout.write("\r100%")
        print("\r\n")

In [86]:
import json
import re
import pymorphy2
import Stemmer

class TweetTextParser():

    def __init__(self):
        print 'Invoking Processor...'
        self.morph = pymorphy2.MorphAnalyzer()
        self.stemmer = Stemmer.Stemmer('russian')
        try:
            self.emo_db = json.load(open('Hydra/pyalchemy/emoji_database','r'))
        except:
            print('No emoji database found')


    def processContents(self, myText):
        myText = re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}     /)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?¬´¬ª‚Äú‚Äù‚Äò‚Äô]))', '', myText)
        words = [word for word in re.findall(r'(?u)[@|#]?\w+', myText) if not word.startswith(('@','#'))]
        words = self.stemmer.stemWords(words)
        return words
        
    def resolveEmoji(self, myText):
        emostr = []
        emo_db = self.emo_db
        b = myText.encode('unicode_escape').split('\\')
        c = [point.replace('000','+').upper() for point in b if len(point) > 8 and point[0] == 'U']
        [emostr.append(emo_db[emo[:7]]) for emo in c if emo[:7] in emo_db]
        return myText

textProcessor = TweetTextParser()

Invoking Processor...


In [86]:
from time import time
morph = pymorphy2.MorphAnalyzer()
stemmer = Stemmer.Stemmer('russian')

#Cleaning up the data
t0 = time()
terms = []
n = len(data.index)
#n = 100000
for i in range(n):
    terms.append(textProcessor.processContents(data.content_lower[i]))
print time()-t0

149.473669052


In [88]:
data['terms'] = terms[:]

In [89]:
for row in data.index[:10]: 
    for word in data['terms'][row]: print word

Well
it
was
–°–£–ö–ê
My
universe
will
never
be
the
same
I
m
glad
you
came
–í–µ—Ç–æ—à–Ω
–ø–µ—Ä–µ—É–ª–æ–∫
—Ö–æ—á
–ø–µ—Ä–µ—Å–º–æ—Ç—Ä–µ—Ç
–≤—Å–µ
—á–∞—Å—Ç
–ì–∞—Ä
–ü–æ—Ç—Ç–µ—Ä
I
m
at
–ê–≤—Ç–æ–±—É—Å
293
—Ç–∞–∫
–∏
–∑–Ω–∞–ª
—á—Ç–æ
–≠–ª—å–î–∂–æ
–ø–æ–∫—Ä–∞—Å
–≤
—Å–≤–µ—Ç–ª
–µ—â—ë
–≤
–ø—Ä–æ—à–ª
—Ä–∞–∑
—Ä–∞–∑–≥–ª—è–¥–µ–ª
–∫–æ–Ω—á–∏–∫
–Ω–æ
–±–ª–∏–Ω
–≠–ª—å–î–∂–æ
–∑–∞–∫–∞–Ω—á–∏–≤–∞
–∫–æ—Å
–ø–æ–¥
–ö–∏–±–æ–º
—Å–ø–∞—Å–∏–±
–±—ã
—Å–∫–∞–∑–∞
—Å–≤–∏–Ω–æ—Ç
–û–ù
–û–ü–Ø–¢–¨
–î–†–ï–õ–ò–¢
–û–∫—É–ª–∏—Å—Ç
—Ñ–∏–≥
–ô–û–£


In [92]:
from collections import Counter
import nltk

words = []
for i in range(n):
    words += terms[i]

fdist = nltk.FreqDist(words)
sortedDist = sorted(fdist.items(), key=operator.itemgetter(1),reverse=True)

#sortedDist = [x for x in sortedDist if len(x[0]) > 2]
#allTheWords = [x for x in moreThan3 if x[1] > 1]
interestingVocab = [x[0] for x in sortedDist]
print 'Vocab Length: ', len(interestingVocab)

Vocab Length:  420982


In [93]:
for i in range(10):
    print interestingVocab[i]

–≤
–∏
–Ω–µ
—è
–Ω–∞
—á—Ç–æ
—Å
—ç—Ç
—Ç–∞–∫
–∫–∞–∫


In [94]:
#Find TF-IDF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

trainingList = []
for i in range(n):
    trainingList.append(' '.join(data['terms'][i]))

tfidf_vectorizer = TfidfVectorizer(vocabulary = interestingVocab)
tfidf_matrix_train = tfidf_vectorizer.fit_transform(trainingList)  #finds the tfidf score with normalization

In [95]:
import time
def findSimilarTweets(queryTweet, data):
    print 'Query:', queryTweet
    processedTweet = ' '.join(textProcessor.processContents(queryTweet))
    
    queryTweetRepresentation = tfidf_vectorizer.transform([processedTweet])
    start_time = time.time()
    cosine_similarities = cosine_similarity(queryTweetRepresentation, tfidf_matrix_train)[0]
    indices = cosine_similarities.argsort()[:-10:-1]
    elapsed_time = time.time() - start_time
    print 'time: ', elapsed_time
    
    print 'cosine scores ==>', cosine_similarities[indices]
    print ''
    print 'Results:'
    print data[indices]
    
    return indices

In [99]:
#before
inidiciesOfTweets = findSimilarTweets(u'–≤ –º–µ—Ç—Ä–æ –æ–±—ä—è–≤–ª—è—é—Ç —Å—Ç–∞–Ω—Ü–∏–∏',data['content'])

Query: –≤ –º–µ—Ç—Ä–æ –æ–±—ä—è–≤–ª—è—é—Ç —Å—Ç–∞–Ω—Ü–∏–∏
time:  0.81738615036
cosine scores ==> [ 0.80963556  0.76302223  0.76099306  0.75237399  0.74935803  0.73633315
  0.72306345  0.72114582  0.70909817]

Results:
496738                           @TimZhur –≤ –º–µ—Ç—Ä–æ –æ–±—ä—è–≤–ª—è—é—Ç
850138    –ê –≤ –ú–æ—Å–∫–æ–≤—Å–∫–æ–º –º–µ—Ç—Ä–æ —Å—Ç–∞–Ω—Ü–∏–∏ –æ–±—ä—è–≤–ª—è—é—Ç –∑–Ω–∞–º–µ–Ω–∏...
618510               –í –º–µ—Ç—Ä–æ —Å—Ç–∞–Ω—Ü–∏–∏ –æ–±—ä—è–≤–ª—è–µ—Ç –°—é—Ç–∫–∏–Ωüòèüöáüì£
245042    #–±—Å—Å—Ä #–±–µ–ª–∞—Ä—É—Å—å #—Å—Å—Å—Ä #–º–µ—Ç—Ä–æ–ø–æ–ª–∏—Ç–µ–Ω #–ú–æ—Å–∫–≤–∞ #–º...
573446    –ú–∏—Ö–∞–ª–∫–æ–≤ –≤ –º–µ—Ç—Ä–æ —Å—Ç–∞–Ω—Ü–∏–∏ –æ–±—ä—è–≤–ª—è–µ—Ç. –ú–µ—Ç—Ä–æ –º–æ–µ–π...
586347    –ß—Ç–æ –∑–∞ —Å—Ç—Ä–∞–Ω–Ω—ã–µ –≥–æ–ª–æ—Å–∞ —Ç–µ–ø–µ—Ä—å –æ–±—ä—è–≤–ª—è—é—Ç  —Å—Ç–∞–Ω—Ü...
689794         –•–æ—á—É —á—Ç–æ–±—ã –≤ –º–µ—Ç—Ä–æ —Å—Ç–∞–Ω—Ü–∏–∏ –æ–±—ä—è–≤–ª—è–ª –ø–∞—Ç—Ä–∏–∞—Ä—Ö
745157    –Ø —Ö–æ—á—É, —á—Ç–æ–±—ã —Å—Ç–∞–Ω—Ü–∏–∏ –≤ –º–µ—Ç—Ä–æ –æ–±—ä—è–≤–ª—è–ª –î–∂–µ–π–º—Å ...
554644          –í –º–µ—Ç—Ä–

In [None]:
len(data)

In [74]:
print 'Vocab Length: ', len(interestingVocab)

Vocab Length:  15268


In [75]:
print data['cleanText'][0]

–†–µ–±—è—Ç–∞, –º—ã –≤—Å–µ–º —Ä–∞–¥—ã!


In [53]:
len(trainigList)

673685

In [44]:
from sparselsh import LSH
from scipy.sparse import csr_matrix

help(LSH)

Help on class LSH in module sparselsh.lsh:

class LSH(__builtin__.object)
 |  LSH implments locality sensitive hashing using random projection for
 |  input vectors of dimension `input_dim`.
 |  
 |  Attributes:
 |  
 |  :param hash_size:
 |      The length of the resulting binary hash in integer. E.g., 32 means the
 |      resulting binary hash will be 32-bit long.
 |  :param input_dim:
 |      The dimension of the input vector. This can be found in your sparse
 |      matrix by checking the .shape attribute of your matrix. I.E.,
 |          `csr_dataset.shape[1]`
 |  :param num_hashtables:
 |      (optional) The number of hash tables used for multiple look-ups.
 |      Increasing the number of hashtables increases the probability of
 |      a hash collision of similar documents, but it also increases the
 |      amount of work needed to add points.
 |  :param storage_config:
 |      (optional) A dictionary of the form `{backend_name: config}` where
 |      `backend_name` is the eithe

In [45]:
from sparselsh import LSH
from scipy.sparse import csr_matrix

t0 = time()
lsh = LSH( 100,
           tfidf_matrix_train.shape[1],
           num_hashtables=1,
           storage_config={"dict":None})
print time()-t0

NameError: name 'tfidf_matrix_train' is not defined

In [None]:
for ix in xrange(tfidf_matrix_train.shape[0]):
    x = tfidf_matrix_train.getrow(ix)
    lsh.index(x)
    progress(ix,tfidf_matrix_train.shape[0])

6.470348%