In [1]:
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
import collections
%matplotlib inline

# Load every row of the `tweets` table from the local SQLite file into a DataFrame.
# NOTE(review): `con` stays open; no visible cell closes it.
con = sqlite3.connect('tweetsSpring.db')
data = pd.read_sql(sql="SELECT * from tweets", con=con)

In [2]:
#Progress
from sys import stdout
def progress(i, n):
    """Overwrite the current console line with the completion percentage.

    Args:
        i: zero-based index of the step that just finished.
        n: total number of steps; ``n == 0`` raises ZeroDivisionError.

    On the final step (``i == n - 1``) the line is replaced with ``100%``
    and a line break is printed.
    """
    percent_done = i * 100 / float(n)
    stdout.write("\r%f%%" % percent_done)
    stdout.flush()
    if i != n - 1:
        return
    # Last step: show a clean 100% and move to a fresh line.
    stdout.write("\r100%")
    print("\r\n")

In [4]:
import json
import re
import pymorphy2
import Stemmer

class TweetTextParser():
    """Tokenise and stem tweet text, optionally appending emoji descriptions.

    Attributes:
        morph: ``pymorphy2.MorphAnalyzer`` instance (created here; the
            methods of this class do not use it).
        stemmer: ``Stemmer.Stemmer('russian')`` instance used for stemming.
        emo_db: mapping loaded from the ``emoji_database`` JSON file; an
            empty dict when the file is missing or not valid JSON.
    """

    def __init__(self):
        print('Invoking Processor...')
        self.morph = pymorphy2.MorphAnalyzer()
        self.stemmer = Stemmer.Stemmer('russian')
        # Always define emo_db so resolveEmoji degrades to a no-op instead of
        # raising AttributeError when the database could not be loaded.
        self.emo_db = {}
        try:
            with open('emoji_database', 'r') as emoji_file:
                self.emo_db = json.load(emoji_file)
        except (IOError, OSError, ValueError):
            # Missing/unreadable file or malformed JSON.
            print('No emoji database found')


    def processContents(self, myText):
        """Return the list of stemmed tokens extracted from ``myText``.

        ',', '.', '?', '!' and '|' are replaced by spaces, the text is split
        on any whitespace (spaces, tabs, newlines), every token is stemmed,
        and stems that are empty or start with '@', 'http', 'co/' or '#'
        are dropped.
        """
        #myText = self.resolveEmoji(myText)
        # A regex-based URL stripper used to be applied here; URL fragments are
        # now removed by the prefix filter below ('http', 'co/').
        # Inside a character class '|' is a literal, so pipes are blanked too.
        myText = re.sub('[,|.|?|!]',' ',myText)
        # split() without an argument also breaks on tabs/newlines and never
        # yields empty strings, unlike split(' ').
        stems = self.stemmer.stemWords(myText.split())
        return [word for word in stems
                if word and not word.startswith(('@','http','co/','#'))]

    def resolveEmoji(self, myText):
        """Return ``myText`` followed by a space and the descriptions of known emoji.

        Each ``\\UXXXXXXXX`` escape in the text is turned into a key of the form
        'U+XXXXX' (first seven characters) and looked up in ``self.emo_db``;
        matches are appended space-separated. The separating space is added
        even when nothing matches.
        """
        emo_db = self.emo_db
        # unicode_escape yields bytes on Python 3; decoding back to text keeps
        # the backslash split working on both Python 2 and Python 3.
        escaped = myText.encode('unicode_escape').decode('ascii')
        pieces = escaped.split('\\')
        codes = [point.replace('000','+').upper() for point in pieces if len(point) > 8 and point[0] == 'U']
        emostr = [emo_db[emo[:7]] for emo in codes if emo[:7] in emo_db]
        return myText + ' ' +' '.join(emostr)

# Single parser instance reused by the cleaning loop and by findSimilarTweets.
textProcessor = TweetTextParser()

Invoking Processor...


In [None]:
# NOTE(review): a later cell runs `import time`, rebinding this name to the module.
from time import time
# NOTE(review): these duplicate the analyser/stemmer that textProcessor already
# holds and are not used by any visible cell.
morph = pymorphy2.MorphAnalyzer()
stemmer = Stemmer.Stemmer('russian')


#Cleaning up the data
t0 = time()
n = len(data.index)
#n = 100000
# Walk the first n rows positionally: avoids one label lookup per row and does
# not depend on the DataFrame having a default 0..n-1 index.
terms = [textProcessor.processContents(text) for text in data.content_lower.iloc[:n]]
# Elapsed seconds for the whole tokenise-and-stem pass.
print(time()-t0)

In [90]:
# Store a shallow copy of the per-tweet token lists as a new DataFrame column.
data['terms'] = list(terms)

In [91]:
# Preview the stemmed tokens of the first ten tweets, one token per line.
for tokens in data['terms'].iloc[:10]:
    for word in tokens:
        print(word)

Well
it
was
–°–£–ö–ê
My
universe
will
never
be
the
same
I'm
glad
you
came
‚ú®üåå
–í–µ—Ç–æ—à–Ω
–ø–µ—Ä–µ—É–ª–æ–∫
—Ö–æ—á
–ø–µ—Ä–µ—Å–º–æ—Ç—Ä–µ—Ç
–≤—Å–µ
—á–∞—Å—Ç
–ì–∞—Ä
–ü–æ—Ç—Ç–µ—Ä
I'm
at
–ê–≤—Ç–æ–±—É—Å
293
—Ç–∞–∫
–∏
–∑–Ω–∞–ª
—á—Ç–æ
–≠–ª—å–î–∂–æ
–ø–æ–∫—Ä–∞—Å
–≤
—Å–≤–µ—Ç–ª
–µ—â—ë
–≤
–ø—Ä–æ—à–ª
—Ä–∞–∑
—Ä–∞–∑–≥–ª—è–¥–µ–ª
–∫–æ–Ω—á–∏–∫
–Ω–æ
–±–ª–∏–Ω
–≠–ª—å–î–∂–æ
–∑–∞–∫–∞–Ω—á–∏–≤–∞
–∫–æ—Å
–ø–æ–¥
–ö–∏–±–æ–º
—Å–ø–∞—Å–∏–±
–±—ã
—Å–∫–∞–∑–∞
—Å–≤–∏–Ω–æ—Ç
–û–ù
–û–ü–Ø–¢–¨
–î–†–ï–õ–ò–¢
–û–∫—É–ª–∏—Å—Ç
—Ñ–∏–≥
(^‚óá^;)
–ô–û–£‚òÄÔ∏è


In [134]:
from collections import Counter
import nltk

# Flatten the per-tweet token lists into one corpus-wide token sequence.
words = []
for tweet_terms in terms[:n]:
    words.extend(tweet_terms)

# Count occurrences and order the (token, count) pairs by descending count.
fdist = nltk.FreqDist(words)
sortedDist = sorted(fdist.items(), key=operator.itemgetter(1),reverse=True)

#moreThan3 = [x for x in sortedDist if len(x[0]) > 3]
#allTheWords = [x for x in moreThan3 if x[1] > 1]
# Vocabulary = every distinct token, most frequent first (no filtering applied).
interestingVocab = [x[0] for x in sortedDist]
print('Vocab Length:  %d' % len(interestingVocab))

Vocab Length:  813042


In [135]:
# Show the ten most frequent stems (fewer if the vocabulary is smaller).
for word in interestingVocab[:10]:
    print(word)

в
и
не
я
на
что
эт
с
так
in


In [136]:
#Find TF-IDF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# One space-joined document per tweet, in row order.
trainingList = [' '.join(data['terms'][i]) for i in range(n)]

# NOTE(review): TfidfVectorizer's defaults lowercase the input and keep only
# tokens of two or more word characters, so vocabulary entries that are
# single-character or contain capitals may never be matched — confirm intended.
tfidf_vectorizer = TfidfVectorizer(vocabulary=interestingVocab)
# Fit on the corpus and get the tweet-by-term TF-IDF matrix.
tfidf_matrix_train = tfidf_vectorizer.fit_transform(trainingList)

In [137]:
import time
def findSimilarTweets(queryTweet, data, top_n=9):
    """Print and return the corpus rows most similar to ``queryTweet``.

    The query is tokenised/stemmed with ``textProcessor``, vectorised with the
    fitted ``tfidf_vectorizer`` and compared by cosine similarity against
    every row of ``tfidf_matrix_train``.

    Args:
        queryTweet: raw query text.
        data: sequence aligned row-for-row with ``tfidf_matrix_train`` whose
            entries are displayed as results (e.g. a pandas Series of texts).
        top_n: number of best matches to report. The default of 9 matches the
            original ``[:-10:-1]`` slice, which yields nine rows.

    Returns:
        Array of row positions of the best matches, most similar first.
    """
    # Imported locally under an alias: another cell binds the bare name `time`
    # to the time.time function, which would break `time.time()` here.
    import time as time_module

    print('Query: %s' % (queryTweet,))
    processedTweet = ' '.join(textProcessor.processContents(queryTweet))

    queryTweetRepresentation = tfidf_vectorizer.transform([processedTweet])
    start_time = time_module.time()
    cosine_similarities = cosine_similarity(queryTweetRepresentation, tfidf_matrix_train)[0]
    # argsort is ascending, so reverse it and keep the leading top_n positions.
    indices = cosine_similarities.argsort()[::-1][:top_n]
    elapsed_time = time_module.time() - start_time
    print('time:  %s' % (elapsed_time,))

    print('cosine scores ==> %s' % (cosine_similarities[indices],))
    print('')
    print('Results:')
    # `indices` are row positions, so select positionally when `data` is a
    # pandas object; plain arrays are indexed directly.
    matches = data.iloc[indices] if hasattr(data, 'iloc') else data[indices]
    print(matches)

    return indices

In [200]:
# "before" run: query the TF-IDF index with a sample phrase and display matches
# from the `content` column (presumably a baseline for a later comparison — TODO confirm).
inidiciesOfTweets = findSimilarTweets('–≤ –º–µ—Ç—Ä–æ –æ–±—ä—è–≤–ª—è—é—Ç —Å—Ç–∞–Ω—Ü–∏–∏',data['content'])

Query: –≤ –º–µ—Ç—Ä–æ –æ–±—ä—è–≤–ª—è—é—Ç —Å—Ç–∞–Ω—Ü–∏–∏
time:  0.799732923508
cosine scores ==> [ 0.80935316  0.76365077  0.76228469  0.75323305  0.75323305  0.75323305
  0.74776796  0.7362765   0.7246192 ]

Results:
496738                           @TimZhur –≤ –º–µ—Ç—Ä–æ –æ–±—ä—è–≤–ª—è—é—Ç
850138    –ê –≤ –ú–æ—Å–∫–æ–≤—Å–∫–æ–º –º–µ—Ç—Ä–æ —Å—Ç–∞–Ω—Ü–∏–∏ –æ–±—ä—è–≤–ª—è—é—Ç –∑–Ω–∞–º–µ–Ω–∏...
618510               –í –º–µ—Ç—Ä–æ —Å—Ç–∞–Ω—Ü–∏–∏ –æ–±—ä—è–≤–ª—è–µ—Ç –°—é—Ç–∫–∏–Ωüòèüöáüì£
851352    üôà‚òÄÔ∏èüéâüá∑üá∫üòâ @ –°—Ç–∞–Ω—Ü–∏—è –º–µ—Ç—Ä–æ "–ü—É—à–∫–∏–Ω—Å–∫–∞—è" http...
851218    üéâüéâüéâ @ –°—Ç–∞–Ω—Ü–∏—è –º–µ—Ç—Ä–æ "–ü—É—à–∫–∏–Ω—Å–∫–∞—è" https://t....
742014    #–Ø–ß–æ–∫–Ω—É—Ç–∞—è–ü—Ç–∏—á–∫–∞ üòÜüòÜüòÜüôàüôâüôä @ –°—Ç–∞–Ω—Ü–∏—è –ú–µ—Ç—Ä–æ ...
573446    –ú–∏—Ö–∞–ª–∫–æ–≤ –≤ –º–µ—Ç—Ä–æ —Å—Ç–∞–Ω—Ü–∏–∏ –æ–±—ä—è–≤–ª—è–µ—Ç. –ú–µ—Ç—Ä–æ –º–æ–µ–π...
586347    –ß—Ç–æ –∑–∞ —Å—Ç—Ä–∞–Ω–Ω—ã–µ –≥–æ–ª–æ—Å–∞ —Ç–µ–ø–µ—Ä—å –æ–±—ä—è–≤–ª—è—é—Ç  —Å—Ç–∞–Ω—Ü...
689794         –•–æ—á—É —á—Ç–æ–±—ã

In [None]:
# Number of rows in the tweets DataFrame.
len(data)

In [74]:
# Same output as the Python 2 print statement, but also valid on Python 3.
print('Vocab Length:  %d' % len(interestingVocab))

Vocab Length:  15268


In [75]:
# NOTE(review): no visible cell creates a 'cleanText' column; on a fresh kernel
# this raises KeyError unless the column comes from the database table itself.
print(data['cleanText'][0])

Ребята, мы всем рады!


In [53]:
# Number of documents fed to the TF-IDF vectorizer (the list is named `trainingList`).
len(trainingList)

673685

In [76]:
from sparselsh import LSH
from scipy.sparse import csr_matrix
# Build an LSH index sized to the TF-IDF feature dimension, with one hash table.
# NOTE(review): the first positional argument is presumably the hash size and
# {"dict": None} presumably selects in-memory storage — confirm against sparselsh.
lsh = LSH(100, tfidf_matrix_train.shape[1], num_hashtables=1, storage_config={"dict": None})

In [None]:
# Add every TF-IDF row to the LSH index, one sparse row at a time, reporting
# progress after each insertion.
num_rows = tfidf_matrix_train.shape[0]
for ix in range(num_rows):
    x = tfidf_matrix_train.getrow(ix)
    lsh.index(x)
    progress(ix, num_rows)

6.470348%