In [6]:
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
import collections
%matplotlib inline

#Getting the data
con = sqlite3.connect('tweetsSpring.db')
data = pd.read_sql("SELECT * from tweets where month = 5 and day != 26",con)
dataLast = pd.read_sql("SELECT * from tweets where month = 5 and day = 26",con)

In [11]:
#Progress
from sys import stdout
def progress(i, n):
    stdout.write("\r%f%%" % (i*100/float(n)))
    stdout.flush()
    if i == n-1:
        stdout.write("\r100%")
        print("\r\n")

In [13]:
import json
import re
import pymorphy2
import Stemmer

class TweetTextParser():

    def __init__(self):
        print 'Invoking Processor...'
        self.morph = pymorphy2.MorphAnalyzer()
        self.stemmer = Stemmer.Stemmer('russian')
        try:
            self.emo_db = json.load(open('pyalchemy/emoji_database','r'))
        except:
            print('No emoji database found')


    def processContents(self, myText):
        myText = re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}     /)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', myText)
        words = [word for word in re.findall(r'(?u)[@|#]?\w+', myText) if not word.startswith(('@','#'))]
        words = self.stemmer.stemWords(words)
        return words
        
    def resolveEmoji(self, myText):
        emostr = []
        emo_db = self.emo_db
        b = myText.encode('unicode_escape').split('\\')
        c = [point.replace('000','+').upper() for point in b if len(point) > 8 and point[0] == 'U']
        [emostr.append(emo_db[emo[:7]]) for emo in c if emo[:7] in emo_db]
        return myText

textProcessor = TweetTextParser()

Invoking Processor...


In [14]:
from time import time
morph = pymorphy2.MorphAnalyzer()
stemmer = Stemmer.Stemmer('russian')

#Cleaning up the data
t0 = time()
terms = []
n = len(data.index)
#n = 100000
for i in range(n):
    terms.append(textProcessor.processContents(data.content_lower[i]))
print time()-t0

31.3677601814


In [15]:
data['terms'] = terms[:]

In [16]:
for row in data.index[:10]: 
    for word in data['terms'][row]: print word

Ну
кто
ещ
попадет
на
инвентаризац
есл
не
я
пожалуйст
пуст
утр
все
эт
окажет
дурн
сном
Смотр
я
в
Москв
Сейчас
9
ясн
Утром
7
облачн
небольш
дожд
Днем
11
пасмурн
дожд
Вечер
Аа
ты
в
Москв
Немн
приятн
момент
Ну
взяв
тво
ручк
в
сво
несильн
сжима
да
да
я
в
Москв
Росс
ахаххах
аргументн
аргумент


In [22]:
from collections import Counter
import nltk

words = []
for i in range(n):
    words += terms[i]

fdist = nltk.FreqDist(words)
sortedDist = sorted(fdist.items(), key=operator.itemgetter(1),reverse=True)

sortedDist = [x for x in sortedDist if len(x[0]) > 2]
#allTheWords = [x for x in moreThan3 if x[1] > 1]
interestingVocab = [x[0] for x in sortedDist]
print 'Vocab Length: ', len(interestingVocab)

Vocab Length:  262391


In [23]:
for i in range(10):
    print interestingVocab[i]

что
так
как
Москв
мен
все
мне
был
теб
прост


In [24]:
#Find TF-IDF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

trainingList = []
for i in range(n):
    trainingList.append(' '.join(data['terms'][i]))

tfidf_vectorizer = TfidfVectorizer(vocabulary = interestingVocab)
tfidf_matrix_train = tfidf_vectorizer.fit_transform(trainingList)  #finds the tfidf score with normalization

In [137]:
import time
def findSimilarTweets(queryTweet, data, threshold, maxNumber = 0, log = True):
    print 'Query:', queryTweet
    
    processedTweet = ' '.join(textProcessor.processContents(queryTweet))
    queryTweetRepresentation = tfidf_vectorizer.transform([processedTweet])
    
    start_time = time.time()
    cosine_similarities = cosine_similarity(queryTweetRepresentation, tfidf_matrix_train)[0]
    totalMatchingTweets = len(cosine_similarities[cosine_similarities>threshold])
    if maxNumber:
        totalMatchingTweets = min(totalMatchingTweets, maxNumber)
    indices = cosine_similarities.argsort()[::-1][:totalMatchingTweets]
    elapsed_time = time.time() - start_time
    if (log):
        print 'time: ', elapsed_time
        print ''
        print 'Results:'
        print data[indices]
        print ''
        print 'cosine scores ==>', cosine_similarities[indices]
    
    return indices

In [139]:
#before
inidiciesOfTweets = findSimilarTweets(u'в метро объявляют станции', data['content'], 0.5, log = False)

Query: в метро объявляют станции


In [133]:
data.created_at[inidiciesOfTweets]

208048    2015-05-09 11:24:42
82540     2015-05-04 14:26:25
47704     2015-05-03 04:36:03
103067    2015-05-05 09:15:06
51625     2015-05-03 10:02:38
508327    2015-05-20 15:41:42
30541     2015-05-02 11:12:48
222745    2015-05-09 20:55:48
107291    2015-05-05 12:40:51
111956    2015-05-05 15:51:45
Name: created_at, dtype: object

In [74]:
print 'Vocab Length: ', len(interestingVocab)

Vocab Length:  15268


In [75]:
print data['cleanText'][0]

Ребята, мы всем рады!


In [53]:
len(trainigList)

673685

In [2]:
from scipy.sparse import csr_matrix

print('hi')

hi


In [45]:
from sparselsh import LSH
from scipy.sparse import csr_matrix

t0 = time()
lsh = LSH( 100,
           tfidf_matrix_train.shape[1],
           num_hashtables=1,
           storage_config={"dict":None})
print time()-t0

NameError: name 'tfidf_matrix_train' is not defined

In [None]:
for ix in xrange(tfidf_matrix_train.shape[0]):
    x = tfidf_matrix_train.getrow(ix)
    lsh.index(x)
    progress(ix,tfidf_matrix_train.shape[0])

6.470348%

In [None]:
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections

# Dimension of our vector space
dimension = 478524

# Create a random binary hash with 10 bits
rbp = RandomBinaryProjections('rbp', 1000)

# Create engine with pipeline configuration
engine = Engine(dimension, lshashes=[rbp])

# Index 1000000 random vectors (set their data to a unique string)
for index in range(673685):
    v = tfidf_matrix_train[index:index+1]
    engine.store_vector(v, 'data_%d' % index)
    progress(index,673685)

# Create random query vector
query = numpy.random.randn(dimension)

# Get nearest neighbours
N = engine.neighbours(query)