In [6]:
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import operator
import collections
%matplotlib inline

#Getting the data
con = sqlite3.connect('tweetsSpring.db')
data = pd.read_sql("SELECT * from tweets where month = 5 and day != 26",con)
dataLast = pd.read_sql("SELECT * from tweets where month = 5 and day = 26",con)

In [11]:
#Progress
from sys import stdout
def progress(i, n):
    stdout.write("\r%f%%" % (i*100/float(n)))
    stdout.flush()
    if i == n-1:
        stdout.write("\r100%")
        print("\r\n")

In [13]:
import json
import re
import pymorphy2
import Stemmer

class TweetTextParser():

    def __init__(self):
        print 'Invoking Processor...'
        self.morph = pymorphy2.MorphAnalyzer()
        self.stemmer = Stemmer.Stemmer('russian')
        try:
            self.emo_db = json.load(open('pyalchemy/emoji_database','r'))
        except:
            print('No emoji database found')


    def processContents(self, myText):
        myText = re.sub(r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}     /)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))', '', myText)
        words = [word for word in re.findall(r'(?u)[@|#]?\w+', myText) if not word.startswith(('@','#'))]
        words = self.stemmer.stemWords(words)
        return words
        
    def resolveEmoji(self, myText):
        emostr = []
        emo_db = self.emo_db
        b = myText.encode('unicode_escape').split('\\')
        c = [point.replace('000','+').upper() for point in b if len(point) > 8 and point[0] == 'U']
        [emostr.append(emo_db[emo[:7]]) for emo in c if emo[:7] in emo_db]
        return myText

textProcessor = TweetTextParser()

Invoking Processor...


In [14]:
from time import time
morph = pymorphy2.MorphAnalyzer()
stemmer = Stemmer.Stemmer('russian')

#Cleaning up the data
t0 = time()
terms = []
n = len(data.index)
#n = 100000
for i in range(n):
    terms.append(textProcessor.processContents(data.content_lower[i]))
print time()-t0

31.3677601814


In [159]:
data['terms'] = terms[:]

In [160]:
for row in data.index[:10]: 
    for word in data['terms'][row]: print word

Ну
кто
ещ
попадет
на
инвентаризац
есл
не
я
пожалуйст
пуст
утр
все
эт
окажет
дурн
сном
Смотр
я
в
Москв
Сейчас
9
ясн
Утром
7
облачн
небольш
дожд
Днем
11
пасмурн
дожд
Вечер
Аа
ты
в
Москв
Немн
приятн
момент
Ну
взяв
тво
ручк
в
сво
несильн
сжима
да
да
я
в
Москв
Росс
ахаххах
аргументн
аргумент


In [22]:
from collections import Counter
import nltk

words = []
for i in range(n):
    words += terms[i]

fdist = nltk.FreqDist(words)
sortedDist = sorted(fdist.items(), key=operator.itemgetter(1),reverse=True)

sortedDist = [x for x in sortedDist if len(x[0]) > 2]
#allTheWords = [x for x in moreThan3 if x[1] > 1]
interestingVocab = [x[0] for x in sortedDist]
print 'Vocab Length: ', len(interestingVocab)

Vocab Length:  262391


In [23]:
for i in range(10):
    print interestingVocab[i]

что
так
как
Москв
мен
все
мне
был
теб
прост


In [24]:
#Find TF-IDF
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

trainingList = []
for i in range(n):
    trainingList.append(' '.join(data['terms'][i]))

tfidf_vectorizer = TfidfVectorizer(vocabulary = interestingVocab)
tfidf_matrix_train = tfidf_vectorizer.fit_transform(trainingList)  #finds the tfidf score with normalization

In [137]:
import time
def findSimilarTweets(queryTweet, data, threshold, maxNumber = 0, log = True):
    print 'Query:', queryTweet
    
    processedTweet = ' '.join(textProcessor.processContents(queryTweet))
    queryTweetRepresentation = tfidf_vectorizer.transform([processedTweet])
    
    start_time = time.time()
    cosine_similarities = cosine_similarity(queryTweetRepresentation, tfidf_matrix_train)[0]
    totalMatchingTweets = len(cosine_similarities[cosine_similarities>threshold])
    if maxNumber:
        totalMatchingTweets = min(totalMatchingTweets, maxNumber)
    indices = cosine_similarities.argsort()[::-1][:totalMatchingTweets]
    elapsed_time = time.time() - start_time
    if (log):
        print 'time: ', elapsed_time
        print ''
        print 'Results:'
        print data[indices]
        print ''
        print 'cosine scores ==>', cosine_similarities[indices]
    
    return indices

In [139]:
#before
inidiciesOfTweets = findSimilarTweets(u'в метро объявляют станции', data['content'], 0.5, log = False)

Query: в метро объявляют станции


In [156]:
from datetime import datetime
data['date'] = [datetime.strptime(data.created_at[i], "%Y-%m-%d %H:%M:%S").date() for i in data.index]
data['time'] = [datetime.strptime(data.created_at[i], "%Y-%m-%d %H:%M:%S").time() for i in data.index]

In [74]:
print 'Vocab Length: ', len(interestingVocab)

Vocab Length:  15268


In [75]:
print data['cleanText'][0]

Ребята, мы всем рады!


In [53]:
len(trainigList)

673685

In [2]:
from scipy.sparse import csr_matrix

print('hi')

hi


In [45]:
from sparselsh import LSH
from scipy.sparse import csr_matrix

t0 = time()
lsh = LSH( 100,
           tfidf_matrix_train.shape[1],
           num_hashtables=1,
           storage_config={"dict":None})
print time()-t0

NameError: name 'tfidf_matrix_train' is not defined

In [None]:
for ix in xrange(tfidf_matrix_train.shape[0]):
    x = tfidf_matrix_train.getrow(ix)
    lsh.index(x)
    progress(ix,tfidf_matrix_train.shape[0])

6.470348%

In [168]:
def calcMedian(lst):
    import numpy
    return numpy.median(numpy.array(lst))

def checkSpatialDensity(indices):
    
    lats = filter(lambda a: a != 37.619899, data['lat'][indices])
    lons = filter(lambda a: a != 55.753301, data['long'][indices])
    nf = len(lats)
    
    ltm = median(lats)
    lns = median(lons)
    
    cluster = []
    for i in indices:
        if data['lat'][i] == '37.619899' and data['long'][i] == '55.753301':
            pass
        else:
            
            dist = (data['lat'][i]-ltm)*(data['lat'][i]-ltm) + (data['long'][i]-lns)*(data['long'][i]-lns)
            if dist < 0.0001:
                cluster.append(i)
                
    print nf, len(cluster)
            
    if len(cluster) > 0.07 * nf:
        x = median(filter(lambda a: a != 37.619899, data['lat'][cluster]))
        y = median(filter(lambda a: a != 55.753301, data['long'][cluster]))
        print 'Found a spatial cluster with', len(cluster), 'points', y, x
    else:
        print 'No spatial clustering detected'

In [169]:
def median(lst):
    import numpy
    return numpy.median(numpy.array(lst))

def checkSpatialDensity(indices):
    
    lats = filter(lambda a: a != 37.619899, data['lat'][indices])
    lons = filter(lambda a: a != 55.753301, data['long'][indices])
    nf = len(lats)
    
    ltm = median(lats)
    lns = median(lons)
    
    cluster = []
    for i in indices:
        if data['lat'][i] == '37.619899' and data['long'][i] == '55.753301':
            pass
        else:
            
            dist = (data['lat'][i]-ltm)*(data['lat'][i]-ltm) + (data['long'][i]-lns)*(data['long'][i]-lns)
            if dist < 0.0001:
                cluster.append(i)
            
    if len(cluster) > 0.07 * nf:
        x = median(filter(lambda a: a != 37.619899, data['lat'][cluster]))
        y = median(filter(lambda a: a != 55.753301, data['long'][cluster]))
        print 'Found a spatial cluster with', len(cluster), 'points', y, x
    else:
        print 'No spatial clustering detected'
        
def checkTimeDensity(indices):
    cluster = []
    days = []
    times = []
    
    for item in indices:
        days.append(int(data['created_at'][item].split(' ')[0].split('-')[2]))
        times.append(float(data['created_at'][item].split(' ')[1][0:5].replace(':','.')))
        
    tm = median(times)
    dm = median(days)

    for item in indices:
        day = int(data['created_at'][item].split(' ')[0].split('-')[2])
        time = float(data['created_at'][item].split(' ')[1][0:5].replace(':','.'))
        if abs(day - dm) < 3:
            if abs(time - tm) < 3:
                cluster.append(item)
                
    for item in cluster:
        days.append(int(data['created_at'][item].split(' ')[0].split('-')[2]))
        times.append(float(data['created_at'][item].split(' ')[1][0:5].replace(':','.')))
    
    tm = str(median(times)).replace('.',':')
    dm = int(median(days))
                
    print 'Found a temporal cluster with', len(cluster), 'points', dm, tm
            

In [172]:
inidiciesOfTweets = findSimilarTweets(u'Bosco', data['content'], 0.5, log = False)
checkSpatialDensity(inidiciesOfTweets)
checkTimeDensity(inidiciesOfTweets)
neighBoors = data.loc[inidiciesOfTweets]

Query: Bosco
Found a spatial cluster with 47 points 55.77115 37.609414
Found a temporal cluster with 35 points 23 16:01


In [174]:
neighBoors.lat


384598    37.617680
82023     37.620712
157464    37.621531
634921    37.609414
590183    37.609414
585592    37.609414
611062    37.609414
632540    37.609414
618782    37.609414
580885    37.609414
585457    37.609414
629721    37.609414
634571    37.609414
583844    37.609414
628873    37.609414
583655    37.609414
618592    37.609414
618873    37.609414
586286    37.608567
583816    37.608276
579099    37.610418
587091    37.608567
582297    37.608754
582684    37.608626
586402    37.608567
582372    37.608654
582966    37.609414
588853    37.609377
583993    37.609414
588683    37.609414
588694    37.609414
636654    37.609414
635104    37.609414
586753    37.609414
629333    37.609414
584665    37.609414
640449    37.609414
635453    37.609414
634036    37.608228
633603    37.608228
631684    37.608228
633380    37.608228
628816    37.608299
626208    37.608828
633608    37.608228
626875    37.608299
631772    37.608228
587086    37.609414
583212    37.609414
599490    37.584960
