# Counting occurrences


En esta notebook veremos cómo contar las ocurrencias de cada palabra en los tweets y cuántos usuarios distintos la usan.

In [1]:
import json

# Load a single file of tweets. The context manager closes the file handle
# as soon as the JSON has been parsed instead of leaving it to the garbage collector.
with open("data/tweets/buenosaires/001.json") as tweets_file:
    tweets = json.load(tweets_file)

Contamos las ocurrencias. No sumamos los FreqDist entre sí (`fd1 + fd2`) porque es lento ;-) En su lugar actualizamos un único FreqDist token a token.

In [2]:

import re
import nltk
from nltk.tokenize import TweetTokenizer

# Pattern for Twitter's shortened links (http(s)://t.co/<id>).
# The dot is escaped so that only the literal host "t.co" matches.
urls = r'(?:https?://t\.co/\w+)'


# Tokenizer shared by every call to mytokenize. Configured not to preserve case,
# to shorten long runs of repeated characters and to strip @handles.
tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)


def mytokenize(text, only_alpha=True, remove_hashtags=True):
    """Split a tweet into tokens and filter/normalize them.

    The text is tokenized with the module-level ``tokenizer``. Then:

    * if ``only_alpha`` is true, only tokens for which ``str.isalpha()`` holds
      are kept (``remove_hashtags`` has no effect in this mode);
    * otherwise tokens starting with ``#`` are dropped when ``remove_hashtags``
      is true, and tokens matching the ``urls`` pattern are always dropped.

    Finally, any run of three or more identical characters inside a token is
    collapsed to exactly two.

    Returns the resulting list of token strings.
    """
    candidates = tokenizer.tokenize(text)

    if only_alpha:
        kept = [tk for tk in candidates if tk.isalpha()]
    else:
        kept = candidates
        if remove_hashtags:
            kept = [tk for tk in kept if tk[0] != "#"]
        kept = [tk for tk in kept if not re.match(urls, tk)]

    # "holaaaa" -> "holaa": squeeze elongated words to a canonical form.
    return [re.sub(r'(.)\1\1+', r'\1\1', tk) for tk in kept]


In [3]:
%%time
"""
Calculemos el freqdist 
"""

from collections import defaultdict, Counter

# Token -> number of occurrences across all tweets of the file.
fd = nltk.FreqDist()
# Token -> set of ids of the users who wrote it (a set, so each user counts once).
users = defaultdict(set)

for tweet in tweets:
    text = tweet['text']
    tokens = mytokenize(text)
    for token in tokens:
        users[token].add(tweet['user']['id'])
        # Increment in place rather than adding FreqDist objects together.
        fd[token] += 1

CPU times: user 4.05 s, sys: 311 µs, total: 4.05 s
Wall time: 4.05 s


Usando todas estas opciones de tokenización redujimos el vocabulario en alrededor de 10K palabras (¡cerca de un 30%!).

In [4]:
# fd.N() is the total number of token occurrences counted; fd.B() is the number of distinct tokens.
print("Cantidad de tokens = {}".format(fd.N()))
print("Cantidad de tokens únicos = {}".format(fd.B()))

Cantidad de tokens = 404321
Cantidad de tokens únicos = 27443


In [5]:
import pandas as pd

# One row per token: total occurrences and number of distinct users who used it.
# NOTE(review): the two column names use different separators ("_" vs "-"); confirm before relying on them.
df = pd.DataFrame({"bsas_occ": fd, "bsas-usuarios": {k:len(v) for k, v in users.items()}})

In [6]:
# (rows, columns) of the subset of tokens that occur more than 10 times.
df[df["bsas_occ"] > 10].shape

(3071, 2)

Ahora repetimos el conteo sobre todos los archivos de Buenos Aires...


In [7]:
import glob
import os
import json

def prov_name(path):
    """Return the last component of ``path`` once it has been normalized.

    Normalization removes any trailing separator first, so a directory path
    such as ``"data/tweets/buenosaires/"`` yields ``"buenosaires"``.
    """
    normalized = os.path.normpath(path)
    _, last_component = os.path.split(normalized)
    return last_component

# Province names are taken from the directory names matched under data/tweets/
# (the trailing "/" restricts the match to directories).
# NOTE: without recursive=True, "**" in glob behaves like a single "*".
provinces = [prov_name(path) for path in glob.glob("data/tweets/**/")]


path = "data/tweets/buenosaires/"

# All JSON files of the province; glob does not guarantee any particular order.
jsons = glob.glob(os.path.join(path, "*.json"))


In [8]:
%%time

import tracemalloc
import multiprocessing
import itertools
import pandas as pd
from collections import Counter, defaultdict

# Start tracing Python memory allocations so that a snapshot can be taken after the run.
tracemalloc.start()

def get_fd(json_path):
    """Count token occurrences and distinct users for one file of tweets.

    Parameters
    ----------
    json_path : str
        Path of a JSON file holding a list of tweets; each tweet is read
        through its ``'text'`` and ``'user'`` -> ``'id'`` fields.

    Returns
    -------
    tuple
        ``(fd, users)`` where ``fd`` is an ``nltk.FreqDist`` mapping each token
        to its number of occurrences and ``users`` is a ``defaultdict(set)``
        mapping each token to the set of user ids that used it.
    """
    print("file {}".format(json_path))
    # Close the file deterministically: this runs once per file inside worker
    # processes, so leaked handles would otherwise pile up.
    with open(json_path) as json_file:
        tweets = json.load(json_file)
    fd = nltk.FreqDist()
    users = defaultdict(set)

    for tweet in tweets:
        text = tweet['text']
        tokens = mytokenize(text)
        for token in tokens:
            fd[token] += 1
            users[token].add(tweet['user']['id'])
    return fd, users


def get_province_df(province_name, jsons, no_workers=4):
    """Build a per-token DataFrame of occurrence and distinct-user counts.

    Every path in ``jsons`` is handed to ``get_fd`` in a worker process; the
    per-file results are then merged in the calling process.

    Parameters
    ----------
    province_name : str
        Used in the progress message and as prefix of the two column names.
    jsons : list of str
        Paths of the tweet JSON files to process. May be empty, in which case
        an empty DataFrame with the two columns is returned.
    no_workers : int, optional
        Number of worker processes in the pool (default 4).

    Returns
    -------
    pandas.DataFrame
        Indexed by token, with columns ``"<province_name>_ocurrencias"``
        (total occurrences) and ``"<province_name>_usuarios"`` (number of
        distinct users that used the token).
    """
    print("Processing {}".format(province_name))

    # The context manager shuts the pool down once map() has returned, so
    # repeated calls do not accumulate worker processes.
    with multiprocessing.Pool(no_workers, maxtasksperchild=1) as pool:
        fds = pool.map(get_fd, jsons)

    fd = nltk.FreqDist()
    # Start from an empty mapping (not None) so an empty `jsons` still works.
    users = defaultdict(set)

    for (other_fd, users_freq) in fds:
        # In-place accumulation; all counts are positive so this equals `+=`.
        fd.update(other_fd)
        # Only the tokens present in this file need merging, and each one is
        # visited exactly once.
        for token, user_ids in users_freq.items():
            users[token] |= user_ids

    users_occurrences = {k: len(v) for k, v in users.items()}

    occurrences_column = "{}_ocurrencias".format(province_name)
    users_column = "{}_usuarios".format(province_name)

    print("Done. Building dataframe...")
    df = pd.DataFrame({occurrences_column: fd, users_column: users_occurrences})
    print("Done")
    return df

# Process every Buenos Aires file. NOTE: this rebinds `df`, replacing the single-file frame built earlier.
df = get_province_df("buenosaires", jsons)


Processing buenosaires
file data/tweets/buenosaires/029.json
file data/tweets/buenosaires/046.json
file data/tweets/buenosaires/024.json
file data/tweets/buenosaires/020.json
file data/tweets/buenosaires/038.json
file data/tweets/buenosaires/070.json
file data/tweets/buenosaires/055.json
file data/tweets/buenosaires/054.json
file data/tweets/buenosaires/052.json
file data/tweets/buenosaires/058.json
file data/tweets/buenosaires/014.json
file data/tweets/buenosaires/074.json
file data/tweets/buenosaires/004.json
file data/tweets/buenosaires/037.json
file data/tweets/buenosaires/050.json
file data/tweets/buenosaires/064.json
file data/tweets/buenosaires/044.json
file data/tweets/buenosaires/076.json
file data/tweets/buenosaires/047.json
file data/tweets/buenosaires/073.json
file data/tweets/buenosaires/008.json
file data/tweets/buenosaires/026.json
file data/tweets/buenosaires/019.json
file data/tweets/buenosaires/033.json
file data/tweets/buenosaires/034.json
file data/tweets/buenosaire

In [9]:
# Capture the allocations traced since tracemalloc.start() was called.
snapshot3 = tracemalloc.take_snapshot()

In [10]:
# Print the first ten entries of the snapshot statistics, grouped by source
# line and accumulated over the frames of each traceback.
for statistic in snapshot3.statistics('lineno', cumulative=True)[:10]:
    print(statistic)

/home/jmperez/.pyenv/versions/3.6.4/lib/python3.6/multiprocessing/connection.py:251: size=19.4 MiB, count=339766, average=60 B
/home/jmperez/.pyenv/versions/3.6.4/envs/contrastes/lib/python3.6/site-packages/pandas/core/internals.py:4821: size=5309 KiB, count=2, average=2655 KiB
/home/jmperez/.pyenv/versions/3.6.4/envs/contrastes/lib/python3.6/site-packages/pandas/core/common.py:384: size=2655 KiB, count=10, average=265 KiB
<frozen importlib._bootstrap_external>:487: size=84.9 KiB, count=1041, average=83 B
/home/jmperez/.pyenv/versions/3.6.4/lib/python3.6/threading.py:846: size=6936 B, count=6, average=1156 B
/home/jmperez/.pyenv/versions/3.6.4/envs/contrastes/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py:432: size=3400 B, count=15, average=227 B
/home/jmperez/.pyenv/versions/3.6.4/lib/python3.6/multiprocessing/pool.py:144: size=2748 B, count=11, average=250 B
/home/jmperez/.pyenv/versions/3.6.4/lib/python3.6/tempfile.py:150: size=2600 B, count=2, average=1300 B
/home/jmperez/.

## DataFrame de todas las provincias

In [None]:
%%time
# NOTE(review): `cfd` is not used anywhere in this cell; kept in case a later cell relies on it.
cfd = nltk.ConditionalFreqDist()

# Province name -> FreqDist of its tokens.
data = {}

for prov in provinces:
    # Only the files 001.json .. 004.json of each province are processed here.
    json_paths = ["data/tweets/{}/00{}.json".format(prov, i) for i in range(1, 5)]
    print("Procesando {}".format(prov))

    # One pool per province, shut down by the context manager so worker
    # processes do not accumulate across iterations.
    with multiprocessing.Pool(4) as pool:
        fds = pool.map(get_fd, json_paths)

    print("Sumando freqdists")

    fd = nltk.FreqDist()

    # get_fd returns (freqdist, users) pairs; only the frequency distribution
    # is needed here. Adding the whole tuple to `fd` would raise.
    for province_fd, _token_users in fds:
        fd.update(province_fd)

    data[prov] = fd
    


Procesando larioja
Sumando freqdists
Procesando santiago
Sumando freqdists
Procesando formosa
Sumando freqdists
Procesando buenosaires
Sumando freqdists
Procesando neuquen
Sumando freqdists
Procesando santacruz
Sumando freqdists
Procesando cordoba
Sumando freqdists
Procesando entrerios
Sumando freqdists
Procesando misiones
Sumando freqdists
Procesando tierradelfuego
Sumando freqdists
Procesando santafe
Sumando freqdists
Procesando sanjuan
Sumando freqdists
Procesando catamarca


In [None]:
# One column per province, one row per token (presumably NaN where a province never used the token — confirm).
pd.DataFrame(data)