# Tablas tweets

Acá haremos todas las tablas sobre los tweets pelados

In [1]:
%matplotlib inline
import os
import glob

corpora_dir = "~/projects/corpora/tweets_argentinos"

provincias = glob.glob(os.path.expanduser(corpora_dir)+"/**/")

In [2]:
import contrastes.processing

prov_dir = provincias[0]

jsons = glob.glob(os.path.join(prov_dir, "*.json"))


In [3]:
from nltk import FreqDist
from contrastes.processing import tokenize

def get_counters(tweets):
    """
    Returns a pair (fd, users) for a collection of tweets

    Parameters
    ----------
    tweets: List of dicts

    Returns:
    -------

    A pair (fd, users) where

    fd: nltk.FreqDist
        Occurrences of tokens

    users: defaultdict(set) of tokens -> users
        Dictionary containing users of given tokens
    """
    fd = FreqDist()
    users = set()
    
    for tweet in tweets:
        text = tweet['text']
        tokens = tokenize(text)
        for token in tokens:
            fd[token] += 1
            users.add(tweet['user']['id'])
    return fd, users, len(tweets)

def get_counters_from_file(json_path):
    with open(json_path) as f:
        tweets = json.load(f)
        return get_counters(tweets)
    

In [5]:
%%time
import json
import multiprocessing
from contextlib import closing

def get_province_data(jsons):
    with closing(multiprocessing.Pool(6, maxtasksperchild=1)) as pool:
        res = pool.map(get_counters_from_file, jsons)

    # join
    fd = FreqDist()
    users = set({})
    no_tweets = 0
    for _fd, _users, _no_tweets in res:
        fd += _fd
        no_tweets += _no_tweets
        users.update(_users)
    
    return fd, users, no_tweets

data = {}

for prov_path in provincias:
    
    province = prov_path.split("/")[-2]
    print(province)
    jsons = glob.glob(os.path.join(prov_path, "*.json"))
    fd, users, no_tweets = get_province_data(jsons)
    
    data[province] = {}
    data[province]["fd"] = fd
    data[province]["users"] = users
    data[province]["no_tweets"] = no_tweets
    
    

larioja
santiago
formosa
buenosaires
neuquen
santacruz
cordoba
entrerios
misiones
tierradelfuego
santafe
sanjuan
catamarca
chaco
rionegro
chubut
sanluis
corrientes
lapampa
jujuy
tucuman
mendoza
salta
CPU times: user 1min 38s, sys: 13.6 s, total: 1min 52s
Wall time: 51min 33s


In [6]:
import pandas as pd

name_mapping = {k:k.capitalize() for k in data}
name_mapping.update({
    "larioja": "La Rioja",
    "buenosaires": "Buenos Aires",
    "santacruz": "Santa Cruz",
    "cordoba": "Córdoba",
    "entrerios": "Entre Ríos",
    "tierradelfuego": "Tierra del Fuego",
    "santafe": "Santa Fe",
    "rionegro": "Río Negro",
    "sanluis": "San Luis",
    "lapampa": "La Pampa",
    "tucuman": "Tucumán",
})
df = []

for prov in data:
    fd = data[prov]["fd"]
    users = data[prov]["users"]
    
    df.append({
        "Province": name_mapping[prov],
        "#Users": len(users),
        "#Tokens": fd.N(),
        "#Tweets": data[prov]["no_tweets"],
        "Vocabulary": fd.B()
    })

df = pd.DataFrame(df)

df.set_index("Province", inplace=True)

In [7]:
df

Unnamed: 0_level_0,#Tokens,#Tweets,#Users,Vocabulary
Province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
La Rioja,21890340,2450000,2415,322089
Santiago,26399523,3550000,2445,311933
Formosa,24714608,3300000,2449,292224
Buenos Aires,30102152,3800000,2429,364167
Neuquen,30579937,3900000,2421,341846
Santa Cruz,25862320,3100000,2426,316864
Córdoba,32068788,4050000,2470,371096
Entre Ríos,32030398,4100000,2443,342930
Misiones,24648155,3200000,2458,275875
Tierra del Fuego,28073095,3350000,2442,357389


In [8]:
df.to_latex("../output/tables/dataset_info.tex")
df.to_csv("../output/tables/dataset_info.csv")

In [7]:
import pandas as pd

df = pd.read_csv("../output/tables/dataset_info.csv")

df.set_index("Province", inplace=True)

df

Unnamed: 0_level_0,#Tokens,#Tweets,#Users,Vocabulary
Province,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
La Rioja,21890340,2450000,2415,322089
Santiago,26399523,3550000,2445,311933
Formosa,24714608,3300000,2449,292224
Buenos Aires,30102152,3800000,2429,364167
Neuquen,30579937,3900000,2421,341846
Santa Cruz,25862320,3100000,2426,316864
Córdoba,32068788,4050000,2470,371096
Entre Ríos,32030398,4100000,2443,342930
Misiones,24648155,3200000,2458,275875
Tierra del Fuego,28073095,3350000,2442,357389


In [21]:
df_brief = pd.DataFrame({"Total": df.sum(), "Mean": df.mean(), "STD": df.std()})

df_brief

Unnamed: 0,Mean,STD,Total
#Tokens,28142320.0,3325680.0,647273381
#Tweets,3517391.0,457167.3,80900000
#Users,2447.609,19.47665,56295
Vocabulary,328614.6,26191.65,7558135


In [22]:
df_brief.to_csv("../output/tables/dataset_brief_info.csv")
df_brief.to_latex("../output/tables/dataset_brief_info.tex", columns=["Total", "Mean", "STD"])
