In [None]:
# Dependencies

import os
import io
import re
import json
import time
import unidecode
from glob import glob
from copy import deepcopy
from operator import itemgetter
from difflib import SequenceMatcher
from google.cloud import language
from google.cloud.language import enums
from google.cloud.language import types
from google.oauth2 import service_account
from google.protobuf.json_format import MessageToDict 

In [None]:
# General-purpose helpers

def get_config(key):
    """Return the value stored under ``key`` in ``config.json``.

    The file is resolved relative to the current working directory and is
    re-read on every call.

    Args:
        key: Top-level key to look up in the configuration.

    Returns:
        The value associated with ``key``, or ``None`` when the key is absent.
    """
    # The context manager closes the handle even if parsing fails; UTF-8 is
    # requested explicitly (like read_json/save_json) instead of relying on
    # the platform default encoding.
    with io.open('config.json', encoding='utf8') as config_file:
        config = json.load(config_file)
    if key in config:
        return config[key]
    return None

def read_json(filepath):
    """Load and parse a UTF-8 encoded JSON file.

    Args:
        filepath: Path of the JSON file to read.

    Returns:
        The decoded JSON content.
    """
    # Use a context manager so the handle is released even when parsing fails.
    with io.open(filepath, encoding='utf8') as json_file:
        return json.load(json_file)

def save_json(filepath, content):
    """Serialize ``content`` as JSON and write it to ``filepath`` in UTF-8.

    Missing parent directories are created. Non-ASCII characters are written
    as-is (``ensure_ascii=False``).

    Args:
        filepath: Destination path of the JSON file.
        content: JSON-serializable object to store.
    """
    folder = os.path.dirname(filepath)
    if folder:
        os.makedirs(folder, exist_ok=True)
    # Serialize before opening the file: if the content is not serializable,
    # an existing file is left untouched instead of being truncated.
    serialized = json.dumps(content, ensure_ascii=False)
    # The context manager guarantees the handle is closed even if the write fails.
    with io.open(filepath, 'w', encoding='utf8') as json_file:
        json_file.write(serialized)

In [None]:
# Helpers for working with the collected tweets

def list_users_obtained():
    """List the nicknames of every user that has a tweets folder.

    User folders live directly under ``data/tweets`` and are named
    ``<digits>@<nick>``; the part after the ``@`` is returned for each one.
    Folders whose name does not follow that pattern are skipped.

    Returns:
        A list of user nicknames (possibly empty).
    """
    users = []
    # Build the pattern with os.path.join so it works with any path separator;
    # the trailing empty component keeps the final separator, which makes glob
    # match directories only.
    pattern = os.path.join('data', 'tweets', '*', '')
    for folder in glob(pattern):
        folder_name = os.path.basename(os.path.dirname(folder))
        match = re.match(r'\d*@(.*)$', folder_name)
        if match:
            users.append(match.group(1))
    return users

def get_user_path(user_nick):
    """Return the tweets folder of ``user_nick``.

    Looks for a directory named ``<anything>@<user_nick>`` under
    ``data/tweets``.

    Args:
        user_nick: Nickname of the user (the part after ``@`` in the folder name).

    Returns:
        The path of the first matching folder, including its trailing path
        separator, or ``None`` when no folder matches.
    """
    # os.path.join keeps the pattern portable across path separators; the
    # trailing empty component restricts the match to directories and keeps
    # the trailing separator callers rely on when appending file patterns.
    pattern = os.path.join('data', 'tweets', '*@%s' % user_nick, '')
    folders = glob(pattern)
    return folders[0] if folders else None

def load_user_tweets(user_nick):
    """Lazily yield the parsed content of each JSON file in a user's tweets folder.

    Being a generator, each file is only read when the caller iterates up to
    it. Nothing is yielded when the user has no tweets folder.

    Args:
        user_nick: Nickname of the user whose tweets are loaded.

    Yields:
        The decoded content of one ``*.json`` file at a time.
    """
    user_folder = get_user_path(user_nick)
    if not user_folder:
        return
    for tweet_file in glob(user_folder + "*.json"):
        yield read_json(tweet_file)

def count_user_tweets(user_nick):
    """Count the ``*.json`` files stored in a user's tweets folder.

    Args:
        user_nick: Nickname of the user.

    Returns:
        The number of JSON files found, or 0 when the user has no folder.
    """
    user_folder = get_user_path(user_nick)
    if not user_folder:
        return 0
    return len(glob(user_folder + "*.json"))

def group_by_language(tweets):
    """Bucket tweets by the value of their ``'lang'`` key.

    Args:
        tweets: Iterable of tweet dicts, each providing a ``'lang'`` entry.

    Returns:
        A dict mapping each language code to the list of tweets using it,
        in order of appearance.
    """
    buckets = {}
    for tweet in tweets:
        buckets.setdefault(tweet['lang'], []).append(tweet)
    return buckets

def reduce_languages(tweets_by_language, languages):
    """Keep the selected languages and merge every other one under ``'others'``.

    Args:
        tweets_by_language: Dict mapping a language code to its list of tweets.
        languages: Collection of language codes to keep as separate entries.

    Returns:
        A new dict holding the kept languages plus, when any other language
        is present, an ``'others'`` entry with all remaining tweets.
    """
    reduced = {}
    for lang, lang_tweets in tweets_by_language.items():
        if lang in languages:
            reduced[lang] = lang_tweets
            continue
        reduced.setdefault('others', []).extend(lang_tweets)
    return reduced

def compact_tweets(tweets):
    """Join the text of several tweets into one string, one tweet per line.

    Newlines inside a tweet are replaced by spaces so each tweet occupies
    exactly one line; every tweet (including the last) ends with a newline.

    Args:
        tweets: Iterable of tweet dicts, each providing a ``'text'`` entry.

    Returns:
        The concatenated text, or an empty string when there are no tweets.
    """
    return "".join(tweet['text'].replace('\n', ' ') + '\n' for tweet in tweets)

In [None]:
# Helpers for working with the entities extracted from the tweets

def path_entities(user):
    """Return the path prefix of the entity files of ``user``.

    The prefix is the user's tweets folder with its ``tweets`` directory
    component swapped for ``entities`` and the trailing separator removed;
    callers append a suffix such as ``#ES.json`` to it.

    Args:
        user: Nickname of the user.

    Returns:
        The path prefix as a string.

    Raises:
        AttributeError: If the user has no tweets folder (``get_user_path``
            returned ``None``).
    """
    # count=1: only the first "tweets" (the directory component) is replaced,
    # so a nickname that itself contains "tweets" is left intact.
    return get_user_path(user).replace("tweets", "entities", 1)[:-1]

def user_processed(user):
    """Tell whether at least one entity file already exists for ``user``.

    Args:
        user: Nickname of the user.

    Returns:
        ``True`` when any ``<entities prefix>*.json`` file is found.
    """
    entity_files = glob(path_entities(user) + "*.json")
    return bool(entity_files)

def save_entities(entities, user, lang):
    """Store the entities of one language in ``<entities prefix>#<LANG>.json``.

    Args:
        entities: JSON-serializable entities payload.
        user: Nickname of the user the entities belong to.
        lang: Language code; it is upper-cased in the file name.
    """
    target = '%s#%s.json' % (path_entities(user), lang.upper())
    save_json(target, entities)

def load_entities(user):
    """Gather the entities stored in every entity file of ``user``.

    Files without an ``'entities'`` key contribute nothing.

    Args:
        user: Nickname of the user.

    Returns:
        A single list with the ``'entities'`` items of all the user's files.
    """
    collected = []
    for entity_file in glob(path_entities(user) + "*.json"):
        payload = read_json(entity_file)
        if 'entities' not in payload:
            continue
        collected.extend(payload['entities'])
    return collected

def relevant_entity(entity):
    """Decide whether an entity name is worth keeping as an interest.

    The entity is normalized (lower-cased, then transliterated with
    ``unidecode``) and rejected when it appears in ``entities_black.list``, or
    when it is shorter than three characters and does not appear in
    ``entities_white.list``. Both list files hold one entry per line and are
    read from the current working directory on every call.

    Args:
        entity: Entity name to evaluate.

    Returns:
        ``True`` when the entity should be kept, ``False`` otherwise.
    """
    def _normalize(text):
        # Single normalization shared by the entity and the list entries, so
        # that accented list entries can actually match the accent-stripped entity.
        return unidecode.unidecode(text.lower())

    def _load_list(list_path):
        # Context manager so the file handle is always released.
        with io.open(list_path, encoding='utf8') as list_file:
            return {_normalize(line) for line in list_file.read().splitlines()}

    entity = _normalize(entity)
    black_list = _load_list('entities_black.list')
    white_list = _load_list('entities_white.list')
    if entity in black_list:
        return False
    if len(entity) < 3 and entity not in white_list:
        return False
    return True

def entities_to_interests(entities):
    """Turn extracted entities into a ranked list of interests.

    Every mention text of every entity is counted; texts rejected by
    ``relevant_entity`` are dropped and the rest are sorted by descending
    count.

    Args:
        entities: Iterable of entity dicts, each with a ``'mentions'`` list
            whose items expose ``['text']['content']``.

    Returns:
        A list of ``{'entity': <text>, 'count': <occurrences>}`` dicts,
        most frequent first.
    """
    mention_counts = {}
    for entity in entities:
        for mention in entity['mentions']:
            mention_text = mention['text']['content']
            mention_counts[mention_text] = mention_counts.get(mention_text, 0) + 1
    relevant = [
        {'entity': mention_text, 'count': total}
        for mention_text, total in mention_counts.items()
        if relevant_entity(mention_text)
    ]
    relevant.sort(key=itemgetter('count'), reverse=True)
    return relevant

In [None]:
# Helpers for working with the interests derived from the entities

def path_interests(user):
    """Return the path of the interests JSON file of ``user``.

    The path is the user's tweets folder with its ``tweets`` directory
    component swapped for ``interests``, the trailing separator removed and
    ``.json`` appended.

    Args:
        user: Nickname of the user.

    Returns:
        The file path as a string.

    Raises:
        AttributeError: If the user has no tweets folder (``get_user_path``
            returned ``None``).
    """
    # count=1: only the first "tweets" (the directory component) is replaced,
    # so a nickname that itself contains "tweets" is left intact.
    return get_user_path(user).replace("tweets", "interests", 1)[:-1] + '.json'

def interests_parsed(user):
    """Tell whether the interests file of ``user`` already exists.

    Args:
        user: Nickname of the user.

    Returns:
        ``True`` when the interests file path matches an existing file.
    """
    return bool(glob(path_interests(user)))

def save_interests(interests, user):
    """Write the interests of ``user`` to their interests JSON file.

    Args:
        interests: JSON-serializable interests payload.
        user: Nickname of the user.
    """
    save_json(path_interests(user), interests)

def load_interests(user):
    """Read back the interests previously saved for ``user``.

    Args:
        user: Nickname of the user.

    Returns:
        The decoded content of the user's interests JSON file.
    """
    return read_json(path_interests(user))

def similar_interests(interest1, interest2):
    """Score how alike two interest names are.

    Both names are lower-cased and transliterated with ``unidecode`` before
    being compared with ``difflib.SequenceMatcher``.

    Args:
        interest1: First interest name.
        interest2: Second interest name.

    Returns:
        The ``SequenceMatcher`` ratio of the normalized names (1.0 means identical).
    """
    # Original author's notes: lower-casing adds a little overhead,
    # unidecode is somewhat slow and the ratio computation is very slow.
    normalized = [unidecode.unidecode(name.lower()) for name in (interest1, interest2)]
    return SequenceMatcher(None, normalized[0], normalized[1]).ratio()

def group_similar_interests(interests, threshold=0.75):
    """Merge interests whose names are similar, adding up their counts.

    Interests are visited in order. Each one is compared against every group
    built so far using ``similar_interests``; if the best match reaches
    ``threshold`` the interest's count is added to that group (which keeps
    its own entity name), otherwise the interest starts a new group. The
    input dicts are not modified: merged groups are deep copies.

    Args:
        interests: Iterable of ``{'entity': str, 'count': int}`` dicts.
        threshold: Minimum similarity for two interests to be merged.
            Defaults to 0.75.

    Returns:
        The list of groups sorted by descending ``'count'``.
    """
    groups = []
    for interest in interests:
        best_similarity = 0
        best_group = None
        for group in groups:
            similarity = similar_interests(interest['entity'], group['entity'])
            # Strict comparison: on ties the earliest group in the list wins.
            if similarity > best_similarity:
                best_similarity = similarity
                best_group = group
        if best_group is not None and best_similarity >= threshold:
            merged = deepcopy(best_group)
            merged['count'] = best_group['count'] + interest['count']
            # The merged group replaces the old one at the END of the list;
            # this ordering affects later tie-breaking and is kept on purpose.
            groups.remove(best_group)
            groups.append(merged)
        else:
            groups.append(interest)
    return sorted(groups, key=itemgetter('count'), reverse=True)

def remove_low_interest(interests):
    """Drop the interests that were counted only once (or less).

    Args:
        interests: Iterable of dicts with a ``'count'`` entry.

    Returns:
        A new list with the interests whose ``'count'`` is greater than 1,
        in their original order.
    """
    return [interest for interest in interests if interest['count'] > 1]

def normalize_weights(interests):
    """Add a ``'weight'`` to each interest, relative to the highest count.

    Each dict is updated in place with ``weight = count / max(count)``, so the
    most frequent interest gets weight 1.0.

    Args:
        interests: List of dicts with a ``'count'`` entry. May be empty.

    Returns:
        The same list object, with the ``'weight'`` entries filled in.
    """
    # An empty list is a valid input (e.g. every interest was filtered out
    # upstream); there is nothing to normalize and no first element to read.
    if not interests:
        return interests
    # Compute the real maximum instead of trusting the list to be sorted.
    max_weight = max(interest['count'] for interest in interests)
    for interest in interests:
        interest['weight'] = interest['count'] / max_weight
    return interests

In [None]:
# Helpers for talking to the Google Cloud API

def gcloud_api():
    """Create a Google Cloud Natural Language client.

    The service-account information is taken from the ``'gcloud'`` entry of
    the configuration file. A new client is built on every call.

    Returns:
        A ``language.LanguageServiceClient`` using those credentials.
    """
    credentials = service_account.Credentials.from_service_account_info(
        get_config('gcloud')
    )
    return language.LanguageServiceClient(credentials=credentials)

def extract_entities(text):
    """Run Google Cloud entity analysis over a plain-text string.

    Args:
        text: Plain text to analyze.

    Returns:
        The API response converted to a plain dict with ``MessageToDict``.
    """
    plain_text_document = types.Document(
        type=enums.Document.Type.PLAIN_TEXT,
        content=text,
    )
    api_response = gcloud_api().analyze_entities(plain_text_document)
    return MessageToDict(api_response)

In [None]:
# High-level functions for processing the user accounts

def summary_language_users():
    """Print, for every collected user, the share of tweets per language.

    One line is printed per user in the form
    ``@nick: (60%) es, (40%) en``, with languages ordered by descending
    percentage. Languages that round to 0% are omitted.
    """
    for user in list_users_obtained():
        tweets_total = count_user_tweets(user)
        tweets_by_lang = group_by_language(load_user_tweets(user))
        shares = []
        for lang, lang_tweets in tweets_by_lang.items():
            percent = int(round(100 * len(lang_tweets) / tweets_total, 0))
            if percent > 0:
                shares.append({'lang': lang, 'percent': percent})
        shares.sort(key=itemgetter('percent'), reverse=True)
        summary = ', '.join(
            '(%d%%) %s' % (share['percent'], share['lang']) for share in shares
        )
        print('@' + user + ': ' + summary)

def process_user_tweets(user, languages=('es', 'en')):
    """Extract entities and interests for one user and print a summary line.

    The work is done in two cached stages:

    1. If the user has no entity files yet, the tweets are grouped by
       language (everything outside ``languages`` is merged as ``'others'``),
       each group is sent to the entity-extraction API and the result is
       saved. A failure in one language is reported and does not stop the
       remaining ones.
    2. If the user has no interests file yet, the saved entities are turned
       into interests (counted, grouped by similarity, filtered and
       weighted) and saved; otherwise the saved interests are loaded.

    Args:
        user: Nickname of the user to process.
        languages: Language codes processed separately. Defaults to
            ``('es', 'en')``.
    """
    if not user_processed(user):
        tweets_by_lang = group_by_language(load_user_tweets(user))
        tweets_by_lang = reduce_languages(tweets_by_lang, languages)
        for lang in list(languages) + ['others']:
            try:
                # end='\r' rewrites the same console line as a progress indicator.
                print('@' + user + ': processing ' + lang.upper() + '                     ', end='\r')
                tweets_lang = tweets_by_lang.get(lang, [])
                entities_lang = extract_entities(compact_tweets(tweets_lang))
                save_entities(entities_lang, user, lang)
            except Exception:
                # Best effort per language, but let KeyboardInterrupt/SystemExit
                # propagate so a long run can still be interrupted.
                print('@' + user + ': bad language processing ' + lang.upper() + '        ', end='\r')
    if not interests_parsed(user):
        print('@' + user + ': parsing interests                      ', end='\r')
        entities = load_entities(user)
        interests = entities_to_interests(entities)
        interests = group_similar_interests(interests)
        interests = remove_low_interest(interests)
        # Filtering may leave nothing to weight; skip normalization in that case.
        if interests:
            interests = normalize_weights(interests)
        save_interests(interests, user)
    else:
        interests = load_interests(user)
    tweets_total = count_user_tweets(user)
    # Guard against a user folder that holds no tweets.
    density = round(len(interests) / tweets_total, 2) if tweets_total else 0.0
    print('@' + user + ': %d interests found (%.2f interests per tweet)' % (len(interests), density))

def process_users_tweets():
    """Run ``process_user_tweets`` for every user that has a tweets folder."""
    for user_nick in list_users_obtained():
        process_user_tweets(user_nick)

In [None]:
summary_language_users() # Print a summary of the languages each user tweets in

In [None]:
# Wall-clock start time, used to report how long the whole run takes.
t0 = time.time()

process_users_tweets() # Process the tweets of every collected user

# Elapsed time in seconds.
print(time.time()-t0, 'seconds')