In [None]:
# Dependencias

import io
import re
import os
import math
import json
import time
import tweepy
from glob import glob
from copy import deepcopy
from datetime import datetime

In [None]:
# Funciones generales

def get_config(key):
    """Return the value stored under ``key`` in ``config.json``.

    The file is looked up relative to the current working directory and is
    re-read on every call.

    Args:
        key: Top-level key to look up in the parsed JSON document.

    Returns:
        The value associated with ``key``, or ``None`` when the key is absent.
    """
    # The context manager closes the handle even when parsing fails;
    # JSON is decoded as UTF-8 explicitly, like the other JSON helpers of this file.
    with open('config.json', encoding='utf8') as config_file:
        config = json.load(config_file)
    if key in config:
        return config[key]
    return None

def read_json(filepath):
    """Load and return the JSON document stored in ``filepath``.

    Args:
        filepath: Path of a UTF-8 encoded JSON file.

    Returns:
        The decoded Python object.
    """
    # 'with' guarantees the handle is released even if decoding raises
    with io.open(filepath, encoding='utf8') as json_file:
        return json.load(json_file)

def save_json(filepath, content):
    """Serialize ``content`` as JSON into ``filepath`` (UTF-8, non-ASCII kept as is).

    Missing parent directories are created first.

    Args:
        filepath: Destination path of the JSON file.
        content: JSON-serializable object to store.

    Raises:
        TypeError: If ``content`` cannot be serialized; in that case the
            destination file is left untouched.
    """
    # Serialize before opening: opening in 'w' mode truncates the file, so a
    # serialization error must not be able to wipe an existing document.
    serialized = json.dumps(content, ensure_ascii=False)
    folder = os.path.dirname(filepath)
    if folder:
        os.makedirs(folder, exist_ok=True)
    with io.open(filepath, 'w', encoding='utf8') as json_file:
        json_file.write(serialized)

In [None]:
# Funciones de limpieza de texto

def parse_unescaped(text):
    """Turn the HTML entities ``&lt;``, ``&gt;`` and ``&amp;`` back into characters.

    Args:
        text: String possibly containing those three entities.

    Returns:
        The string with the entities replaced by ``<``, ``>`` and ``&``.
    """
    # '&amp;' goes last so that '&amp;lt;' ends up as '&lt;' rather than '<'
    replacements = (("&lt;", '<'), ("&gt;", '>'), ("&amp;", '&'))
    for entity, character in replacements:
        text = text.replace(entity, character)
    return text

def only_ascii(text):
    """Remove every character outside the range U+0000..U+00FF from ``text``.

    Despite the name, the whole extended (Latin-1) range is kept, so accented
    letters survive; emojis and other higher code points are dropped.

    Args:
        text: String to filter.

    Returns:
        The filtered string.
    """
    # The former pattern ended in '\xFFF', i.e. '\xFF' plus a stray literal 'F';
    # the matched set is the same, the intent is now explicit. No flags are
    # needed: the pattern uses neither anchors nor character categories.
    pattern = r'[^\x00-\xFF]' # extended ASCII
    return re.sub(pattern, '', text)

def quit_multiples(char, text):
    """Collapse every run of two or more consecutive ``char`` into a single one.

    ``char`` is always taken literally, so regex metacharacters such as
    ``^``, ``]`` or a backslash are safe to pass.

    Args:
        char: Character whose repetitions are collapsed. An empty string
            leaves ``text`` unchanged.
        text: String to process.

    Returns:
        The processed string.
    """
    if not char:
        return text # nothing to collapse; an empty character class would be invalid
    pattern = "[" + re.escape(char) + "]{2,}" # 2 or more consecutive occurrences
    # A callable replacement inserts char verbatim (no template escape processing)
    return re.sub(pattern, lambda _match: char, text)

def quit_excesses(text):
    """Collapse repeated spaces and repeated line breaks, then trim both ends.

    Args:
        text: String to tidy up.

    Returns:
        The tidied string, without leading or trailing whitespace.
    """
    for repeated in (' ', '\n'):
        text = quit_multiples(repeated, text)
    # strip() drops spaces and line breaks at the start and at the end
    return text.strip()

def clean_text(text):
    """Run the full text clean-up pipeline on ``text``.

    The steps are applied in this order: HTML entity unescaping, removal of
    characters rejected by ``only_ascii``, and whitespace tidying.

    Args:
        text: Raw string.

    Returns:
        The cleaned string.
    """
    unescaped = parse_unescaped(text)
    filtered = only_ascii(unescaped)
    return quit_excesses(filtered)

In [None]:
# Funciones para parsear lo que devuelve la API de Twitter

def expand_mentions(tweet_raw):
    """Replace each ``@screen_name`` in the tweet text with the user's display name.

    Only the mentions listed in ``tweet_raw['entities']['user_mentions']`` are
    expanded; if the mentioned account no longer exists or has changed its
    nick, the mention cannot be expanded. Matching is case-insensitive and the
    display name is inserted verbatim.

    Args:
        tweet_raw: Tweet dictionary with ``text`` and
            ``entities.user_mentions`` (each item providing ``screen_name``
            and ``name``). It is modified in place.

    Returns:
        The same ``tweet_raw`` dictionary.
    """
    for mention in tweet_raw['entities']['user_mentions']:
        # (?!\w) prevents '@ann' from matching the beginning of '@anna'
        pattern = '@' + re.escape(mention['screen_name']) + r'(?!\w)'
        display_name = mention['name']
        # A callable replacement keeps backslashes of the name literal
        tweet_raw['text'] = re.sub(pattern, lambda _match, name=display_name: name, tweet_raw['text'], flags=re.IGNORECASE)
    return tweet_raw

def quit_media(tweet_raw):
    """Remove from the tweet text the URL of every attached media item.

    Tweets whose ``entities`` carry no ``media`` key are returned untouched.

    Args:
        tweet_raw: Tweet dictionary with ``text`` and ``entities``. It is
            modified in place.

    Returns:
        The same ``tweet_raw`` dictionary.
    """
    for attachment in tweet_raw['entities'].get('media', ()):
        tweet_raw['text'] = tweet_raw['text'].replace(attachment['url'], '')
    return tweet_raw

def quit_urls(tweet_raw):
    """Remove from the tweet text every URL listed in ``entities.urls``.

    Args:
        tweet_raw: Tweet dictionary with ``text`` and ``entities.urls`` (each
            item providing ``url``). It is modified in place.

    Returns:
        The same ``tweet_raw`` dictionary.
    """
    for link in tweet_raw['entities']['urls']:
        shortened = link['url']
        tweet_raw['text'] = tweet_raw['text'].replace(shortened, '')
    return tweet_raw

def clean_entities(tweet_raw):
    """Apply all entity-based clean-ups to a raw tweet.

    Mentions are expanded first, then media URLs and finally regular URLs
    are removed from the text.

    Args:
        tweet_raw: Raw tweet dictionary; it is passed through each step in turn.

    Returns:
        The tweet dictionary returned by the last step.
    """
    for step in (expand_mentions, quit_media, quit_urls):
        tweet_raw = step(tweet_raw)
    return tweet_raw

def clean_tweet(tweet_parsed):
    """Clean the text and the author's name of an already parsed tweet.

    Emojis and other unusual characters are removed, as well as the leftover
    whitespace produced by the previous clean-ups.

    Args:
        tweet_parsed: Dictionary with ``text`` and ``user.name``. It is
            modified in place.

    Returns:
        The same ``tweet_parsed`` dictionary.
    """
    tweet_parsed['text'] = clean_text(tweet_parsed['text'])
    author = tweet_parsed['user']
    author['name'] = clean_text(author['name'])
    return tweet_parsed

def parse_tweet(tweet):
    """Convert an API status object into the compact dictionary stored on disk.

    The identifier, date, language and user always come from the status
    itself; for a retweet, the text and the reply flag come from the original
    (retweeted) status.

    Args:
        tweet: Object exposing the raw API payload as ``tweet._json``.

    Returns:
        A dictionary with the keys ``retweet``, ``reply``, ``id``, ``date``
        (``dd-mm-YYYY HH:MM:SS``), ``lang``, ``text`` and ``user``
        (``id``, ``nick``, ``name``), after passing through ``clean_tweet``.
    """
    raw = deepcopy(tweet._json) # Work on a copy of the payload, not on a reference to it
    is_retweet = 'retweeted_status' in raw
    parsed = {
        'retweet': is_retweet,
        'reply': False,
        'id': raw['id'],
        'date': datetime.strptime(raw['created_at'], '%a %b %d %H:%M:%S +0000 %Y').strftime('%d-%m-%Y %H:%M:%S'),
        'lang': raw['lang'],
    }
    author = raw['user']
    user = {'id': author['id'], 'nick': author['screen_name'], 'name': author['name']}
    # For retweets, keep working with the original tweet
    source = raw['retweeted_status'] if is_retweet else raw
    if source['in_reply_to_status_id'] or source['in_reply_to_user_id']:
        parsed['reply'] = True
    if 'full_text' in source:
        source['text'] = source['full_text'] # Normalize extended payloads to 'text'
    source = clean_entities(source)
    parsed['text'] = source['text']
    parsed['user'] = user
    return clean_tweet(parsed)

def parse_account(account):
    """Extract the stored subset of fields from an API user object.

    Args:
        account: Object exposing the raw API payload as ``account._json``.

    Returns:
        A dictionary with the keys ``id``, ``name``, ``nick``, ``created``
        (``YYYY-mm-dd``), ``language``, ``protected``, ``verified``,
        ``followers``, ``following``, ``tweets`` and ``favourites``.
    """
    raw = account._json
    return {
        'id': raw['id'],
        'name': raw['name'],
        'nick': raw['screen_name'],
        'created': datetime.strptime(raw['created_at'], '%a %b %d %H:%M:%S +0000 %Y').strftime('%Y-%m-%d'),
        'language': raw['lang'],
        'protected': raw['protected'],
        'verified': raw['verified'],
        'followers': raw['followers_count'],
        'following': raw['friends_count'],
        'tweets': raw['statuses_count'],
        'favourites': raw['favourites_count'],
    }

In [None]:
# Funciones para trabajar en local con Twitter

def users_to_obtain():
    """Read the nicks to collect from ``users.list`` (one per line, UTF-8).

    The file is looked up relative to the current working directory. Blank
    lines are skipped and surrounding whitespace is removed, so a trailing
    empty line does not produce an empty user name.

    Returns:
        A list with the non-empty lines of the file.
    """
    with io.open('users.list', encoding='utf8') as users_file:
        lines = users_file.read().splitlines()
    stripped = (line.strip() for line in lines)
    return [nick for nick in stripped if nick]

def path_tweet(tweet):
    """Build the relative path under which a tweet is stored.

    The path has the shape ``<user id>@<nick>/<date>_<tweet id>.json`` and is
    joined with the separator of the running platform (a backslash on
    Windows, as before).

    Args:
        tweet: Status object exposing ``user.id_str``, ``user.screen_name``,
            ``id_str`` and a ``created_at`` datetime.

    Returns:
        The relative path as a string.
    """
    author = tweet.user
    folder = "%s@%s" % (author.id_str, author.screen_name)
    stamp = tweet.created_at.strftime('%Y-%m-%d_%H.%M.%S')
    filename = "%s_%s.json" % (stamp, tweet.id_str)
    return os.path.join(folder, filename)

def save_tweet(tweet):
    """Parse a status object and store it as JSON under ``data/tweets``.

    The destination is built with the platform's path separator instead of a
    hard-coded backslash.

    Args:
        tweet: Status object accepted by ``path_tweet`` and ``parse_tweet``.
    """
    filepath = os.path.join('data', 'tweets', path_tweet(tweet))
    tweet_parsed = parse_tweet(tweet)
    save_json(filepath, tweet_parsed)

def save_tweets(tweets):
    """Store every status of ``tweets`` with ``save_tweet``, in iteration order.

    Args:
        tweets: Iterable of status objects.
    """
    for status in tweets:
        save_tweet(status)

def save_account(account):
    """Parse a user object and store it as ``data/accounts/<id>@<nick>.json``.

    The destination is built with the platform's path separator instead of a
    hard-coded backslash.

    Args:
        account: User object accepted by ``parse_account``.
    """
    account_parsed = parse_account(account)
    filename = '%d@%s.json' % (account_parsed['id'], account_parsed['nick'])
    save_json(os.path.join('data', 'accounts', filename), account_parsed)

def list_users_obtained():
    """List the nicks that already have a folder under ``data/tweets``.

    Folders are expected to be named ``<user id>@<nick>``; any folder that
    does not follow that shape is ignored.

    Returns:
        A list of nicks, in the order in which ``glob`` yields the folders.
    """
    users = []
    # The trailing empty component makes the pattern match directories only
    for folder in glob(os.path.join("data", "tweets", "*", "")):
        folder_name = os.path.basename(os.path.normpath(folder))
        match = re.fullmatch(r'\d*@(.*)', folder_name)
        if match:
            users.append(match.group(1))
    return users

def get_user_path(user_nick):
    """Find the folder that stores the tweets of ``user_nick``.

    Args:
        user_nick: Nick of the user, as used in the ``<id>@<nick>`` folder name.

    Returns:
        The first matching folder under ``data/tweets``, ending with the
        platform's path separator, or ``None`` when there is none.
    """
    # The trailing empty component keeps the separator at the end of the
    # result, which callers rely on when appending "*.json"
    pattern = os.path.join("data", "tweets", "*@%s" % user_nick, "")
    folders = glob(pattern)
    return folders[0] if folders else None

def load_user_tweets(user_nick):
    """Lazily yield the stored tweets of ``user_nick``.

    Each JSON file is read only when the generator reaches it, so the whole
    collection is never held in memory at once. Nothing is yielded when the
    user has no folder.

    Args:
        user_nick: Nick of the user.

    Yields:
        One parsed tweet dictionary per stored file.
    """
    folder = get_user_path(user_nick)
    if not folder:
        return
    for path in glob(folder+"*.json"):
        yield read_json(path)

def count_user_tweets(user_nick):
    """Count the tweet files stored for ``user_nick``.

    Args:
        user_nick: Nick of the user.

    Returns:
        The number of ``*.json`` files in the user's folder, or 0 when the
        user has no folder.
    """
    folder = get_user_path(user_nick)
    return len(glob(folder+"*.json")) if folder else 0

def last_id_obtained(user_nick):
    """Return the id of the most recent tweet stored for ``user_nick``.

    Args:
        user_nick: Nick of the user.

    Returns:
        The ``id`` field of the newest stored tweet, or ``None`` when the
        user has no folder or the folder holds no tweets.
    """
    folder = get_user_path(user_nick)
    if not folder:
        return None
    files = glob(folder+"*.json")
    if not files:
        return None
    # glob() gives no ordering guarantee, so the last element is not
    # necessarily the newest file. File names written by this notebook start
    # with the tweet date (YYYY-mm-dd_HH.MM.SS), so the greatest name is the
    # newest tweet.
    return read_json(max(files))['id']

In [None]:
# Funciones para interactuar con la API de Twitter

def tw_api():
    """Create a tweepy API client from the ``twitter`` section of the config.

    The section must provide ``consumer_key``, ``consumer_secret``,
    ``access_key`` and ``access_secret``.

    Returns:
        A ``tweepy.API`` instance using OAuth with those credentials.
    """
    keys = get_config("twitter")
    handler = tweepy.OAuthHandler(keys['consumer_key'], keys['consumer_secret'])
    handler.set_access_token(keys['access_key'], keys['access_secret'])
    return tweepy.API(handler)

# The API allows 75 calls to this method in each 15-minute window
def check_auth():
    """Tell whether the configured credentials are accepted by the API.

    Returns:
        ``True`` when ``verify_credentials`` returns a truthy response,
        ``False`` when it returns a falsy one or any error is raised while
        building the client or calling the API.
    """
    try:
        return bool(tw_api().verify_credentials())
    except Exception:
        # Any failure counts as "not authenticated"; KeyboardInterrupt and
        # SystemExit are deliberately left to propagate.
        return False

# The API allows 180 calls to this method in each 15-minute window
def remaining_limits():
    """Fetch the current rate limits of the endpoints used by this notebook.

    Returns:
        A dictionary with the keys ``credentials``, ``limits``, ``users`` and
        ``tweets``; each value is ``{'requests': <remaining calls>,
        'time': <seconds until the window resets, rounded up>}``.
    """
    resources = tw_api().rate_limit_status()['resources']
    raw_limits = {
        'credentials': resources['account']['/account/verify_credentials'],
        'limits': resources['application']['/application/rate_limit_status'],
        'users': resources['users']['/users/show/:id'],
        'tweets': resources['statuses']['/statuses/user_timeline'],
    }
    return {
        name: {
            'requests': entry['remaining'],
            # 'reset' is compared against the current epoch time -> seconds left
            'time': math.ceil(entry['reset'] - datetime.now().timestamp()),
        }
        for name, entry in raw_limits.items()
    }

def wait_for_limits(requests_to_do=0):
    """Compute how long to wait before issuing ``requests_to_do`` timeline calls.

    A wait is needed when the timeline endpoint has fewer remaining calls
    than requested, or when any tracked endpoint has no calls left. If
    several limits require waiting, the longest reset time is used so that
    all of them have been renewed afterwards.

    Args:
        requests_to_do: Number of timeline requests about to be made.

    Returns:
        Seconds to wait (reset time plus a 5 second margin, never negative),
        or 0 when no limit would be exceeded.
    """
    margin = 5 # seconds
    current_limits = remaining_limits()
    pending = []
    # Check that the planned timeline requests fit in the current window
    timeline = current_limits['tweets']
    if timeline['requests'] - requests_to_do < 0:
        pending.append(timeline['time'])
    # Check that no other limit is already exhausted
    pending.extend(limit['time'] for limit in current_limits.values() if limit['requests'] == 0)
    if not pending:
        return 0 # No limit is exceeded, so there is nothing to wait for
    # A reset time already in the past must not yield a negative wait
    return max(0, max(pending) + margin)

# The API allows 900 calls to this method in each 15-minute window
def get_user(user_nick=None, user_id=None):
    """Fetch a user from the API by nick or by id.

    Args:
        user_nick: Screen name of the user, if known.
        user_id: Numeric id of the user, if known.

    Returns:
        The user object returned by the API, or ``None`` when the lookup
        fails for any reason.
    """
    try:
        return tw_api().get_user(screen_name=user_nick, user_id=user_id)
    except Exception:
        # Best effort: lookup errors are reported as "no user";
        # KeyboardInterrupt and SystemExit still propagate.
        return None

def user_exists(user_nick=None, user_id=None):
    """Tell whether a user can be fetched, storing its account data if so.

    Args:
        user_nick: Screen name of the user, if known.
        user_id: Numeric id of the user, if known.

    Returns:
        ``True`` when the user was found and its account information was
        saved; ``False`` when the user was not found or saving failed.
    """
    try:
        account = get_user(user_nick, user_id)
        if not account:
            return False
        save_account(account) # Take the chance to store the relevant information
        return True
    except Exception:
        # As before, a failure while saving is reported as False;
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        return False

# The API allows 900 calls to this method in each 15-minute window
# 200 is the maximum number of tweets that can be requested per call
# Fewer tweets than requested are returned when there are retweets of private or deleted accounts
# This method can only reach the last 3200 tweets of a user (retweets and replies included)
# With tweet_mode='extended' the tweet comes in full_text, not truncated to 140 characters
def get_user_tweets(user_nick=None, user_id=None, max_id=None, min_id=None, include_rts=True, include_replies=True):
    """Download and store a user's timeline, page by page.

    Pages are requested going backwards in time until an empty page is
    returned; every page is saved with ``save_tweets`` and a progress line is
    printed after each request.

    Args:
        user_nick: Screen name of the user, if known.
        user_id: Numeric id of the user, if known.
        max_id: Only fetch tweets with an id up to this one.
        min_id: Only fetch tweets with an id greater than this one.
        include_rts: Whether retweets are requested.
        include_replies: Whether replies are requested.

    Returns:
        The number of tweets obtained.
    """
    api = tw_api()
    total = 0
    request_number = 0
    while True:
        request_number += 1
        batch = api.user_timeline(screen_name=user_nick, user_id=user_id, count=200, max_id=max_id, since_id=min_id, include_rts=include_rts, exclude_replies=(not include_replies), tweet_mode='extended')
        save_tweets(batch)
        total += len(batch)
        print("+%d = %d obtained tweets (request %d)" % (len(batch), total, request_number), end='\r') #log
        if len(batch) == 0:
            break
        # The next page must end right before the oldest tweet of this one
        max_id = batch[-1].id - 1
    return total

In [None]:
# Funciones de alto nivel para trabajar con las cuentas de Twitter

def collect_user(user):
    """Collect the new tweets of one user and print the resulting totals.

    If the user cannot be fetched, a message is printed and nothing else is
    done. Otherwise the function waits, printing a countdown, for as long as
    ``wait_for_limits`` requires, and then downloads the tweets newer than
    the last one stored locally.

    Args:
        user: Nick of the user to collect.
    """
    if not user_exists(user_nick=user):
        print('@'+user, "doesn't seem to exist")
        return
    seconds_left = wait_for_limits(18) # Maximum number of requests per user
    while seconds_left > 0:
        print("Waiting %d seconds to limits reset  " % seconds_left, end='\r')
        time.sleep(1)
        seconds_left -= 1
    newest_stored = last_id_obtained(user)
    stored_before = count_user_tweets(user)
    obtained = get_user_tweets(user_nick=user, min_id=newest_stored)
    print("+%d = %d obtained tweets in total from user @%s" % (obtained, obtained+stored_before, user))

def collect_users(users):
    """Collect the tweets of every user in ``users`` if the credentials are valid.

    When authentication fails, a message is printed and no user is processed.

    Args:
        users: Iterable of user nicks.
    """
    if not check_auth():
        print("Invalid credentials, revise them")
        return
    for nick in users:
        collect_user(nick)

def _percent(part, whole):
    """Return ``part`` as a rounded percentage of ``whole``, or 0 when ``whole`` is 0."""
    if not whole:
        return 0
    return round(100*part/whole, 0)

def summary_users():
    """Print, for every collected user, a breakdown of the stored tweets.

    Tweets are split into retweets and own tweets, and own tweets into
    replies and publications. Percentages of retweets and own tweets are
    relative to the total; percentages of replies and publications are
    relative to the own tweets. Users with no tweets, or with retweets only,
    are reported with 0% instead of failing on a division by zero.
    """
    for user in list_users_obtained():
        tweets_total = count_user_tweets(user)
        tweets_retweet = 0
        tweets_reply = 0
        for tweet in load_user_tweets(user):
            if tweet['retweet']:
                tweets_retweet += 1
            elif tweet['reply']:
                # Replies are only counted among the user's own tweets
                tweets_reply += 1
        own_tweets = tweets_total - tweets_retweet
        tweets_publication = own_tweets - tweets_reply
        percent_retweet = _percent(tweets_retweet, tweets_total)
        percent_own = _percent(own_tweets, tweets_total)
        percent_reply = _percent(tweets_reply, own_tweets)
        percent_publication = _percent(tweets_publication, own_tweets)
        print("@%s: %d tweets [(%d%%) %d retweets, (%d%%) %d own tweets [(%d%%) %d replies, (%d%%) %d publications]]" % (user, tweets_total, percent_retweet, tweets_retweet, percent_own, own_tweets, percent_reply, tweets_reply, percent_publication, tweets_publication))

In [None]:
# Time the whole collection run
t0 = time.time()

users = users_to_obtain() # Load the list with the selected users
collect_users(users) # Collect the tweets of each user in the list

print(time.time()-t0, 'seconds')

In [None]:
summary_users() # Print a summary of the tweets obtained from each user