In [1]:
%config IPCompleter.use_jedi=False

In [2]:
%load_ext autoreload
%autoreload 2

# Tweepy experiments

In [3]:
import tweepy, json, sys
import networkx as nx
import pandas as pd
import numpy as np

In [4]:
sys.path.insert(0,"twittercrawler/egonet/")
from agents import *

## Load parameters

In [5]:
query = {
    "screen_name":"ferencberes91", 
    #"screen_name":"Istvan_A_Seres", 
    #"screen_name":"benrozemberczki",
    #"screen_name":"GodHatesNFTees",
    #"screen_name":"pxRobos",
    "tweet_mode":'extended'
}

In [6]:
with open("api_key.json") as f:
    credentials = json.load(f)

In [7]:
user_agent = LookupAgent(credentials, user_mode=True)

In [8]:
status_agent = LookupAgent(credentials, user_mode=False)

## Query followers and friends

friend_agent = QueryUserConnections(credentials)
follow_agent = QueryUserConnections(credentials, query_friends=False)

friend_ids = friend_agent.execute(query["screen_name"])
len(friend_ids)

user_agent.add(friend_ids)

follow_ids = follow_agent.execute(query["screen_name"])
len(follow_ids)

user_agent.add(follow_ids)

#### Notes: környezet felderítése
- friend-eket mindig lekérjük - gyorsan megy (jellemzi a user-t)
- followerek: itt csak azokat kérjük le akik az adatban több fiókot is követnek - (pl. MAC esetén nem fogunk 500K fiókot lekérdezni...)

In [9]:
class TimelineAgent(BaseAgent):
    def __init__(self, credentials, wait_on_rate_limit=True):
        super(TimelineAgent, self).__init__(credentials, wait_on_rate_limit, True)
        self._clear_cache()
    
    def _clear_cache(self):
        self.known_users = []
        self.known_user_ids = set()
        self.unknown_user_ids = set()
        self.relations = []
        self.statuses = []
        
    def _process_mentions(self, status):
        source_user_id = status["user"]["id"]
        status_id = status["id"]
        if "user_mentions" in status["entities"]:
            for mention in status["entities"]["user_mentions"]:
                target_user_id = mention["id"]
                mention_relation = {"status": status_id, "user": target_user_id, "type":"mention"}
                self.unknown_user_ids.add(target_user_id)
                self.relations.append(mention_relation)
                
    def _process_relations(self, status):
        source_user_id = status["user"]["id"]
        status_id = status["id"]
        if not source_user_id in self.known_user_ids:
            self.known_users.append(status["user"])
        self.known_user_ids.add(source_user_id)
        author_relation = {"status": status_id, "user": source_user_id, "type":"author"}
        self.relations.append(author_relation)
        retweet = None
        if 'retweeted_status' in status:
            reweet = status['retweeted_status']
            retweet_relation = {"status": reweet['id'], "user": source_user_id, "type":"retweet"}
            self.relations.append(retweet_relation)
        return retweet
        #TODO: in_reply_to etc.
    """
    def _collect_retweets(self, status):
        status_id = status["id"]
        retweets = []
        try:
            retweets = timeline.api.get_retweeter_ids(id=status_id)
        except Exception as ex:
            print(ex)
        #finally:
        #    for user_id in retweets:
        # TODO: finish!!
    """ 
    def _handle_status(self, status):
        self.statuses.append(status)
        self._process_mentions(status)
        retweeted = self._process_relations(status)
        if retweeted != None:
            self._handle_status(retweeted)
    
    def _process_query(self, query_result):
        for tweepy_status in query_result:
            self._handle_status(tweepy_status._json)
        self.unknown_user_ids = self.unknown_user_ids.difference(self.known_user_ids)
        
    def _get_mention_results(self, query):
        # code below only works for authenticated user!
        #mentions_result = self.api.mentions_timeline(**query)
        result = self.client.get_user(username=query["screen_name"])
        mentions_result = []
        for response in tweepy.Paginator(self.client.get_users_mentions, result.data.id, max_results=100, limit=3):
            status_ids = [mention.id for mention in response.data]
            mentions_result += self.api.lookup_statuses(id=status_ids, tweet_mode='extended')
        #print(len(mentions_result))
        return mentions_result
    
    def make_user_query(self, query):
        self._clear_cache()
        mentions_result = self._get_mention_results(query)
        self._process_query(mentions_result)
        # TODO: pagination with old api!!!
        timeline_result = self.api.user_timeline(**query)
        self._process_query(timeline_result)

In [10]:
timeline = TimelineAgent(credentials)

In [11]:
timeline.make_user_query(query)
print(len(timeline.known_user_ids), len(timeline.unknown_user_ids), len(timeline.relations), len(timeline.statuses))

24 57 310 64


In [12]:
relations_df = pd.DataFrame(timeline.relations)

In [13]:
#TODO: add timestamp for edges!!!
def extract_network(relations_df):
    link_events = {
        "author":"red",
        "retweet":"green",
        "mention":"blue"   
    }
    event_dataframes = {}
    edges = []
    for event, color in link_events.items():
        event_df = pd.DataFrame(relations_df[relations_df["type"]==event])
        print(event_df.shape)
        edges += list(zip(event_df["user"], event_df["status"], [{"color":color}]*len(event_df)))
        event_dataframes[event] = event_df
    G = nx.MultiDiGraph()
    _ = G.add_edges_from(edges)
    print(G.number_of_edges(), G.number_of_nodes(), len(list(nx.connected_components(G.to_undirected()))))
    return G, event_dataframes

In [14]:
G, event_dataframes = extract_network(relations_df)

(64, 3)
(14, 3)
(232, 3)
310 155 1


### Some users and statuses are queried multiple times!!! - TODO: reduce duplication in queries

In [15]:
for event, event_df in event_dataframes.items():
    user_agent.add(list(event_df["user"]))
    status_agent.add(list(event_df["status"]))

In [16]:
len(user_agent), len(status_agent)

(310, 310)

In [17]:
users = user_agent.execute(force=True)
len(users)

112

In [18]:
statuses = status_agent.execute(force=True)
len(statuses)

131

In [19]:
replied_statuses = []
reply_edges = []
for status in statuses:
    status_id = status.id
    if status.in_reply_to_status_id != None:
        replied_status = status.in_reply_to_status_id
        reply_edges.append((status_id, replied_status, {"color":"black"}))
        replied_statuses.append(replied_status)
_ = G.add_edges_from(reply_edges)

In [20]:
status_agent.add(replied_statuses)

In [21]:
statuses += status_agent.execute(force=True)

In [22]:
nodes = G.nodes
user_meta = {}
for user in users:
    if user.id in nodes:
        user_meta[user.id] = user
print(len(user_meta))

status_meta = {}
for status in statuses:
    if user.id in nodes:
        status_meta[status.id] = status
print(len(status_meta))

81
80


In [23]:
labels, titles, shapes, values, titles, colors = {}, {}, {}, {}, {}, {}
for node in nodes:
    if node in user_meta:
        shapes[node] = "dot"
        values[node] = np.log2(1+user_meta[node].followers_count)
        labels[node] = user_meta[node].name
        titles[node] = user_meta[node].description
    elif node in status_meta:
        shapes[node] = "square"
        interactions = status_meta[node].favorite_count + status_meta[node].retweet_count
        values[node] = np.log2(1+interactions)
        labels[node] = status_meta[node].created_at.strftime("%Y-%m-%d")
        titles[node] = status_meta[node].full_text
    else:
        colors[node] = "red"
        print(node)
nx.set_node_attributes(G, labels, "label")
nx.set_node_attributes(G, shapes, "shape")
nx.set_node_attributes(G, values, "value")
nx.set_node_attributes(G, titles, "title")
nx.set_node_attributes(G, colors, "color")

In [24]:
from pyvis.network import Network
net = Network(notebook=True, height='1000px', width='1200px')
net.from_nx(G)

### Pagination is missing for a lot of endpoints!!!
### TODO: collect media as well!!!
### TODO: put it into database!!!
### TODO: graphvis does not work for large graphs - constant movement
### TODO: not all mentioned users are processed from statuses!!!

In [25]:
net.show("test.html")

timeline.statuses[-1]

#### User status is alwys up to date... for past tweets the past user state is hidden...

#### Finally query information on every user

retweets = []
try:
    retweets = timeline.api.get_retweeter_ids(id=737748119966879744)
except Exception as ex:
    print(ex)
finally:
    print(retweets)
    
#1240674204665741312)
#1241076920512434177)
#737748119966879744)

retweets

timeline.statuses[-1]

# Trekking trial

from datetime import datetime as dt 

with open("twittercrawler.json") as f:
    trekking_credentials = json.load(f)

query = "#dolomites has:geo"
start_time = "2018-09-01T0:00:00Z"
end_time = "2018-09-02T0:00:00Z"

res = client.search_all_tweets(query=query, start_time=start_time, end_time=end_time)

TODO: #dolomiti #dolomiten #mountains

In [26]:
client = timeline.client

In [27]:
for response in tweepy.Paginator(client.get_users_followers, 2244994945,
                                    max_results=10, limit=2):
    print(response.meta)

{'result_count': 10, 'next_token': '2KURHUMHVKD1GZZZ'}
{'result_count': 10, 'next_token': '0CMHN3JGTSD1GZZZ', 'previous_token': '55I5N94K0BIUEZZZ'}


In [28]:
response

Response(data=[<User id=1484210598133080068 name=Avisna username=Avisna22>, <User id=2207660990 name=Julien Perrochet username=0xTelegraph>, <User id=1508949068218712066 name=pick username=yanhua131>, <User id=1315984086188658691 name=Daily Dose of Nature username=DailyDONature>, <User id=66946490 name=panda_dy username=panda_dy>, <User id=357654084 name=Hasan username=hasanaribas>, <User id=1478555076855668736 name=Sri Lanka username=SriLankansWe>, <User id=1002635179406647296 name=Israel username=Israelorere>, <User id=1449140986408431616 name=8P Studios username=8pstudios>, <User id=1541379971192586250 name=Sami username=Sami1986shoja>], includes={}, errors=[], meta={'result_count': 10, 'next_token': '0CMHN3JGTSD1GZZZ', 'previous_token': '55I5N94K0BIUEZZZ'})

res = client.get_user(username="Istvan_A_Seres")
#mentions = client.get_users_mentions(res.data.id, max_results=5)

for response in tweepy.Paginator(client.get_users_mentions, res.data.id, max_results=5).flatten(limit=15):#, limit=20):
    #print(response.meta)
    print(response.id)

likes = client.get_liking_users(mention.id)
likes

In [29]:
client = tweepy.Client("Bearer Token here")

for response in tweepy.Paginator(client.get_users_followers, 2244994945,
                                    max_results=1000, limit=5):
    print(response.meta)

for tweet in tweepy.Paginator(client.search_recent_tweets, "Tweepy",
                                max_results=100).flatten(limit=250):
    print(tweet.id)

Unauthorized: 401 Unauthorized
Unauthorized