In [97]:
import json
import os
import time

import pandas as pd
from twitter_to_sqlite import utils

In [98]:
# Load the API credentials; the context manager closes the file handle
# instead of leaving it open for the lifetime of the kernel.
with open("../auth/auth.json") as auth_file:
    auth = json.load(auth_file)
owner_id = "143058191"
# Session used by every helper below for Twitter API calls.
session = utils.session_for_auth(auth)

## utils

In [100]:
def count_collection(collection_id, session=session):
    """Print and return how many tweets one fetched page of a collection holds.

    Requests ``collections/entries.json`` with ``count=200`` and counts the
    keys under ``objects.tweets`` in the reply.

    Args:
        collection_id: Collection identifier, e.g. "custom-1351555076024893440".
        session: Object exposing ``get(url)``; defaults to the notebook-level
            session that existed when this cell was executed.

    Returns:
        int: Number of tweets in the fetched page, or 0 when the reply has no
        ``objects.tweets`` entry.
    """
    url = f"https://api.twitter.com/1.1/collections/entries.json?id={collection_id}&count=200"
    payload = session.get(url).json()
    try:
        tweet_ids = list(payload["objects"]["tweets"])
    except (KeyError, TypeError):
        # Only a missing or malformed payload is treated as "empty"; anything
        # else (e.g. KeyboardInterrupt) is no longer swallowed.
        print(f"{collection_id} contains 0 tweets")
        return 0

    if len(tweet_ids) < 100:
        print(f"{collection_id} contains {len(tweet_ids)} tweets")
    else:
        # Message text is kept verbatim so recorded outputs stay comparable;
        # this branch is taken for 100 tweets or more.
        print(f"{collection_id} contains more then 100 tweets")
    return len(tweet_ids)


def get_list_id(owner_id, list_name, session=session):
    """Return the id of the first list of ``owner_id`` named ``list_name``.

    Args:
        owner_id: User id placed in the ``user_id`` query parameter.
        list_name: Exact list name to look for.
        session: Object exposing ``get(url)``.

    Returns:
        The ``"id"`` value of the first matching list, or None if no list
        has that name.
    """
    lists_url = f"https://api.twitter.com/1.1/lists/list.json?user_id={owner_id}"
    owned_lists = session.get(lists_url).json()
    matching_ids = (
        entry["id"] for entry in owned_lists if entry["name"] == list_name
    )
    return next(matching_ids, None)


def get_collection_id(owner_id, collection_name, session=session):
    """Return the id of the first collection of ``owner_id`` with the given name.

    Args:
        owner_id: User id placed in the ``user_id`` query parameter.
        collection_name: Exact collection name to look for.
        session: Object exposing ``get(url)``.

    Returns:
        The key under ``objects.timelines`` whose entry has a matching
        ``"name"``, or None if there is no match.
    """
    endpoint = f"https://api.twitter.com/1.1/collections/list.json?user_id={owner_id}"
    timelines = session.get(endpoint).json()["objects"]["timelines"]
    for timeline_id, timeline in timelines.items():
        if timeline["name"] == collection_name:
            return timeline_id
    return None


def err_handling(response, sleep=60):
    """Check an API response: wait on rate limits, raise on any other failure.

    The previous version looped on ``response.reason`` while only sleeping, so
    a rate-limited response (which never changes) blocked forever. This version
    waits once and hands control back to the caller.

    Args:
        response: Response-like object exposing ``reason``, ``status_code``
            and ``json()``.
        sleep: Seconds to wait after a "Too Many Requests" reply.

    Returns:
        bool: True when ``response.reason`` is "OK". False after a rate-limit
        wait; the request itself is not retried here, so the caller decides
        whether to send it again.

    Raises:
        Exception: ``"Status code: <status_code>"`` for every other reason,
            after printing whatever error detail the body carries.
    """
    if response.reason == "OK":
        return True

    print(response.reason)
    if response.reason == "Too Many Requests":
        print(f"Rate limit error - waiting for {sleep} seconds")
        time.sleep(sleep)
        return False

    # Decode the body once; a non-JSON error body must not hide the status code.
    try:
        payload = response.json()
    except ValueError:
        payload = None

    if isinstance(payload, dict) and payload.get("errors"):
        print(payload["errors"][0]["message"])
    elif isinstance(payload, dict) and "error" in payload:
        print(payload["error"])
    elif payload is not None:
        print(payload)
    raise Exception(f"Status code: {response.status_code}")


def rem_from_collection(collection_id, session=session):
    """Remove every tweet found in one fetched page of a collection.

    Fetches ``collections/entries.json`` with ``count=200``, posts one
    ``entries/remove.json`` request per tweet id, then reports the remaining
    count via ``count_collection``.

    Args:
        collection_id: Collection identifier to empty.
        session: Object exposing ``get(url)`` and ``post(url)``.
    """
    url = f"https://api.twitter.com/1.1/collections/entries.json?id={collection_id}&count=200"
    payload = session.get(url).json()
    try:
        tweet_ids = list(payload["objects"]["tweets"])
    except (KeyError, TypeError):
        print(f"{collection_id} contains 0 tweets")
        # Nothing to remove. Previously the raw payload dict was iterated
        # here, which posted removal requests for its top-level keys.
        tweet_ids = []

    for tweet_id in tweet_ids:
        remove_url = f"https://api.twitter.com/1.1/collections/entries/remove.json?id={collection_id}&tweet_id={tweet_id}"
        response = session.post(remove_url)
        err_handling(response)

    # Report with the same session that performed the removals.
    count_collection(collection_id, session=session)


def processing_list(collection_id, tweet_list, session=session):
    """Add tweets to a collection and tabulate the per-tweet outcome.

    Posts one ``collections/entries/add.json`` request per tweet id and
    records either the first error reason returned by the API or
    ``"no_errors"``. Progress is printed every 20 tweets.

    Args:
        collection_id: Target collection identifier.
        tweet_list: Sized iterable of tweet ids to add.
        session: Object exposing ``post(url)``. Added with a default so the
            signature matches the other helpers; existing two-argument calls
            behave as before.

    Returns:
        pandas.DataFrame: Columns ``tweet_id`` and ``err_reason``, one row per
        request whose response reason was "OK". The columns exist even when
        no rows were recorded.
    """
    records = []
    total = len(tweet_list)
    for position, tweet_id in enumerate(tweet_list, start=1):
        if position % 20 == 0:
            print(f"{position} / {total}")
        url = f"https://api.twitter.com/1.1/collections/entries/add.json?tweet_id={tweet_id}&id={collection_id}"
        response = session.post(url)
        err_handling(response)
        if response.reason != "OK":
            # Not recorded: only successful HTTP exchanges carry a result body.
            continue
        errors = response.json()["response"]["errors"]
        reason = errors[0]["reason"] if errors else "no_errors"
        records.append({"tweet_id": tweet_id, "err_reason": reason})

    # Explicit columns keep the summary below working for an empty result.
    df = pd.DataFrame(records, columns=["tweet_id", "err_reason"])
    print(df["err_reason"].value_counts())
    return df

## create_collection

In [101]:
# show collections
url = f"https://api.twitter.com/1.1/collections/list.json?user_id={owner_id}"
response = session.get(url)
def create_collection(owner_id, session=session):
    """Ensure the classification collections exist for ``owner_id``.

    Lists the owner's collections, prints them, and creates
    ``custom_newsfeed`` and ``not_relevant`` when a collection with that name
    is not present yet.

    Args:
        owner_id: User id whose collections are listed.
        session: Object exposing ``get(url)`` and ``post(url)``.
    """
    # Fetch here rather than reading a module-level `response`, which may hold
    # the reply of an unrelated, more recent request.
    list_url = f"https://api.twitter.com/1.1/collections/list.json?user_id={owner_id}"
    payload = session.get(list_url).json()
    try:
        collections = payload["objects"]["timelines"]
        for k in collections.keys():
            print(k, collections[k]["name"])
        collections_list = [collections[k]["name"] for k in collections.keys()]
    except (KeyError, TypeError, AttributeError):
        # The reply carries no usable timelines mapping.
        print("no collections")
        collections_list = []

    collections_for_classification = ["custom_newsfeed", "not_relevant"]
    print("")
    for c in collections_for_classification:
        if c not in collections_list:
            print(f"[{c}] not in collections, creating new collection")
            url = f"https://api.twitter.com/1.1/collections/create.json?name={c}"
            session.post(url)
        else:
            print(f"collection [{c}] already exists")

In [102]:
# List the owner's collections and report how many tweets each one holds.
url = f"https://api.twitter.com/1.1/collections/list.json?user_id={owner_id}"
response = session.get(url)
# `collections` (id -> metadata) is indexed again by a later cell, so define
# it here instead of relying on a value left over in the kernel.
collections = response.json()["objects"]["timelines"]

for collection_id in collections:
    count_collection(collection_id)

custom-1351555076024893440 contains more then 100 tweets
custom-1351093543385899011 contains more then 100 tweets


# add tweets to collection

## filter out 

In [6]:
# get members of list called `muted`

In [7]:
tweets_df = pd.read_csv("batch_to_add.csv")
# this df is already filtered - "new" tweets, news, and "seen" tweets have been removed

In [72]:
muted_list = get_list_id(owner_id, "muted")
muted_list

1351254441798807555

In [73]:
# Ask for up to 5000 members in one page: without `count` the endpoint pages
# at 20 users, which would silently truncate a longer muted list.
url = f"https://api.twitter.com/1.1/lists/members.json?list_id={muted_list}&owner_id={owner_id}&count=5000"
response = session.get(url)
muted_accounts = [i["id"] for i in response.json()["users"]]
len(muted_accounts)

15

In [12]:
tweets_df.shape

(1000, 3)

In [13]:
tweets_df[tweets_df["user"].isin(muted_accounts)].shape

(67, 3)

In [14]:
tweets_df = tweets_df[~tweets_df["user"].isin(muted_accounts)]
tweets_df.shape

(933, 3)

# add tweets from csv to collection

## remove tweets from custom newsfeed

In [103]:
custom_newsfeed = get_collection_id(owner_id, "custom_newsfeed")
custom_newsfeed

'custom-1351555076024893440'

In [104]:
collections[custom_newsfeed]

{'name': 'custom_newsfeed',
 'user_id': '143058191',
 'collection_url': 'https://twitter.com/saiko_grzegorz/timelines/1351555076024893440',
 'custom_timeline_url': 'https://twitter.com/saiko_grzegorz/timelines/1351555076024893440',
 'description': '',
 'url': 'https://twitter.com/saiko_grzegorz/timelines/1351555076024893440',
 'visibility': 'public',
 'timeline_order': 'curation_reverse_chron',
 'collection_type': 'user',
 'custom_timeline_type': 'user'}

In [105]:
count_collection(custom_newsfeed)

custom-1351555076024893440 contains more then 100 tweets


139

In [38]:
custom_newsfeed

'custom-1351555076024893440'

In [55]:
while count_collection(custom_newsfeed) > 0:
    rem_from_collection(custom_newsfeed)

custom-1351555076024893440 contains 0 tweets


# add n tweets from batch to collection

In [57]:
tweet_list = tweets_df["id"].tolist()[:300]

In [106]:
tweet_list = ["1335672354849091589"]

In [107]:
df = processing_list(custom_newsfeed, tweet_list)

not_found    1
Name: err_reason, dtype: int64


## to csv

In [None]:
# tweets_df["id"][:200].to_csv('seen.csv', mode='a', header=False)

In [62]:
len(tweet_list)

300

In [63]:
seen_tweets_old = pd.read_csv("seen.csv")

In [64]:
seen_tweets_old.to_csv("seen_old.csv", index=False)

In [65]:
df.to_csv("seen.csv", mode="a", header=False, index=False)

In [67]:
# NOTE(review): `seen_tweets` is not assigned in any visible cell (only
# `seen_tweets_old` is) - presumably it was read from "seen.csv"; confirm and
# define it explicitly so this cell survives Restart & Run All.
seen_tweets.head()

Unnamed: 0,tweet_id,err_reason
0,1337875267096875015,no_errors
1,1335698027537969155,no_errors
2,1345699105385959435,not_found
3,1336874318882467842,no_errors
4,1337717825629253633,not_found


In [68]:
seen_tweets["err_reason"].value_counts()

no_errors          1857
not_found           608
protected_tweet      35
Name: err_reason, dtype: int64

In [None]:
# save not_relevant

In [84]:
from datetime import datetime
import json

In [75]:
def get_collection_list(collection_id, session=session):
    """Return the tweet ids found in one fetched page of a collection.

    Requests ``collections/entries.json`` with ``count=200`` and returns the
    keys under ``objects.tweets``.

    Args:
        collection_id: Collection identifier to read.
        session: Object exposing ``get(url)``.

    Returns:
        list: Tweet ids from the fetched page, or an empty list when the reply
        has no ``objects.tweets`` entry.
    """
    url = f"https://api.twitter.com/1.1/collections/entries.json?id={collection_id}&count=200"
    payload = session.get(url).json()
    try:
        return list(payload["objects"]["tweets"])
    except (KeyError, TypeError):
        # Only a missing or malformed payload counts as "empty"; other
        # exceptions are no longer silently swallowed.
        print(f"{collection_id} contains 0 tweets")
        return []

In [79]:
# Snapshot the tweet ids currently in `not_relevant` into a timestamped file.
not_relevant_id = get_collection_id(owner_id, "not_relevant")
not_relevant_list = get_collection_list(not_relevant_id)
snapshot_name = f"{datetime.now():%Y_%m_%d_%H%M}_not_relevant_list.txt"
with open(snapshot_name, "w") as f:
    json.dump(not_relevant_list, f)

In [96]:
!ls

 2021_01_22_1547_not_relevant_list.txt
'2nd batch.csv'
 3nd_batch.csv
 4nd_batch.csv
 5nd_batch.csv
'add sleep to function.ipynb'
'auth and get friends.ipynb'
 auth.json
 batch_to_add.csv
 cli_click.py
 collections.ipynb
 create_news_domains_list.ipynb
 db.ipynb
 db.py
 dbsw.py
 df.csv
'embed tweet.ipynb'
'extract from url.ipynb'
 first_batch.csv
'grab 1000 tweets.ipynb'
'hud queries.ipynb'
 identify_news.ipynb
 __init__.py
 news_domains.txt
 news_sites.csv
'news sites domains.ipynb'
 old_db.py
 old_utils.py
 plan.md
'process seen tweets.ipynb'
 __pycache__
'save profile.ipynb'
 second_batch.csv
 seen.csv
 seen_old.csv
 t
 training.db
'twitter api 2 context annotations.ipynb'
 twitter.db
 uci-news-aggregator.csv
 Untitled.ipynb
'upload to collection.ipynb'
