In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys; sys.path.insert(0, '..')
import json

import tweepy
from tqdm import tqdm, trange

from paradeller.keys import (
    consumer_key, consumer_secret, access_token, accss_token_secret
)
from paradeller.stopwords import stopwords

## Scraping

In [3]:
from paradeller.scrape import api, get_tweets, display_status, format_status

In [5]:
# Fetch whatever get_tweets() returns and print each status for a quick
# visual check of the scraper.
res = get_tweets()

for status in res:
    display_status(status)

Knot at this time.
-- @squidpolitico

1144721637520134145
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
I real life haven‚Äôt had anything to eat all day
-- @____Naye

1144721637503315968
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
ü§©üòÅüòÅ God is good all the time
-- @_brightskin_

1144721637473996806
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Damn ! Cencor gardens filled up.
-- @soulpee

1144721637432016897
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Every day is an opportunity to innovate. üî¨
-- @80AcresFarms

1144721637390147584
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Bretman rock is definitely my fave YouTuber
-- @okaymaryann

1144721637297840133
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
I was born a leader
-- @marcobrownjr

1144721637230747648
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
just right is my favorite got7 song.
-- @softrenjxn

1144721637188784128
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
I finished c

## Data Collection

In [6]:
from paradeller.helper import load_archive, update_archive

In [7]:
# Load the stored archive and show how many entries it currently holds.
archive = load_archive()
len(archive)

53898

## Pre-select tweets

In [11]:
def fmt_status_from_id(id_):
    """
    Look up a single status by its ID through `api` and return the
    result of passing it to `format_status`.
    """
    return format_status(api.get_status(id_))

In [16]:
# Hand-picked status IDs to look up and keep as the pre-selected tweets.
ids = [
    1144127521929224197,
    1144135638721036288,
    1144135638733557760,
    1144137985971183616,
    1144144954219999232,
    1144146074321317888,
    1144313057377546240,
    1144313057444847621
]

In [17]:
# Collect the formatted form of every pre-selected tweet that can still
# be fetched; report the ones that cannot and carry on.
saved = []
for id_ in ids:
    try:
        s = fmt_status_from_id(id_)
    except tweepy.TweepError as err:
        # e.g. the API answers "No status found with that ID."
        print(err)
        print(">> ", id_)
    else:
        saved.append(s)

[{'code': 144, 'message': 'No status found with that ID.'}]
>>  1144127521929224197


In [18]:
saved

[{'id': 1144135638721036288,
  'text': 'time will be frozen for us',
  'author': 'mariuuhhhh',
  'time': '2019-06-27 06:49:27'},
 {'id': 1144135638733557760,
  'text': 'Is it me or is it a break up season?',
  'author': 'Linnneyyy',
  'time': '2019-06-27 06:49:27'},
 {'id': 1144137985971183616,
  'text': 'ive been gettin horny at the most inconvenient times lately wtf ü•¥üò´',
  'author': 'SoCvsh',
  'time': '2019-06-27 06:58:47'},
 {'id': 1144144954219999232,
  'text': 'Fuck work experience üñïüèæüñïüèæ',
  'author': 'juggz_19',
  'time': '2019-06-27 07:26:28'},
 {'id': 1144146074321317888,
  'text': 'All you can do is watch.',
  'author': 'ayeroven',
  'time': '2019-06-27 07:30:55'},
 {'id': 1144313057377546240,
  'text': 'Piercing my belly üôá\u200d‚ôÄÔ∏è',
  'author': 'SuriouslyStace',
  'time': '2019-06-27 18:34:27'},
 {'id': 1144313057444847621,
  'text': 'did I mention i‚Äôm gay?',
  'author': 'hoeslynn',
  'time': '2019-06-27 18:34:27'}]

## Searching the API for matches

In [13]:
import emoji
import string
from tqdm import tqdm, trange

In [14]:
def tokenize(tweet):
    """
    Split a tweet into lowercase tokens with ASCII punctuation removed.

    The text is first split on the emoji pattern and re-joined with
    spaces (so emoji are meant to come out as tokens of their own), then
    split on whitespace. Every character in `string.punctuation` is
    deleted from each word, and words left empty by that deletion are
    dropped.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Non-empty tokens in order of appearance.
    """
    # Build the punctuation-deleting table once instead of once per word.
    strip_punct = str.maketrans('', '', string.punctuation)

    # NOTE(review): get_emoji_regexp() is not available in emoji >= 2.0 —
    # confirm the installed version before upgrading that package.
    spaced = ' '.join(emoji.get_emoji_regexp().split(tweet))

    # str.split() already discards surrounding whitespace, so no extra
    # strip() is needed on the individual words.
    cleaned = (w.lower().translate(strip_punct) for w in spaced.split())

    # A word made only of punctuation (e.g. "-" or "...") becomes '' after
    # stripping; keep such empties out of word sets and search queries.
    return [tok for tok in cleaned if tok]

In [15]:
# Sanity-check the tokenizer on the text of the first saved tweet.
tokenize(saved[0]['text'])

['sitting',
 'in',
 'lingerie',
 'drinking',
 'wine',
 'ima',
 'wife',
 'for',
 'sure',
 'üò≠']

### Pick some tweets

In [16]:
# First of the two tweets used to build the word set: the first saved one.
s1 = saved[0]
s1

{'id': 1144127521929224197,
 'text': 'Sitting in lingerie drinking wine ima wife for sure üò≠',
 'author': '__mxvii',
 'time': datetime.datetime(2019, 6, 27, 6, 17, 12)}

In [17]:
# Second of the two tweets used to build the word set: the last saved one.
# Indexing from the end instead of hard-coding position 7 keeps this cell
# working when some IDs could not be fetched and `saved` ends up shorter
# than `ids` (a 7-item `saved` makes `saved[7]` raise IndexError).
s2 = saved[-1]
s2

{'id': 1144313057444847621,
 'text': 'did I mention i‚Äôm gay?',
 'author': 'hoeslynn',
 'time': datetime.datetime(2019, 6, 27, 18, 34, 27)}

In [18]:
# Union of the distinct tokens appearing in either selected tweet.
word_set = set(tokenize(s1['text']) + tokenize(s2['text']))

In [19]:
word_set

{'did',
 'drinking',
 'for',
 'gay',
 'i',
 'ima',
 'in',
 'i‚Äôm',
 'lingerie',
 'mention',
 'sitting',
 'sure',
 'wife',
 'wine',
 'üò≠'}

In [20]:
# Remove stopwords; the remaining words are what the search query is built from.
search_words = word_set - set(stopwords)
search_words

{'did',
 'drinking',
 'gay',
 'i',
 'ima',
 'i‚Äôm',
 'lingerie',
 'mention',
 'sitting',
 'sure',
 'wife',
 'wine',
 'üò≠'}

In [21]:
# Search for tweets containing any of the search words, with the filter
# operators below appended to the query.
query_filters = " -filter:retweets -filter:links -filter:media"
query = ' OR '.join(search_words) + query_filters
print(query)

def get_search_tweets():
    """
    Run the notebook-level `query` through `api.search` (English only,
    entities excluded) and keep the results whose text passes `is_good`.

    Returns
    -------
    list
        The statuses from one search request for which
        `is_good(status.text)` is truthy.
    """
    # NOTE(review): `is_good` is not defined or imported in the visible
    # cells — make sure it is in scope (presumably it lives in
    # paradeller.scrape; confirm).
    tweets = api.search(
        # The standard search endpoint returns at most 100 tweets per
        # request, so ask for that rather than an unreachable 1000.
        q=query, lang='en', count=100, include_entities=False
    )
    return [t for t in tweets if is_good(t.text)]

sure OR did OR üò≠ OR mention OR sitting OR wine OR i OR wife OR lingerie OR ima OR drinking OR gay OR i‚Äôm -filter:retweets -filter:links -filter:media


In [25]:
def potential_match(tweet, word_set):
    """
    Decide whether a tweet could be built from the given words.

    A tweet is a potential match when it yields at least one token and
    every one of its tokens is in the word set.

    Parameters
    ----------
    tweet : str
        Raw tweet text; it is tokenized with `tokenize`.
    word_set : set
        Words the tweet is allowed to use.

    Returns
    -------
    bool
        True if the tweet's tokens are a non-empty subset of `word_set`.
    """
    tokens = set(tokenize(tweet))
    # The empty set is a subset of everything; without this guard a tweet
    # with no tokens would count as a match for any word set.
    if not tokens:
        return False
    return tokens <= word_set

In [34]:
# Poll the search API repeatedly and keep every tweet whose tokens all
# come from `word_set`.
iterations = 1000

matches = []
# The same query is re-run on every iteration and can return tweets that
# were already examined; remember their IDs so each is handled only once.
seen_ids = set()

for i in trange(iterations):
    res = get_search_tweets()
    for status in res:
        if status.id in seen_ids:
            continue
        seen_ids.add(status.id)
        if potential_match(status.text, word_set):
            print("yeah!!")
            display_status(status)
            # Was `mactches.append(...)`, which raised NameError on the
            # first hit and aborted the whole polling run.
            matches.append(status)

  7%|‚ñã         | 68/1000 [00:42<08:34,  1.81it/s]Rate limit reached. Sleeping for: 347
 25%|‚ñà‚ñà‚ñç       | 248/1000 [08:20<07:22,  1.70it/s]   Rate limit reached. Sleeping for: 793
 43%|‚ñà‚ñà‚ñà‚ñà‚ñé     | 428/1000 [23:29<05:28,  1.74it/s]    Rate limit reached. Sleeping for: 790
 61%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 608/1000 [38:37<04:04,  1.60it/s]    Rate limit reached. Sleeping for: 787
 79%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñâ  | 788/1000 [53:50<02:18,  1.53it/s]    Rate limit reached. Sleeping for: 779
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [1:09:07<00:00,  1.36it/s]   


In [36]:
matches

[]