# Twitter

The Twitter API lets you fetch live tweets by keyword, or fetch a 1/6th sample of all Twitter traffic.

In [5]:
import os

from dotenv import load_dotenv
# Let python-dotenv populate the process environment before the lookups below.
load_dotenv()

# Twitter API credentials. Indexing `os.environ` (instead of using `.get`)
# makes a missing variable fail right here with a KeyError.
CONSUMER_KEY, CONSUMER_SECRET, TWITTER_TOKEN, TWITTER_SECRET = (
    os.environ[name]
    for name in ('CONSUMER_KEY', 'CONSUMER_SECRET', 'TWITTER_TOKEN', 'TWITTER_SECRET')
)

In [6]:
import re

import twitter

# Streaming client used by `get_tweets` below.
# `timeout` and `heartbeat_timeout` are both 3600 (presumably seconds, i.e. one hour).
stream = twitter.TwitterStream(
    auth=twitter.OAuth(TWITTER_TOKEN, TWITTER_SECRET, CONSUMER_KEY, CONSUMER_SECRET),
    timeout=60 * 60,
    heartbeat_timeout=60 * 60,
)

def get_tweets(terms):
    """
    Stream tweets indefinitely (within the streaming quota) that contain any of the `terms`.

    `statuses.filter` doesn't always return correct matches, so every message is
    re-checked locally: it must be tagged as English (`lang == 'en'`) and its text
    must contain one of the terms as a whole token, case-insensitively.

    Parameters
    ----------
    terms : iterable of str
        Keywords, phrases or hashtags. They are joined with commas into the
        `track` argument of `stream.statuses.filter`.

    Yields
    ------
    dict
        Each message from the stream that passes the local check, unmodified.
    """
    terms = list(terms)  # iterated twice below, so accept one-shot iterables too

    # '\b' cannot be used here: in front of a '#' it only matches when the hashtag
    # is glued to a preceding word character. Lookarounds express the real intent:
    # no word character directly before or after the term.
    # (The previous `[\W\D]` class matched every character except digits, so
    # e.g. '#MLB' was accepted for the term '#ml'.)
    rex = re.compile(
        r'(?<!\w)(?:%s)(?!\w)' % '|'.join(re.escape(k) for k in terms),
        flags=re.I)
    track = ','.join(terms)

    for tweet in stream.statuses.filter(track=track):
        # Prefer the untruncated text when the payload carries an `extended_tweet`;
        # fall back to '' so a message without any text cannot break `rex.search`.
        text = tweet.get('extended_tweet', {}).get('full_text') or tweet.get('text') or ''
        if tweet.get('lang') == 'en' and rex.search(text):
            yield tweet

### Tweet Filtering

We are looking for all current live tweets that contain the following keywords.
Let's see how long it takes to fetch 10 live tweets.

In [7]:
from itertools import islice

from tqdm import tqdm_notebook

terms = [
    '#BigData',
    '#MachineLearning',
    'machine learning',
    'deep learning',
    'artificial intelligence',
    'data science',
    '#ml',
    '#deeplearning',
    '#datascience',
    '#machinelearning',
    'natural language processing'
]

N_TWEETS = 10  # how many matching live tweets to collect

# Keep handles on both the raw generator and its progress-bar wrapper so that
# they can be shut down explicitly once enough tweets have arrived (or the
# cell is interrupted while waiting).
tweet_stream = get_tweets(terms)
progress = tqdm_notebook(tweet_stream)
try:
    res = list(islice(progress, N_TWEETS))
finally:
    progress.close()
    # Closing the generator ends its loop over `stream.statuses.filter(...)`.
    # Unlike `del stream`, this leaves `stream` defined, so the cell can be re-run.
    tweet_stream.close()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

### Tweet Structure

A tweet contains a lot of data and metadata:

In [8]:
import json

# Pretty-print the first captured tweet (4-space indent) to show the raw payload structure.
print(json.dumps(res[0], indent=4))

{
    "created_at": "Tue Sep 25 23:12:41 +0000 2018",
    "id": 1044726414572965888,
    "id_str": "1044726414572965888",
    "text": "Transforming Businesses with Artificial Intelligence https://t.co/r5e5mVIu0m",
    "source": "<a href=\"https://www.zift123.com\" rel=\"nofollow\">Zift123 Platform</a>",
    "truncated": false,
    "in_reply_to_status_id": null,
    "in_reply_to_status_id_str": null,
    "in_reply_to_user_id": null,
    "in_reply_to_user_id_str": null,
    "in_reply_to_screen_name": null,
    "user": {
        "id": 11975452,
        "id_str": "11975452",
        "name": "Jeremy Murtishaw",
        "screen_name": "murtishaw",
        "location": "\u00dcT: 33.790969,-84.391181",
        "url": "http://about.me/murtishaw",
        "description": "Technologist specializing in Healthcare IT, Network Design and Security, Cloud Computing, Hosting.  2020 Presidential Candidate.",
        "translator_type": "none",
        "protected": false,
        "verified": false,
        

### Matched Tweets

Let's look at some of the tweet texts we have captured:

In [9]:
import json
import pandas as pd
# Raise pandas' column display width so long tweet texts are shown in the table
# (280 presumably mirrors Twitter's maximum tweet length -- TODO confirm).
pd.options.display.max_colwidth = 280


def prep_tweets(tweets):
    """
    Reduce each tweet to the fields of interest.

    Yields one dict per tweet with

    * 'text'   -- `extended_tweet.full_text` when present and non-empty, else `text`
    * 'author' -- `user.screen_name`

    A field that is absent from the tweet comes out as None.
    """
    # Further candidates that are currently not extracted: the tweet's `lang`
    # and the URLs listed under `entities.urls`.
    for tweet in tweets:
        record = {}
        extended = tweet.get('extended_tweet', {})
        record['text'] = extended.get('full_text') or tweet.get('text')
        user = tweet.get('user', {})
        record['author'] = user.get('screen_name')
        yield record

        
# Materialise the extracted records and render them as a table
# (the bare last expression is what the cell displays).
prepped = list(prep_tweets(res))
pd.DataFrame(prepped)

Unnamed: 0,author,text
0,murtishaw,Transforming Businesses with Artificial Intelligence https://t.co/r5e5mVIu0m
1,CyberSecManaged,Transforming Businesses with Artificial Intelligence https://t.co/8AmvarmQzL
2,fortify_24x7,Transforming Businesses with Artificial Intelligence https://t.co/I9MmHk4Z4x
3,rstatstweet,RT @Rbloggers: R developer’s guide to Azure https://t.co/w5xEi85UAb #rstats #DataScience
4,everwood_lynn,"RT @intel: Preserving the Great Wall is an almost impossible task, one that our team is attempting to solve using artificial intelligence a…"
5,TechnoJeder,RT @SacTechEvents: Who's going to the Sacramento Artificial Intelligence meetup on Sunday? Join us: https://t.co/MWDeTLczDh
6,TechnoJeder,RT @AccendNetworks: Transforming Businesses with Artificial Intelligence https://t.co/TFk3Sz7nLY
7,datatalentrec,Should We Be Afraid Of Artificial Intelligence?\n#MachineLearning #ArtificialInteligence #Robots #machineintelligence \nhttps://t.co/QgqW2itVXS https://t.co/ZbqDYqlzA3
8,TechnoJeder,RT @neocompsystems: Transforming Businesses with Artificial Intelligence https://t.co/dbY1zUU3gn
9,clive140,"RT @MikeQuindazzi: Next-gen #SelfDrivingCars see trouble around the corner, literally! &gt;&gt; @MikeQuindazzi &gt;&gt; #AI #MachineLearning #DeepLearn…"


### Hashtags

Find which hashtags occur most frequently in our matched tweets:

In [11]:
import gzip
import json
from collections import Counter
import pandas as pd


def get_tweets(path='data/machine_learning-2018-09-26.jl.gz'):
    """
    Load previously matched tweets from a gzipped JSON-lines file.

    NOTE: this reuses the name of the streaming `get_tweets(terms)` generator
    defined earlier in the notebook and replaces it once this cell has run.

    Parameters
    ----------
    path : str
        Location of the gzipped file holding one JSON document per line.
        Defaults to the capture analysed in this notebook.

    Yields
    ------
    The decoded JSON value of each non-blank line, in file order.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # text encoding.
    with gzip.open(path, 'rt', encoding='utf-8') as fhandle:
        for line in fhandle:
            # tolerate blank lines (e.g. a trailing newline at the end of the file)
            if line.strip():
                yield json.loads(line)

            
def hashtags(elt):
    """
    Walk a (nested) tweet structure and collect hashtags at every level
    (extended_tweet, retweets, etc.).

    For each dict that has a 'hashtags' key, one list with the lower-cased
    'text' of its entries is yielded; lists and dict values are then searched
    recursively. Anything that is neither a list nor a dict yields nothing.
    """
    if isinstance(elt, list):
        children = elt
    elif isinstance(elt, dict):
        if 'hashtags' in elt:
            yield [tag.get('text').lower() for tag in elt['hashtags']]
        children = elt.values()
    else:
        # scalars (str, int, None, ...) cannot contain hashtags
        return

    for child in children:
        yield from hashtags(child)
            
            
# Imported here as well so this cell does not depend on the live-streaming cell having run.
from itertools import islice

N_SAMPLE = 1000  # number of stored tweets to analyse

tweets = list(islice(get_tweets(), N_SAMPLE))

# `hashtags` yields one list per 'hashtags' entry found inside a tweet, so a tag
# is counted once for every level (tweet, retweet, extended_tweet, ...) it appears in.
# Flatten with comprehensions rather than `sum(..., [])`, which copies the
# growing result list on every addition.
hashtags_per_tweet = [
    [tag for tag_list in hashtags(tweet) for tag in tag_list]
    for tweet in tweets
]
all_hashtags = [tag for tweet_tags in hashtags_per_tweet for tag in tweet_tags]
ctr = Counter(all_hashtags)
pd.DataFrame(ctr.most_common(30))

Unnamed: 0,0,1
0,ai,771
1,bigdata,555
2,machinelearning,532
3,iot,344
4,datascience,339
5,deeplearning,294
6,ml,198
7,artificialintelligence,169
8,fintech,165
9,blockchain,138


### Leveraging Twitter Data

Based on the Twitter data we can generate many potential use cases:

* [Matched Tweets per Day](https://fluquid.com:5000/twitter)
* [DeepMoji](https://github.com/bfelbo/DeepMoji)
* Find job offerings
* Build a social graph of tweeters
* Engage with trending tweets and conversations in your niche
* Build live language models or sentiment analysis, e.g. for elections
* etc.