# Filtering tweets for the web app

In [1]:
from twitter_scraping.twitter_scraping import TwitterScraper, TweetsFilter
import daiquiri
import sys
import logging
import os.path as path
import pandas as pd
import datetime

# Load IPython's autoreload extension; the reload mode itself is selected
# later in the notebook with `%autoreload 2`.
%load_ext autoreload

In [2]:
# Directory receiving the log file, and path of the Twitter ids dictionary (JSON).
LOGS_DIR = '../logs/logs_tweepy/'
IDS_PATH = '../data/data_tweepy/twitter_ids_dict.json'

# Send INFO-level (and above) records both to the notebook's stdout
# and to a text-formatted log file inside LOGS_DIR.
stdout_output = daiquiri.output.Stream(sys.stdout)
file_output = daiquiri.output.File(
    path.join(LOGS_DIR, 'update_tweets.log'),
    formatter=daiquiri.formatter.TEXT_FORMATTER,
)
daiquiri.setup(level=logging.INFO, outputs=(stdout_output, file_output))

# Logger handed to the scraper below.
logger = daiquiri.getLogger(__name__)

In [3]:
# Build the scraper and load the stored tweets so that `tweets_df` is available.
# The ids-dictionary path reuses IDS_PATH (defined with the other path
# constants) instead of repeating the same string literal.
# NOTE(review): data_path points to a pickle file — pickles should only be
# loaded from trusted sources.
twitter_scraper = TwitterScraper(
    logger,
    data_path='../data/data_tweepy/tweets_df.pkl',
    credentials_path='../.secret/credentials.ini',
    twitter_ids_dict_path=IDS_PATH
)
twitter_scraper.load_tweets()

# (rows, columns) of the loaded tweets DataFrame.
twitter_scraper.tweets_df.shape

(827, 8)

In [5]:
# Autoreload mode 2: reload all modules before executing code, so edits to
# the project module are picked up without restarting the kernel.
%autoreload 2

# Filter object working on the tweets loaded by the scraper.
tweets_filter = TweetsFilter(twitter_scraper.tweets_df)

Check filtering by handles.

In [7]:
handles_list = ['dpalmisano', 'nvidia']

# Filter through the combined entry point, passing the handles as `companies_list`.
tweets_filter.filter_tweets(companies_list=handles_list)

Unnamed: 0,twitter_id,created_at,text,user_id,twitter_handle,is_retweet,retweet_count,favorite_count
819,1056996903785623552,2018-10-29 19:51:14,"RT @NVIDIAAIDev: This week, @NVIDIA researcher...",61559439,nvidia,False,50,0
820,1055958243028066304,2018-10-26 23:03:58,RT @NVIDIADC: Today is the debut of @Livermore...,61559439,nvidia,False,52,0
821,1055890108451192832,2018-10-26 18:33:14,"RT @NVIDIAEmbedded: SF-based startup, @motionl...",61559439,nvidia,False,23,0
822,1055889820499636224,2018-10-26 18:32:05,RT @NVIDIAAIDev: Finding the perfect place to ...,61559439,nvidia,False,10,0
823,1055564757971812352,2018-10-25 21:00:24,"RT @NVIDIAAIDev: .@aivatechnology, the company...",61559439,nvidia,False,92,0
824,1055502591658868737,2018-10-25 16:53:22,"#GTC18, the largest AI event it the DC area, b...",61559439,nvidia,False,29,90
825,1055494200634949633,2018-10-25 16:20:02,RT @NvidiaAI: NVIDIA and @RedHat are simplifyi...,61559439,nvidia,False,60,0
826,1055494021173243904,2018-10-25 16:19:19,RT @NVIDIADC: Join us at #SC18 for a special a...,61559439,nvidia,False,28,0
0,1055389607645380609,2018-10-25 09:24:25,I've been trying to return an item to you and ...,14656799,dpalmisano,False,0,0
597,1055228806972096512,2018-10-24 22:45:27,RT @NvidiaAI: We’re honored NVIDIA’s “I am AI”...,61559439,nvidia,False,14,0


In [12]:
# Sanity check: the dedicated company filter and the combined entry point
# must return results of the same shape for the same handles.
by_company_shape = tweets_filter.filter_by_company(twitter_scraper.tweets_df, handles_list).shape
combined_shape = tweets_filter.filter_tweets(companies_list=handles_list, reindex=True).shape
by_company_shape == combined_shape

True

Check filtering by keywords.

In [13]:
# Keywords to look for in the tweets.
# NOTE(review): matching rules (substring vs. whole word, case sensitivity)
# are defined in TweetsFilter — confirm there.
keywords_list = ['time', 'picture']

tweets_filter.filter_tweets(keywords_list=keywords_list)

Unnamed: 0,twitter_id,created_at,text,user_id,twitter_handle,is_retweet,retweet_count,favorite_count
799,1056584017112432641,2018-10-28 16:30:34,What’s in a pocket? 80-yard routes take time. ...,2803191,Intel,False,12,69
811,1056553734287904769,2018-10-28 14:30:14,"It's race day! This time, @ScuderiaFerrari wil...",14861876,AMD,False,14,90
800,1055979974841298944,2018-10-27 00:30:19,What’s in a shovel? It’s not the flashiest of ...,2803191,Intel,False,23,73
817,1055806319482101760,2018-10-26 13:00:17,It’s a quick turnaround and @ScuderiaFerrari i...,14861876,AMD,False,108,628
603,1054839314373271552,2018-10-23 20:57:45,RT @NVIDIAAIDev: NVIDIA GPU Cloud reduces the ...,61559439,nvidia,False,43,0
213,1054545542083166209,2018-10-23 01:30:24,"What’s in a quarterback? Sometimes, it’s findi...",2803191,Intel,False,16,74
246,1051815630641356800,2018-10-15 12:42:42,RT @cashcash: Awesome time supporting our frie...,2803191,Intel,False,14,0
629,1050794665916620801,2018-10-12 17:05:45,RT @NVIDIAEU: Thanks for joining us at GTC Eur...,61559439,nvidia,False,25,0
274,1049355024260894720,2018-10-08 17:45:08,#9thGen is transforming the gaming and enterta...,2803191,Intel,False,47,425
283,1049057563105361920,2018-10-07 22:03:08,There has never been a better time to own a de...,2803191,Intel,False,60,218


In [16]:
# Sanity check: the dedicated keyword filter and the combined entry point
# must return results of the same shape for the same keywords.
by_keyword_shape = tweets_filter.filter_by_keyword(twitter_scraper.tweets_df, keywords_list).shape
combined_shape = tweets_filter.filter_tweets(keywords_list=keywords_list).shape
by_keyword_shape == combined_shape

True

Check filtering by date range.

In [17]:
# Fixed [start, end] window so the result does not depend on the run date
# (an open-ended window could instead end at pd.Timestamp.today()).
# NOTE(review): whether the bounds are inclusive is decided by TweetsFilter — confirm there.
dates_range = [pd.to_datetime('2018-10-15'), pd.to_datetime('2018-10-17')]

tweets_filter.filter_tweets(dates_range=dates_range)

Unnamed: 0,twitter_id,created_at,text,user_id,twitter_handle,is_retweet,retweet_count,favorite_count
621,1052343153561194496,2018-10-16 23:38:54,RT @NvidiaAI: Join us at #GTC18 in Washington ...,61559439,nvidia,False,21,0
413,1052322074461646848,2018-10-16 22:15:08,RT @RadeonPro: The AMD Radeon Pro WX 8200 is h...,14861876,AMD,False,51,0
622,1052308567175225344,2018-10-16 21:21:28,RT @NVIDIADesign: At the NVIDIA booth at #Adob...,61559439,nvidia,False,24,0
623,1052304224632037376,2018-10-16 21:04:12,RT @NVIDIADesign: Check out this tech preview ...,61559439,nvidia,False,28,0
234,1052300424009547776,2018-10-16 20:49:06,RT @BDJFuturist: At #Wired25 I sat down w/ Dr....,2803191,Intel,False,27,0
235,1052277279336751105,2018-10-16 19:17:08,"RT @WIRED: At #WIRED25, Intel's Genevieve Bell...",2803191,Intel,False,69,0
236,1052234775103062016,2018-10-16 16:28:14,Condolences from all of us at Intel to the fam...,2803191,Intel,False,61,243
237,1052212651445628928,2018-10-16 15:00:19,This new body scanner powered by #IntelRealSen...,2803191,Intel,False,27,73
238,1052070028873482240,2018-10-16 05:33:36,@Djoen45132085 Hello! Availability depends on ...,2803191,Intel,False,0,0
239,1052009310732005376,2018-10-16 01:32:19,RT @Patriots: The @Intel True View look at @ed...,2803191,Intel,False,205,0


In [19]:
# Sanity check: the dedicated date filter and the combined entry point
# must return results of the same shape for the same date range.
by_date_shape = tweets_filter.filter_by_date(tweets_filter.tweets_df, dates_range).shape
combined_shape = tweets_filter.filter_tweets(dates_range=dates_range).shape
by_date_shape == combined_shape

True

Check combined filtering with reindexing, as well as the warning issued when a company name is not present among the Twitter handles.

In [36]:
# Apply all three filters at once and ask for a reindexed result.
# NOTE(review): both handles below appear among the dataset's Twitter handles,
# so the missing-handle warning is presumably not triggered by this cell —
# add an unknown name to the list to exercise it.
handles_list = ['nvidia', 'AMD']
# The upper bound is "now", so the window grows with every re-run.
dates_range = [pd.to_datetime('2018-10-01'), pd.Timestamp.now()]
keywords_list = ['business']

tweets_filter.filter_tweets(
    companies_list=handles_list,
    dates_range=dates_range,
    keywords_list=keywords_list,
    reindex=True)

Unnamed: 0,twitter_id,created_at,text,user_id,twitter_handle,is_retweet,retweet_count,favorite_count
0,1050113879362215936,2018-10-10 20:00:33,"AMD is honored to join @Forbes' 2018 World's Best Employers! At AMD, our business is only possible thanks to our pa… https://t.co/RzWwwq2nSv",14861876,AMD,False,27,162


In [29]:
# Distinct Twitter handles present in the filter's DataFrame, as a plain list.
tweets_filter.tweets_df['twitter_handle'].unique().tolist()

['dpalmisano', 'Intel', 'AMD', 'nvidia']