# Get the Data

In [1]:
# Optional one-time setup: uncomment to install the third-party packages this
# notebook relies on. NOTE(review): `dotenv` is imported below but is not listed
# here — confirm it is installed by other means (e.g. a requirements file).
# Prefer `%pip install <pkg>==<version>` so the install targets the running kernel.
#!pip install tweepy
#!pip install tensorflow_hub
#!pip install tensorflow_text

In [11]:
import tweepy as tw
from dotenv import load_dotenv
import pandas as pd
import random 
import os

In [3]:
# For more information about Twitter API credentials please read the README.md file.
# Credentials are read from a local .env file so they never appear in the notebook.

load_dotenv()
API_KEY = os.getenv('API_KEY')
API_SECRET_KEY = os.getenv('API_SECRET_KEY')
ACCESS_TOKEN = os.getenv('ACCESS_TOKEN')
ACCESS_TOKEN_SECRET = os.getenv('ACCESS_TOKEN_SECRET')

# Fail fast with a clear message: os.getenv returns None for unset variables,
# which would otherwise only surface later as an opaque authentication error.
_missing_credentials = [
    name
    for name, value in (
        ('API_KEY', API_KEY),
        ('API_SECRET_KEY', API_SECRET_KEY),
        ('ACCESS_TOKEN', ACCESS_TOKEN),
        ('ACCESS_TOKEN_SECRET', ACCESS_TOKEN_SECRET),
    )
    if not value
]
if _missing_credentials:
    raise RuntimeError(
        'Missing Twitter API credentials in environment/.env: '
        + ', '.join(_missing_credentials)
    )

# Authenticate and build the API client. The client is configured to wait
# (and notify) when a rate limit is hit.
# NOTE(review): `wait_on_rate_limit_notify` looks like a Tweepy 3.x option —
# confirm the pinned Tweepy version before upgrading.
auth = tw.OAuthHandler(API_KEY, API_SECRET_KEY)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tw.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)


In [14]:
# Search for recent English-language #seattle tweets, excluding retweets.
test_query = '#seattle -filter:retweets'

# `lang` (not `lan`) is the search parameter that restricts results by language.
# `tweet_mode='extended'` makes each result carry the untruncated `full_text`,
# so no additional per-tweet `get_status` request is needed.
tweets = tw.Cursor(
    api.search,
    q=test_query,
    lang='en',
    tweet_mode='extended',
).items(100)

# Each record is (user_location, tweet_text, created_at).
tweets_list = []
try:
    # The network requests happen while iterating the cursor, so the whole
    # loop is guarded; tweets collected before a failure are kept.
    for tweet in tweets:
        tweets_list.append((tweet.user.location, tweet.full_text, tweet.created_at))
except tw.TweepError as e:
    # The package is imported as `tw`; report the API error and continue with
    # whatever was fetched so far.
    print(e.reason)

print("Total Tweets fetched:", len(tweets_list))

Total Tweets fetched: 100


In [15]:
# Turn the collected (location, text, timestamp) tuples into a labelled table
# and display it as the cell output.
df = pd.DataFrame(
    data=tweets_list,
    columns=['user_location', 'tweet_text', 'date'],
)
df

Unnamed: 0,user_location,tweet_text,date
0,"Seattle, WA",46F in #Seattle w light rain &amp; 9.22mph win...,2021-03-19 01:00:36
1,"West Seattle, Washington",These ✨NEW✨ crayons are great for kids and cat...,2021-03-19 01:00:04
2,"Seattle, WA",“This is a big win for grocery store employees...,2021-03-19 00:56:17
3,Seattle,Luxury penthouse is exceptionally designed and...,2021-03-19 00:53:15
4,"Seattle, WA",6pm SEATTLE HERE &amp; NOW with John Yasutake\...,2021-03-19 00:52:47
...,...,...,...
95,"Seattle, WA",SPD Asst Chief Steve Hirjak stops by Emerald C...,2021-03-18 22:00:51
96,"Seattle, WA",50F in #Seattle w light rain &amp; 11.5mph win...,2021-03-18 22:00:36
97,"Seattle, WA",Kilo is so KUTE! \n\n#gooddog #dogdaycare #dog...,2021-03-18 22:00:18
98,"New York, NY",#Seattle #Father Ambushed and #Shot 9 Times 'K...,2021-03-18 21:59:51


In [16]:
# Persist the tweets. The default integer index carries no information, so it is
# not written; otherwise reading the file back yields a spurious "Unnamed: 0" column.
df.to_csv('tweets_df.csv', index=False)

In [17]:
# Show the full, untruncated text of the tweet in the row labelled 2.
df.loc[2, 'tweet_text']

'“This is a big win for grocery store employees who have been critical and vulnerable frontline workers since the start of the pandemic,” #Seattle City Attorney Pete Holmes said. https://t.co/Ya3xMqIoB4'

# Preprocess

## Split train/test/validation data

In [18]:
# Reload the saved tweets from disk so preprocessing starts from the CSV file.
tweets_df = pd.read_csv(filepath_or_buffer='tweets_df.csv')

In [19]:
# Work on a copy of the tweet texts so the loaded frame is left untouched.
data = tweets_df['tweet_text'].copy()
data_size = len(data)
print('data_size:', data_size)

# Target split: 60% train / 30% test / 10% validation.
# Train and validation sizes are rounded; the test size is the remainder so the
# three parts always add up to data_size (rounding each share independently can
# lose or gain a row, e.g. 4 rows -> 2 + 1 + 0).
train_size = int(round(data_size * 0.6, 0))
validation_size = int(round(data_size * 0.1, 0))
test_size = data_size - train_size - validation_size

print('train_size:', train_size)
print('test_size:', test_size)
print('validation_size:', validation_size)

data_size: 100
train_size: 60
test_size: 30
validation_size: 10


In [20]:
# Shuffle with pandas instead of `random.shuffle`, which is meant for plain
# mutable sequences rather than a Series. A fixed seed makes the split
# reproducible across kernel restarts; the index is reset so that positions and
# labels agree after shuffling.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

# Compute the validation boundary explicitly: negative slicing with
# `-validation_size` breaks when validation_size is 0 (`[-0:]` selects every row).
validation_start = len(data) - validation_size

train_data = list(data.iloc[:train_size])
test_data = list(data.iloc[train_size:validation_start])
validation_data = list(data.iloc[validation_start:])