# Data Scraping

In [1]:
import tweepy as tw
from dotenv import load_dotenv
import pandas as pd
import random
import os
import nltk
from nltk.corpus import stopwords
import re
import string
from autocorrect import Speller 

In [2]:
# For more information about Twitter API credentials please read README.md file
# Get credentials from .env file

load_dotenv()
API_KEY = os.getenv('API_KEY')
API_SECRET_KEY = os.getenv('API_SECRET_KEY')
ACCESS_TOKEN = os.getenv('ACCESS_TOKEN')
ACCESS_TOKEN_SECRET = os.getenv('ACCESS_TOKEN_SECRET')

# os.getenv returns None for an unset variable; fail here with a clear message
# instead of letting a missing credential surface later as an opaque auth error.
_missing_credentials = [
    name
    for name, value in (
        ('API_KEY', API_KEY),
        ('API_SECRET_KEY', API_SECRET_KEY),
        ('ACCESS_TOKEN', ACCESS_TOKEN),
        ('ACCESS_TOKEN_SECRET', ACCESS_TOKEN_SECRET),
    )
    if not value
]
if _missing_credentials:
    raise RuntimeError(
        'Missing Twitter API credentials: ' + ', '.join(_missing_credentials)
        + '. Define them in the .env file or in the environment.'
    )


# authenticate
auth = tw.OAuthHandler(API_KEY, API_SECRET_KEY)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
# NOTE(review): `wait_on_rate_limit_notify` appears to exist only in Tweepy 3.x —
# confirm the pinned Tweepy version before upgrading.
api = tw.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [3]:
# Fetch the most recent statuses from the authenticated user's home timeline.
# NOTE(review): the Twitter v1.1 endpoint is documented to cap `count` at 200 —
# confirm whether paging is needed to actually reach 1000 tweets.
timeline = api.home_timeline(count=1000)

# Look each status up again in extended mode so the untruncated `full_text`
# attribute is available. The status lookup has no language filter, so no
# language argument is sent.
tweets = [
    api.get_status(id=tweet.id, tweet_mode='extended').full_text
    for tweet in timeline
]

# Build the frame once, after all texts are collected, so it is defined even
# when the timeline is empty and is not rebuilt on every iteration.
home_timeline_tweets = pd.DataFrame(tweets, columns=['tweet_text'])

home_timeline_tweets.head()

Unnamed: 0,tweet_text
0,RT @TheCodePixi: Casual reminder that mental h...
1,That’s a wrap 🙌🏽\n\nA big thank you to our spe...
2,RT @cpamzhang: I want to say to my fellow Asia...
3,Seattle stands together to #StopAsianHate &amp...
4,"RT @JulieAFortin: 1/ In @ScienceMagazine, ther..."


In [4]:
# Show one complete raw tweet (row label 14) for a before/after comparison
home_timeline_tweets.loc[14, 'tweet_text']

"New milestone: 1,702 donors to the Data Science Curriculum Pledge Drive. 👏\n\nWe'll use these funds to bring on experienced math and CS teachers to design these Jupyter Notebook projects.\n\nAll donations will be 100% matched by @darrellsilver.\n\nLearn more: https://t.co/kClO5NUFJQ https://t.co/u5N4YYcWvR"

# Data Cleaning

In [5]:
# Stopwords in English language
# NOTE(review): `stopword` is not referenced by any of the cells visible here
# (clean_text does not remove stop words) — confirm whether it is still needed.
stopword = nltk.corpus.stopwords.words('english')

# Function to clean and normalize the tweet text

# Speller instances keyed by language, so the dictionary is loaded only once
_SPELLER_CACHE = {}


def _get_speller(lang='en'):
    """Return a shared ``Speller`` for ``lang``, creating it on first use."""
    if lang not in _SPELLER_CACHE:
        _SPELLER_CACHE[lang] = Speller(lang=lang)
    return _SPELLER_CACHE[lang]


def clean_text(text):
    """Normalize the raw text of one tweet and return it as a single string.

    Steps, in order:
      1. Decode HTML entities (e.g. ``&amp;``).
      2. Remove hyperlinks, a leading "RT", @mentions and ``#`` signs;
         turn newlines into spaces; expand common English contractions.
      3. Replace every run of non-word characters with one space.
      4. Split CamelCase runs (e.g. hashtag bodies) into separate words.
      5. Collapse whitespace, strip both ends and lower-case.
      6. Run the result through the ``autocorrect`` spell checker.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        Lower-case, space-separated, spell-checked text with no leading or
        trailing whitespace.

    Notes
    -----
    The spell checker can rewrite proper nouns it does not know; in this
    notebook's sample output "Jupyter" comes back as "jupiter".
    """
    # Kept local so the function works without touching the import cell
    import html

    # Clear out HTML characters
    text = html.unescape(text)

    # (pattern, replacement) pairs, applied strictly in this order. Raw strings
    # keep regex escapes such as \S and \W from being read as (invalid) string
    # escapes.
    replacements = [
        (r"https?://\S+", ""),             # Remove hyperlinks
        (r"^RT\s+", ""),                   # Remove old style retweet text "RT"
        (r"\n", " "),                      # Newline -> space, so words on adjacent lines stay separate
        (r"@\w+.", ""),                    # Remove mentions; the trailing "." also eats the next character (e.g. ":")
        (r"#", ""),                        # Remove the hash # sign
        (r"’", "'"),                       # Replace ’ with '
        (r"(?i)\bcan't\b", "can not"),     # Irregular contractions first, otherwise the generic
        (r"(?i)\bwon't\b", "will not"),    # n't rule below would yield "ca not" / "wo not"
        (r"(?i)\bshan't\b", "shall not"),
        (r"'s", " is"),                    # Replace the contractions
        (r"n't", " not"),
        (r"'m", " am"),
        (r"'ll", " will"),
        (r"'d", " would"),
        (r"'ve", " have"),
        (r"'re", " are"),
        (r"\W+", " "),                     # Replace runs of non-word characters with a space
    ]
    for pattern, replacement in replacements:
        text = re.sub(pattern, replacement, text)

    # Separate the words: the capturing group makes re.split keep every
    # "Capitalized..." run, so "StopAsianHate" becomes "Stop Asian Hate"
    pieces = re.split(r"([A-Z][a-z]+[^A-Z]*)", text)
    text = " ".join(piece for piece in pieces if piece)

    # Collapse repeated whitespace and drop it from both ends
    text = re.sub(r"\s+", " ", text).strip()

    # Convert to lower case
    text = text.lower()

    # Spell check (shared speller instance, see _get_speller)
    return _get_speller('en')(text)


# Run every tweet through clean_text, overwriting the text column with the result
home_timeline_tweets['tweet_text'] = home_timeline_tweets['tweet_text'].apply(clean_text)

# Preview the first rows of the cleaned frame
home_timeline_tweets.head()

Unnamed: 0,tweet_text
0,casual reminder that mental health days are va...
1,that is a wrap a big thank you to our speakers...
2,i want to say to my fellow asian americans who...
3,seattle stands together to stop asian hate end...
4,1 in there has been a marked bias against wome...


In [6]:
# Show the same tweet (row label 14) after cleaning
home_timeline_tweets.loc[14, 'tweet_text']

'new milestone 1 702 donors to the data science curriculum pledge drive we will use these funds to bring on experienced math and cs teachers to design these jupiter notebook projects all donations will be 100 matched by learn more '

In [7]:
# Preview the first ten cleaned tweets
home_timeline_tweets.head(n=10)

Unnamed: 0,tweet_text
0,casual reminder that mental health days are va...
1,that is a wrap a big thank you to our speakers...
2,i want to say to my fellow asian americans who...
3,seattle stands together to stop asian hate end...
4,1 in there has been a marked bias against wome...
5,one of the best things that happened last year...
6,alert misinformation is circulating about our ...
7,yup this is me
8,a harm against any one of us is a harm against...
9,me posts hot sauce making poetry interior desi...


In [8]:
# Persist the cleaned tweets; the row index is not written to the file
home_timeline_tweets.to_csv(path_or_buf='home_timeline_tweets.csv', index=False)