# Data Scraping

In [2]:
import tweepy as tw
from dotenv import load_dotenv
import pandas as pd
import random
import os
import nltk
from nltk.corpus import stopwords
import re
import string
from autocorrect import Speller 

In [4]:
# For more information about Twitter API credentials please read README.md file
# Get credentials from .env file

load_dotenv()
API_KEY = os.getenv('API_KEY')
API_SECRET_KEY = os.getenv('API_SECRET_KEY')
ACCESS_TOKEN = os.getenv('ACCESS_TOKEN')
ACCESS_TOKEN_SECRET = os.getenv('ACCESS_TOKEN_SECRET')


# authenticate
auth = tw.OAuthHandler(API_KEY, API_SECRET_KEY)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tw.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [6]:
timeline = api.home_timeline()


tweets = []
for tweet in timeline:
    text = api.get_status(id=tweet.id, tweet_mode = 'extended').full_text
    tweets.append(text)
    home_timeline_tweets = pd.DataFrame(tweets, columns=['tweet_text'])

home_timeline_tweets

Unnamed: 0,tweet_text
0,RT @NASAJPL: Tune into the Mars Report 🔴 A reg...
1,"NCAA accused of ""disrespectful"" treatment towa..."
2,"""This is a human issue."" @danieldaekim on comi..."
3,RT @googlearts: Tune in tomorrow on #WorldStor...
4,"During my first week, I signed a memorandum to..."
5,RT @ericnamofficial: This week has been diffic...
6,11 years after the Affordable Care Act was pas...
7,".@ericnamofficial: ""If you're surprised by the..."
8,ICYMI: We hosted #Include2021 and brought toge...
9,100 million shots in arms…Hugs are coming.💕


In [8]:
home_timeline_tweets['tweet_text'][14]

'What’s going on ~250 miles above Earth? \n\n🤩 @AstroVicGlover &amp; @Astro_illini go for a spacewalk\n🌱 @Astro_Soichi completes the Asian Herbs in Space Investigation \n\nAnd @NASAEarth’s Tournament Earth begins! Get the details in #SpaceToGround: https://t.co/JX48si5UAJ https://t.co/LXgOXLW9yX'

# Data Cleaning

In [10]:
# Stopwords in English language
stopword = nltk.corpus.stopwords.words('english')

# Function to clean and tokenize the data
def clean_text(text):

    #Clear out HTML characters 
    import html
    text=html.unescape(text)

    # Replace or remove the characters 
    replacement_dict={"https?:\/\/.\S+":"", # Remove hyperlinks
                      "^RT[\s]+":"", # Remove old style retweet text "RT"
                      "\\n":"", # Remove newline caharacter
                      "@\w+.":"", # Remove mentions starting with @
                      "#":"", # Remove the hash # sign
                      "’":"'", # Replace ’ with '
                      "'s":" is", # Replace the contractions
                      "n't":" not",
                      "'m":" am",
                      "'ll":" will",
                      "'d":" would",
                      "'ve":
                      " have",
                      "'re":" are",
                      "\W+":" ", # Remove non-word caharacters
                      "^\s+":"" # Remove whitespace at the beginning
                      } 

    for item in replacement_dict.keys():
        text=re.sub(item,replacement_dict[item], text)
        
    # Separate the words 
    text = " ".join([char for char in re.split("([A-Z][a-z]+[^A-Z]*)",text) if char])

    # Replace double spaces with one
    text = re.sub("\s+"," ",text)     

    # Convert to lower case
    text = text.lower()
   
    # Spell check 
    spell = Speller(lang='en') 
    text=spell(text)  
    
    # Return text
    return text


# Apply the function to text column
home_timeline_tweets['tweet_text'] = home_timeline_tweets['tweet_text'].apply(lambda x: clean_text(x))

# Display the dataframe
home_timeline_tweets

Unnamed: 0,tweet_text
0,tune into the mars report a regular update on ...
1,ncaa accused of disrespectful treatment toward...
2,this is a human issue on coming together to co...
3,tune in tomorrow on world story telling day fo...
4,during my first week i signed a memorandum to ...
5,this week has been difficult for many of us i ...
6,11 years after the affordable care act was pas...
7,if you are surprised by the anti asian violenc...
8,ichi we hosted include2021 and brought togethe...
9,100 million shots in arms hugs are coming


In [17]:
home_timeline_tweets['tweet_text'][14]

'what is going on 250 miles above earth go for a spacewalk completes the asian herbs in space investigation and s tournament earth begins get the details in space to ground '