# Data Scraping

In [1]:
import html
import os
import random
import re
import string

import nltk
import pandas as pd
import tweepy as tw
from autocorrect import Speller
from dotenv import load_dotenv
from nltk.corpus import stopwords

In [2]:
# For more information about Twitter API credentials please read README.md file
# Get credentials from .env file

load_dotenv()
API_KEY = os.getenv('API_KEY')
API_SECRET_KEY = os.getenv('API_SECRET_KEY')
ACCESS_TOKEN = os.getenv('ACCESS_TOKEN')
ACCESS_TOKEN_SECRET = os.getenv('ACCESS_TOKEN_SECRET')

# os.getenv returns None for variables that are not set. Stop here with a
# readable message instead of handing None to the OAuth handler and getting an
# opaque authentication error on the first API call.
_missing_credentials = [
    name
    for name, value in (
        ('API_KEY', API_KEY),
        ('API_SECRET_KEY', API_SECRET_KEY),
        ('ACCESS_TOKEN', ACCESS_TOKEN),
        ('ACCESS_TOKEN_SECRET', ACCESS_TOKEN_SECRET),
    )
    if not value
]
if _missing_credentials:
    raise RuntimeError(
        'Missing Twitter API credentials: '
        + ', '.join(_missing_credentials)
        + '. Define them in the .env file (see README.md).'
    )


# authenticate
auth = tw.OAuthHandler(API_KEY, API_SECRET_KEY)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
# Let the client wait (and report that it is waiting) when a rate limit is hit.
api = tw.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [3]:
# Fetch up to 200 tweets from the authenticated user's home timeline.
# tweet_mode='extended' makes every returned status carry `full_text`, so a
# separate get_status request per tweet is not needed.
timeline = api.home_timeline(count=200, tweet_mode='extended')

# Collect the text of every tweet.
tweets = [tweet.full_text for tweet in timeline]

# Build the frame once, after the loop: it then also exists (empty) when the
# timeline returns no tweets, and is not rebuilt on every iteration.
home_timeline_tweets = pd.DataFrame(tweets, columns=['tweet_text'])

home_timeline_tweets.head()

Unnamed: 0,tweet_text
0,Americans should never have to fear discrimina...
1,"Eleven years ago, the Affordable Care Act beca..."
2,One year ago we had to close the West Seattle ...
3,RT @erikashimizu: I'm building a referral list...
4,Programmed 3 hours of experimental movies for ...


In [4]:
# Show the raw text of the tweet at row label 14.
home_timeline_tweets.loc[14, 'tweet_text']

'RT @ehimeora: You took care of everybody else, it’s now time to take care of yourself too.'

# Data Cleaning

In [5]:
# English stop-word list, read through the NLTK corpus reader imported above
stopword = stopwords.words('english')

# Function to clean and normalise the tweet text

# Ordered (pattern, replacement) rules applied by clean_text. Order matters:
# links and the retweet marker are removed while the punctuation that
# identifies them is still present, and curly apostrophes are normalised
# before the contraction rules look for a straight apostrophe.
_TWEET_REPLACEMENTS = [
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"https?://\S+", ""),            # Remove hyperlinks
        (r"^RT\s+", ""),                  # Remove old style retweet text "RT"
        (r"\n", " "),                     # A newline separates words: keep a space
        (r"@\w+", ""),                    # Remove mentions starting with @
        (r"#", ""),                       # Remove the hash # sign
        (r"’", "'"),                      # Replace ’ with '
        (r"(?i)\bcan't\b", "can not"),    # Irregular contractions must come
        (r"(?i)\bwon't\b", "will not"),   # before the generic n't rule
        (r"'s", " is"),                   # Replace the contractions
        (r"n't", " not"),
        (r"'m", " am"),
        (r"'ll", " will"),
        (r"'d", " would"),
        (r"'ve", " have"),
        (r"'re", " are"),
        (r"\W+", " "),                    # Replace non-word characters with a space
    )
]

# Splits before every Capitalised run, e.g. "BlackLivesMatter" -> Black/Lives/Matter
_CAMEL_CASE_SPLIT = re.compile(r"([A-Z][a-z]+[^A-Z]*)")
_WHITESPACE_RUN = re.compile(r"\s+")

# Shared spell checker, created on first use by _get_speller().
_SPELLER = None


def _get_speller():
    """Return the shared English Speller, constructing it only once."""
    global _SPELLER
    if _SPELLER is None:
        _SPELLER = Speller(lang='en')
    return _SPELLER


def clean_text(text, spell=None):
    """Normalise one tweet into lower-case, punctuation-free text.

    Steps, in order: unescape HTML entities; remove links, the leading "RT"
    marker, mentions and '#' signs; expand common English contractions;
    replace every run of non-word characters with a single space; split
    CamelCase words; collapse whitespace, strip both ends and lower-case the
    result; finally run the text through a spell checker.

    Known limitation: every "'s" is expanded to " is", so possessives such as
    "John's" become "John is".

    Parameters
    ----------
    text : str
        Raw tweet text. Empty or non-string values (e.g. None, NaN) yield "".
    spell : callable, optional
        Function mapping a string to its spell-corrected form. When omitted,
        a shared ``Speller(lang='en')`` is used.

    Returns
    -------
    str
        The cleaned text, without leading or trailing whitespace.
    """
    if not isinstance(text, str) or not text:
        return ""

    # Clear out HTML characters
    text = html.unescape(text)

    # Replace or remove the characters
    for pattern, replacement in _TWEET_REPLACEMENTS:
        text = pattern.sub(replacement, text)

    # Separate the words
    text = " ".join(piece for piece in _CAMEL_CASE_SPLIT.split(text) if piece)

    # Collapse whitespace runs, trim both ends and convert to lower case
    text = _WHITESPACE_RUN.sub(" ", text).strip().lower()

    # Nothing left to spell check (e.g. the tweet was only a link)
    if not text:
        return text

    # Spell check
    if spell is None:
        spell = _get_speller()
    return spell(text)


# Clean every tweet; the function is passed directly, no lambda wrapper needed
home_timeline_tweets['tweet_text'] = home_timeline_tweets['tweet_text'].apply(clean_text)

# Preview the cleaned dataframe
home_timeline_tweets.head()

Unnamed: 0,tweet_text
0,americans should never have to fear discrimina...
1,eleven years ago the affordable care act becam...
2,one year ago we had to close the west seattle ...
3,i am building a referral list of vetted read n...
4,programmed 3 hours of experimental movies for ...


In [6]:
# Show the cleaned text of the tweet at row label 14.
home_timeline_tweets.loc[14, 'tweet_text']

'you took care of everybody else it is now time to take care of yourself too '

In [7]:
# Preview the first ten cleaned tweets.
home_timeline_tweets.head(n=10)

Unnamed: 0,tweet_text
0,americans should never have to fear discrimina...
1,eleven years ago the affordable care act becam...
2,one year ago we had to close the west seattle ...
3,i am building a referral list of vetted read n...
4,programmed 3 hours of experimental movies for ...
5,our ingenuity mars helicopter has to meet a se...
6,compliments resonate with me because i am 22 a...
7,remember when i got that one dm request callin...
8,but i am doing this i am all in something good...
9,i spoke to the beauty brand consultant she was...


In [8]:
# Write the cleaned tweets to disk, leaving out the index column.
home_timeline_tweets.to_csv(
    path_or_buf='home_timeline_tweets.csv',
    index=False,
)