## Acquire and Analyze

Utilizing the Twitter accounts shared in the dataset, I scraped the data a second time, this time looking for tweets that include the hashtag #thornsfc. I will then place these tweets into a dataframe to analyze their information in Python (while also allowing them to be examined in SQL) and finally save them to their own .csv file. Using user location, I will build a word cloud that depicts the locations from which users are tweeting about #thornsfc. It will be interesting to see whether the majority of users are tweeting from Portland, OR, or Oregon in general, as this is where the Thorns FC soccer team is located, or whether users are coming from various places across the Northwest region, or even whether the team is being tweeted about in rival cities such as Orlando, Seattle, and Los Angeles.

In [None]:
import datetime
import tweepy
import os
import pandas as pd
import matplotlib.pyplot as plt 
from wordcloud import WordCloud, STOPWORDS 

from My_API_Keys import api_key, api_key_secret, access_token, access_token_secret

In [None]:
# Build the OAuth handler from the consumer key pair, attach the account's
# access token, and create the API client used by the rest of the notebook.
auth = tweepy.OAuthHandler(
    consumer_key=api_key,
    consumer_secret=api_key_secret,
)
auth.set_access_token(access_token, access_token_secret)

# wait_on_rate_limit=True asks Tweepy to wait when a rate limit is reached
api = tweepy.API(auth, wait_on_rate_limit=True)

In [None]:
# Earliest tweet date to request from the search endpoint
date_since = "2019-01-01"

# Search for the hashtag and exclude retweets. Search operators are
# space-separated, so the exclusion filter needs a leading space; without it
# the operator is fused onto the hashtag text ("#ThornsFC-filter:retweets").
search_words = "#ThornsFC" + " -filter:retweets"

In [None]:
# Keep the search cursor in a variable so later cells can pull the data from
# it and shape the output as needed.
search_options = {
    # tweet_mode defaults to the short form, which only holds the first
    # 140 characters of a Tweet; 'extended' exposes the full text.
    'tweet_mode': 'extended',
    'q': search_words,
    'lang': 'en',
    'since': date_since,
}

tweets = tweepy.Cursor(api.search, **search_options).items()

In [None]:
# Collect one [date, user, location, tweet] row per status returned by the search

user_info = []
for status in tweets:
    user_info.append([
        status.created_at,
        status.user.screen_name,
        status.user.location,
        status.full_text,
    ])

In [None]:
# Load the collected rows into a DataFrame so they can be analyzed later
# in Python or SQL.

tweet_text = pd.DataFrame(
    user_info,
    columns=['Date', 'User', 'Location', 'Tweet'],
)

In [None]:
# Display the DataFrame to inspect the scraped tweets
tweet_text

In [None]:
# Save the tweets to thorn_tweets.csv so they can be reloaded later
tweet_text.to_csv('thorn_tweets.csv')

In [None]:
# Count how often each Location value occurs and show the first 30 entries
tweet_text['Location'].value_counts()[:30]

In [None]:
# Read the saved tweets back in.
# DataFrame.to_csv writes UTF-8 by default, so the file must be decoded as
# UTF-8; decoding it as latin-1 garbles every non-ASCII character (accents,
# emoji) in the locations and tweet text.
df = pd.read_csv('thorn_tweets.csv', encoding='utf-8')

# Text accumulator for the word cloud, and the set of words to exclude from it
# (left empty so that no words are filtered out)
comment_words = ''
stopwords = set()

In [None]:
# Build the word-cloud text from the user locations.
# Iterating a DataFrame directly (`for val in df`) yields its column labels,
# not its rows, so the Location column has to be selected explicitly.
# Rows without a location are dropped so they don't appear as the word "nan".
locations = df['Location'].dropna().astype(str)

# Lowercase and split every location into whitespace-separated tokens
tokens = []
for val in locations:
    tokens.extend(val.lower().split())

comment_words += " ".join(tokens) + " "

# Generate the cloud once, after all of the text has been collected
wordcloud = WordCloud(width=800, height=800,
                      background_color='white',
                      stopwords=stopwords,
                      min_font_size=10).generate(comment_words)

In [None]:
# Render the generated word cloud on a square figure with the axes hidden
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")

# Remove the padding around the image before showing it
plt.tight_layout(pad=0)
plt.show()

### Grab Followers IDs - from twitter API assignment 

In [None]:
# The handles are kept in a list so they can be iterated through below
team_handles = ['ThornsFC', 'ORLPride']


# Look up each handle and estimate how long collecting its follower IDs will take
for screen_name in team_handles:

    # Fetch the account's profile and read its follower count
    user = api.get_user(screen_name)
    followers_count = user.followers_count

    # The estimate assumes roughly one page of 5,000 IDs per minute.
    # Split that total into whole hours plus the remaining minutes, so the
    # two numbers in the message add up to the estimate instead of each
    # restating the full duration in a different unit.
    total_minutes = followers_count / 5000
    hours, minutes = divmod(total_minutes, 60)

    print(f'''
    @{screen_name} has {followers_count} followers. 
    That will take roughly {hours:.0f} hours and {minutes:.2f} minutes
    ''')

In [None]:
# One (initially empty) list of follower IDs per Twitter handle
id_dict = {handle: [] for handle in ('ThornsFC', 'ORLPride')}

# Record when the API requests begin
start_time = datetime.datetime.now()

for handle in id_dict:

    # Each page holds 5,000 records and both teams have far more followers
    # than that, so the follower IDs are pulled page by page via followers_ids.
    # The wait_on_rate_limit options are how we get around not being able to
    # grab every ID at once: when the rate limit is hit we are notified and
    # wait out the 15 minute (900 second) window.
    follower_pages = tweepy.Cursor(
        api.followers_ids,
        wait_on_rate_limit=True,
        wait_on_rate_limit_notify=True,
        compression=True,
        screen_name=handle,
    ).pages()

    for page in follower_pages:
        # A page is a list of IDs, so merge it in rather than nesting it
        id_dict[handle] += page


# Report how long collecting all of the follower IDs took
end_time = datetime.datetime.now()
elapsed_time = end_time - start_time
print(elapsed_time)

In [None]:
# Peek at the first 10 follower IDs collected for ThornsFC
id_dict['ThornsFC'][:10]

In [None]:
# Resolve the first 10 ORLPride follower IDs to screen names as a sanity check
users = id_dict['ORLPride'][:10]

for follower_id in users:
    print(api.get_user(follower_id).screen_name)

# Grab descriptions based on the followers IDs

In [None]:
headers = ['screen_name', 'name', 'location', 'followers_count', 'friends_count', 'description']


def _clean_field(value):
    """Return *value* as a string with tabs and line breaks replaced by spaces."""
    return str(value).replace('\t', ' ').replace('\r', ' ').replace('\n', ' ')


# Write one tab-separated file of follower details per team
for team in id_dict:

    # Descriptions with emoji or non-Roman letters can cause trouble, so the
    # .txt file is encoded as utf-8
    with open(f'{team}_followers.txt', 'w', encoding='utf-8') as out_file:
        out_file.write('\t'.join(headers) + '\n')

        for follower_id in id_dict[team]:

            # Some lookups fail (the original note mentions private accounts);
            # those followers are skipped so one bad ID does not stop the run.
            # Catching Exception rather than using a bare except keeps Ctrl-C
            # working during this long loop.
            try:
                user = api.get_user(follower_id)
            except Exception:
                continue

            outline = [user.screen_name, user.name, user.location,
                       user.followers_count, user.friends_count, user.description]

            # Every field is cleaned before writing: a tab or line break in
            # free text such as the description would otherwise split the
            # record across columns or rows.
            out_file.write('\t'.join(_clean_field(item) for item in outline) + '\n')