# Getting Datasets

In [None]:
# Dependencies
import tweepy
import random
import time
import json
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pprint import pprint

# Twitter API Keys
from config import (consumer_key, 
                    consumer_secret, 
                    access_token, 
                    access_token_secret)
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Authenticate against the Twitter API with the OAuth credentials imported from
# config. Responses go through tweepy's JSON parser, which is why the cells below
# index them like dictionaries (e.g. normals['statuses']).
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
json_parser = tweepy.parsers.JSONParser()
api = tweepy.API(auth, parser=json_parser)

###

# Search recent English tweets and collect the screen names of exactly 500
# distinct 'normal' Twitter users.

# Number of distinct screen names to collect before stopping
target_normal_users = 500

normal_tweeters = []
# Mirror of normal_tweeters used only for O(1) duplicate checks; the list itself
# keeps the order in which users were discovered.
seen_normal_tweeters = set()

# The bound is strict (<): the previous `<= 500` kept requesting tweets once the
# list already held 500 names and could finish with up to a full extra batch.
while len(normal_tweeters) < target_normal_users:
    normals = api.search('today',count=100,lang='en')
    normal_tweets = normals['statuses']
    for status in normal_tweets:
        author = status['user']
        screen_name = author['screen_name']
        if screen_name in seen_normal_tweeters:
            continue
        # 'Normal' user filter: between 15 and 1500 followers and between 100 and
        # 20000 statuses (all bounds exclusive).
        if 15 < author['followers_count'] < 1500 and\
        100 < author['statuses_count'] < 20000:
            seen_normal_tweeters.add(screen_name)
            normal_tweeters.append(screen_name)
            # Stop mid-batch so the list never exceeds the target size
            if len(normal_tweeters) == target_normal_users:
                break
    # Pause between requests only when another request is actually needed.
    # NOTE(review): presumably the 30 s wait lets new tweets arrive and keeps the
    # loop within the search rate limit - confirm.
    if len(normal_tweeters) < target_normal_users:
        time.sleep(30)

# convert to pandas DF, save to CSV
normal_users_df = pd.DataFrame({
    'Screen Name':normal_tweeters
})
normal_users_df.to_csv('normal_user_names.csv')

###

# Draw 100 of the collected screen names at random and record three profile
# counters for each one. The three lists stay aligned with test_users by position.
num_followers = []
num_following = []
statuses_count = []
test_users = random.sample(normal_tweeters,100)

# Pair every output list with the profile field that feeds it
profile_fields = (
    (num_followers, 'followers_count'),
    (num_following, 'friends_count'),
    (statuses_count, 'statuses_count'),
)

for tweeter in test_users:
    # One profile lookup per sampled user
    profile = api.get_user(tweeter)
    for values, field in profile_fields:
        values.append(profile[field])
    
###

# Assemble the sampled users and their profile counters into one table
# (one row per sampled user) and write it to CSV.
summary_columns = {
    'Screen Name': test_users,
    'Number of Followers': num_followers,
    'Number Following': num_following,
    'Number of Statuses': statuses_count,
}
normal_summ = pd.DataFrame(summary_columns)
normal_summ.to_csv('normal_user_summary.csv')

###

# ------ Get data for our 'fitness users' ------
# Target Fitness Hashtags
# (every entry needs its own trailing comma: two adjacent string literals are
# silently concatenated, which previously merged "#fit" and "#bodybuilding"
# into the single bogus tag "#fit#bodybuilding")
target_tags = ["#nopainnogain", "#cardio", "#cycling", "#fitspo", "#exercise", "#gym",
               "#fitfam", "#fitlife", "#fitness", "#fitnessaddict", "#gymlife", "#gymrat",
               "#gymtime", "#sweat", "#workout", "#marathon", "#runners", "#fit",
               "#bodybuilding", "#beachbody", "#motivation", "#justdoit", "#TrainHard", "#GetFit"]

# Maps screen name -> number of matching tweets found across all hashtags
fitness_user_accounts = {}

# "Real Person" Filters
min_tweets = 100
max_tweets = 20000
max_followers = 1500
min_following = 50
lang = "en"

# Number of result pages (up to 100 tweets each) requested per hashtag
pages_per_tag = 7

# Loop through the hashtags
for tag in target_tags:

    # Upper bound on tweet ids for the next request; None asks for the newest tweets
    oldest_tweet = None

    # Walk backwards through the search results one page at a time
    for _ in range(pages_per_tag):

        public_tweets = api.search(tag, count=100, result_type="recent", max_id=oldest_tweet)
        statuses = public_tweets["statuses"]

        # Nothing (more) to read for this hashtag
        if not statuses:
            break

        # Loop through all tweets on this page
        for tweet in statuses:

            author = tweet["user"]

            # Find the screen name
            user = author["screen_name"]

            # The description can be null on the API side; treat that as empty
            # text so the substring test below cannot raise a TypeError.
            description = author["description"] or ""

            # Define whether user is a 'real' person or not
            is_real_person = (
                author["lang"] == lang and
                "gym" not in description and
                author["followers_count"] < max_followers and
                min_tweets < author["statuses_count"] < max_tweets and
                author["friends_count"] > min_following)

            if is_real_person:
                # First sighting starts the count at 1, later ones add 1
                fitness_user_accounts[user] = fitness_user_accounts.get(user, 0) + 1

        # Advance the cursor below the oldest tweet just seen. Without this every
        # request returned the same page, so each matching tweet was counted
        # pages_per_tag times.
        oldest_tweet = min(status["id"] for status in statuses) - 1

print(fitness_user_accounts)

###
# Convert the {screen name: tweet count} dictionary into a Series ordered from
# the most to the least frequently seen user.
# sort_values returns a new Series rather than sorting in place, so its result
# must be kept; previously it was discarded and the CSV was written unsorted.
fitness_user_accounts_pd = pd.Series(fitness_user_accounts).sort_values(ascending=False)

# Export the list

fitness_user_accounts_pd.to_csv("fitness_users2.csv", encoding='utf-8')


# Reading in Fitness User Group 

In [None]:
# Dependencies
import tweepy
import json
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from config import consumer_key, consumer_secret, access_token, access_token_secret
from pprint import pprint
# Import and Initialize Sentiment Analyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
from pprint import pprint

####

# Authenticate against the Twitter API with the OAuth credentials imported from
# config. Responses go through tweepy's JSON parser, which is why the timeline
# loop below reads tweets with dictionary keys such as tweet["text"].
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
json_parser = tweepy.parsers.JSONParser()
api = tweepy.API(auth, parser=json_parser)

###

# Load the merged CSV that holds both user groups, one column per group
file = "datasets/merged_lists_032118.csv"
df = pd.read_csv(file)
df.head()

# Pull each column out as a plain Python list
normal_users, fitness_users = (
    df[column].tolist() for column in ("normal_users", "fitness_users")
)

#########
# Accounts whose timelines will be fetched (same list objects as above)
targetnormal_users, targetfitness_users = normal_users, fitness_users

# Parallel accumulators, one entry per collected tweet:
# account name, tweet text and raw creation timestamp ...
user_acct, tweet_txt, tweet_dt = [], [], []
# ... plus the four VADER scores
compound_list, positive_list, negative_list, neutral_list = [], [], [], []

########

# Fetch one page of timeline tweets for every fitness user and score each tweet
# with VADER, appending the results to the parallel accumulator lists.
score_keys = ("compound", "pos", "neu", "neg")

for target in targetfitness_users:

    try:
        # No count is passed, so the API's default page size applies
        # (the original notes put this at 20 tweets).
        public_tweets = api.user_timeline(target)

        for tweet in public_tweets:
            text = tweet["text"]

            # Run Vader Analysis and pull out the four scores in one go
            results = analyzer.polarity_scores(text)
            compound, pos, neu, neg = (results[key] for key in score_keys)

            # Tweet metadata
            user_acct.append(target)
            tweet_txt.append(text)
            tweet_dt.append(tweet["created_at"])

            # Sentiment scores
            compound_list.append(compound)
            positive_list.append(pos)
            negative_list.append(neg)
            neutral_list.append(neu)

    except tweepy.error.TweepError:
        # Any tweepy error for this account is reported as "not found";
        # the loop then simply moves on to the next account.
        print(f'Sorry, {target} not found. Skipping.')


In [None]:
# Parse every collected "created_at" string into a datetime object.
# The format spells out: weekday, month, day, time, UTC offset (%z), year.
timestamp_format = "%a %b %d %H:%M:%S %z %Y"
converted_timestamps = [datetime.strptime(stamp, timestamp_format) for stamp in tweet_dt]

# Print the count so it can be compared with the number of collected tweets
print(len(converted_timestamps))

In [None]:
# Combine the parallel per-tweet lists into one table, one row per tweet
fit_columns = {
    "Account": user_acct,
    "Tweet Text": tweet_txt,
    "Date": converted_timestamps,
    "Compound": compound_list,
    "Positive": positive_list,
    "Negative": negative_list,
    "Neutral": neutral_list,
}
df_fit = pd.DataFrame(fit_columns)
df_fit.head()

# Select the columns in their final presentation order
column_order = ['Account', 'Date', 'Tweet Text', 'Compound', 'Positive', 'Neutral', 'Negative']
df_fit2 = df_fit[column_order]
df_fit2.head()

In [None]:
# Write the scored fitness tweets to disk; index=False leaves the row index out of the file
fit_tweets_path = "datasets/fit_tweets.csv"
df_fit2.to_csv(fit_tweets_path, index=False, encoding='utf-8')

# Reading in Normal User Group 

In [None]:
# Dependencies
import tweepy
import json
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt
from config import consumer_key, consumer_secret, access_token, access_token_secret
from pprint import pprint
# Import and Initialize Sentiment Analyzer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
from pprint import pprint

# Authenticate against the Twitter API with the OAuth credentials imported from
# config. Responses go through tweepy's JSON parser, which is why the timeline
# loop below reads tweets with dictionary keys such as tweet["text"].
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
json_parser = tweepy.parsers.JSONParser()
api = tweepy.API(auth, parser=json_parser)

# Load the merged CSV that holds both user groups, one column per group
file = "datasets/merged_lists_032118.csv"
df = pd.read_csv(file)
df.head()

# Pull each column out as a plain Python list
normal_users, fitness_users = (
    df[column].tolist() for column in ("normal_users", "fitness_users")
)

# Accounts whose timelines will be fetched (same list objects as above)
targetnormal_users, targetfitness_users = normal_users, fitness_users

# Parallel accumulators, one entry per collected tweet:
# account name, tweet text and raw creation timestamp ...
user_acct, tweet_txt, tweet_dt = [], [], []
# ... plus the four VADER scores
compound_list, positive_list, negative_list, neutral_list = [], [], [], []



In [None]:
# Fetch one page of timeline tweets for every normal user and score each tweet
# with VADER, appending the results to the parallel accumulator lists.
score_keys = ("compound", "pos", "neu", "neg")

for target in targetnormal_users:

    try:
        # No count is passed, so the API's default page size applies
        # (the original notes put this at 20 tweets).
        public_tweets = api.user_timeline(target)

        for tweet in public_tweets:
            text = tweet["text"]

            # Run Vader Analysis and pull out the four scores in one go
            results = analyzer.polarity_scores(text)
            compound, pos, neu, neg = (results[key] for key in score_keys)

            # Tweet metadata
            user_acct.append(target)
            tweet_txt.append(text)
            tweet_dt.append(tweet["created_at"])

            # Sentiment scores
            compound_list.append(compound)
            positive_list.append(pos)
            negative_list.append(neg)
            neutral_list.append(neu)

    except tweepy.error.TweepError:
        # Any tweepy error for this account is reported as "not found";
        # the loop then simply moves on to the next account.
        print(f'Sorry, {target} not found. Skipping.')

In [None]:
# Parse every collected "created_at" string into a datetime object.
# The format spells out: weekday, month, day, time, UTC offset (%z), year.
timestamp_format = "%a %b %d %H:%M:%S %z %Y"
converted_timestamps = [datetime.strptime(stamp, timestamp_format) for stamp in tweet_dt]

# Print the count so it can be compared with the number of collected tweets
print(len(converted_timestamps))

In [None]:
# Combine the parallel per-tweet lists into one table, one row per tweet
norm_columns = {
    "Account": user_acct,
    "Tweet Text": tweet_txt,
    "Date": converted_timestamps,
    "Compound": compound_list,
    "Positive": positive_list,
    "Negative": negative_list,
    "Neutral": neutral_list,
}
df_norm = pd.DataFrame(norm_columns)
df_norm.head()

# Select the columns in their final presentation order
column_order = ['Account', 'Date', 'Tweet Text', 'Compound', 'Positive', 'Neutral', 'Negative']
df_norm2 = df_norm[column_order]
df_norm2.head()

# Write the scored tweets to disk; index=False leaves the row index out of the file
norm_tweets_path = "datasets/norm_tweets1.csv"
df_norm2.to_csv(norm_tweets_path, encoding='utf-8', index=False)