# @robotodio

---

## Libraries

In [7]:
# Python libraries
# ------------------------------------------------------------------------------
# Enviroment
import os
from sinfo import sinfo

# Performance
from time import time

# Reading files with different formats
import json

# Data wrangling
import re
import pandas as pd
import numpy as np

# Data Visualitation
import seaborn as sns
import matplotlib.pyplot as plt

# Twitter API
import tweepy

# Hate speech detection
from detoxify import Detoxify

sinfo()

-----
detoxify    NA
matplotlib  3.3.2
numpy       1.19.2
pandas      1.2.0
seaborn     0.11.1
sinfo       0.3.1
tweepy      3.10.0
-----
IPython             7.19.0
jupyter_client      6.1.7
jupyter_core        4.7.0
jupyterlab          2.2.6
notebook            6.1.6
-----
Python 3.8.5 (default, Sep  3 2020, 21:29:08) [MSC v.1916 64 bit (AMD64)]
Windows-10-10.0.19041-SP0
4 logical CPU cores, Intel64 Family 6 Model 61 Stepping 4, GenuineIntel
-----
Session information updated at 2021-03-23 18:15


## Objective

The objective of this notebook is to obtain the optimal number of tweets to obtain a representative sample of data, without affecting the performance of the code.

To analyze this, the functional part of the code will be replicated within a for loop, which will be executed for different numbers of items downloaded and processed.

In [3]:
# API Twitter credentials
# ------------------------------------------------------------------------------
# Open .json file containing credentials/tokens as a dictionary
with open("../twitter_api_keys.json") as file:
    api_credentials = json.load(file)
    
# Assign each value of the dictionary to a new variable
consumer_key = api_credentials['consumer_key']
consumer_secret = api_credentials['consumer_secret']
access_token = api_credentials['access_token']
access_token_secret = api_credentials['access_token_secret']

In [4]:
# API set up
# ------------------------------------------------------------------------------
# Create a handler instance with key and secret consumer, and pass the tokens
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
    
# Construct the API instance
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# Check credentials
if(api.verify_credentials):
    print('-'*30)
    print("Logged In Successfully.")
    print('-'*30)
else:
    print("Error -- Could not log in with your credentials.")

------------------------------
Logged In Successfully.
------------------------------


In [5]:
# Tweets list iterator
# ------------------------------------------------------------------------------
def tweets_iterator(target, n_items):
    '''
    Returns an iterator of tweets.

        Parameters:
            target (str): The user name of the Twitter account.
            n_items (int): Number of tweets downloaded.

        Returns:
            tweets (ItemIterator): an iterator of tweets.
    '''
    # Instantiate the iterator
    tweets = tweepy.Cursor(
        api.user_timeline,
        screen_name=target,
        include_rts=False,
        exclude_replies=False,
        tweet_mode='extended').items(n_items)
    
    # Returns iterator
    return tweets

Once we are authenticated, and we have defined a function to download tweets, we run the loop:

In [30]:
target = 'DFD_74'

for i, n_items in enumerate([10, 50, 100, 250, 500, 750, 1000]):
    
    # Performance variables
    initial_time = time() 
    test = 'test' + str(i)
    
    # Tweet extractor
    # ------------------------------------------------------------------------------
    # Tweets list (iterator)
    tweets = tweets_iterator(target, n_items)
    # Read through the iterator, and export the info to a Pandas DataFrame
    all_columns = [np.array([
        tweet.full_text,
        tweet.user.screen_name,
        tweet.id,
        tweet.user.followers_count,
        tweet.source,
        tweet.created_at,
        tweet.lang,
        len(tweet.full_text),
        tweet.favorite_count,
        tweet.retweet_count,
        re.findall(r"#(\w+)", tweet.full_text)
    ]) for tweet in tweets]
    # Export the list of tweets to a dataframe
    df = pd.DataFrame(
        data=all_columns,
        columns=['tweet', 'account', 'id', 'followers', 'source', 'date', 'language',
                 'length', 'likes', 'RTs', 'hashtags'])
    
    
    # Data cleaning
    # ------------------------------------------------------------------------------
    # Characters to remove
    spec_chars = ['\n', '\t', '\r']
    # Replace defined characters with a whitespace
    for char in spec_chars:
        df['tweet'] = df['tweet'].str.strip().replace(char, ' ')
    # Split and re-join each tweet
    df['tweet'] = df['tweet'].str.split().str.join(" ")
    
    
    # Hate speech level prediction
    # ------------------------------------------------------------------------------
    # Returns a dictionary with toxicity values of each tweet. The key of the
    # dictionary is called toxicity.
    results = Detoxify('multilingual').predict(list(df['tweet']))
    # Add the new info to the previous DataFrame
    df['toxicity'] = results['toxicity']
    # Define a class for each tweet
    df['class'] = df['toxicity'].apply(lambda toxicity: 'toxic' if toxicity >= 0.5 else 'non-toxic')
    # Mean scoring
    scoring_average = df['toxicity'].mean()
    
    # Performance variables
    final_time = time() 
    execution_time = final_time - initial_time
    
    results_tuple = ('test' + str(n_items),
                     'scoring_average = ' + str(scoring_average),
                     'execution_time = ' + str(execution_time))
    
    print(results_tuple)

  all_columns = [np.array([


('test10', 'scoring_average = 0.27288484579185024', 'execution_time = 10.773355722427368')
('test50', 'scoring_average = 0.22765086456143763', 'execution_time = 18.008984088897705')
('test100', 'scoring_average = 0.23138363264180953', 'execution_time = 28.813783168792725')
('test250', 'scoring_average = 0.20244164427777286', 'execution_time = 59.9163134098053')
('test500', 'scoring_average = 0.20519548592192585', 'execution_time = 111.97544741630554')
('test750', 'scoring_average = 0.23731756798484518', 'execution_time = 202.84372735023499')
('test1000', 'scoring_average = 0.25159539312068957', 'execution_time = 276.5613822937012')
