In [1]:
import tweepy
# For sending GET requests from the API
import requests
# For saving access tokens and for file management when creating and adding to the dataset
import os
# For dealing with json responses received from the API
import json
# For displaying and managing the data
import pandas as pd
# For saving the data into a CSV format
import csv
# For parsing the dates received from Twitter into readable formats
import datetime as dt
import dateutil.parser
import unicodedata
# To add wait time between requests
import time

## My Authentication

In [2]:
my_creds = {}
with open("Exploration.txt") as file:
    for line in file:
        pieces = line.split(":")
        # print(pieces)
        my_creds[pieces[0].strip()] = pieces[1].strip()

In [None]:
my_creds

In [4]:
# Authentication method for the v2 Twitter API
my_client = tweepy.Client(my_creds["Bearer_Token"])

In [5]:
print(my_creds["API_Key"])
my_creds["API_Key_Secret"]

6Rhk1YEBmcvjJMsHOA4JbfD8u


'2HWrFuWs7CqSYDiVRR7N6K32u0Gl0IE4rU52lkS5l1lrOwzrDb'

In [6]:
# Authentication via API key and API key secret
auth = tweepy.OAuthHandler(my_creds["API_Key"], 
                           my_creds["API_Key_Secret"])

In [7]:
auth.set_access_token(my_creds["Access_Token"], 
                      my_creds["Access_Token_Secret"])

In [8]:
my_api = tweepy.API(auth)

## C4 Authentication

In [9]:
c4_creds = {}
with open("local-api-key.txt") as file:
    for line in file:
        pieces = line.split(":")
        # print(pieces)
        c4_creds[pieces[0].strip()] = pieces[1].strip()

In [None]:
c4_creds

In [11]:
# Authentication method for the v2 Twitter API
c4_client = tweepy.Client(c4_creds["Bearer"])

## Testing `search_all_tweets()`

In [12]:
handle_ids = {# "crooksandliars":14513611, 
              "CNN":759251} #, 
              # "NPR":5392522, 
              # "reason":16467567, 
              # "OANN":1209936918}

In [13]:
handle_ids.items()

dict_items([('CNN', 759251)])

In [14]:
for account, id in handle_ids.items():
    print(account)
    print(id)

CNN
759251


In [15]:
# I can use `.get()` to return the value associated with a given key
type(handle_ids.get("CNN"))

int

In [16]:
c4_client.get_user(username = "vets_v")

Response(data=<User id=112426341 name=Jimmy_hammer username=vets_v>, includes={}, errors=[], meta={})

In [17]:
my_client.get_user(username = "vets_v")

Response(data=<User id=112426341 name=Jimmy_hammer username=vets_v>, includes={}, errors=[], meta={})

In [None]:
test_query = 'from:vets_v'
start = dt.datetime.fromisoformat("2006-03-21 00:00:00")

## Current Attempts

In [18]:
# check what's inside of the handle_ids dictionary
# do this EVERY TIME before running the following cells
handle_ids

{'CNN': 759251}

In [19]:
# extract the candidate users as a list of dictionaries
# and label that list of dictionaries 
for account, id in handle_ids.items():
    done = False
    users_file = f"./{account}.csv"
    with open(users_file, encoding = "utf-8") as candidates_file:
        reader = csv.DictReader(candidates_file)
        users = [u for u in reader]

In [22]:
users[1:5]

[{'id': '190612142',
  'tweet_count': '16231',
  'created_at': '09/14/2010 11:57',
  'name': 'Abdullah Alkadi',
  'username': 'a__alkadi',
  'location': 'تميم هامة مُضر '},
 {'id': '112426341',
  'tweet_count': '5020',
  'created_at': '02/08/2010 13:13',
  'name': 'Jimmy_hammer',
  'username': 'vets_v',
  'location': 'United States of America'},
 {'id': '115079948',
  'tweet_count': '4162',
  'created_at': '02/17/2010 15:04',
  'name': 'Monsieur Sánchez',
  'username': 'raz_sanchez',
  'location': ''},
 {'id': '115879528',
  'tweet_count': '3267',
  'created_at': '02/20/2010 08:32',
  'name': 'Priyansh Narang',
  'username': 'dapperprogramer',
  'location': ''}]

## User ID Extraction

In [20]:
users[0]["id"]

'153487532'

In [None]:
# extract just the user ids from users and save user ids as a list
user_ids = []
# note that for CNN I am skipping the user at index 0 
# because they are not a valid candidate user
for i in range(1, len(users)):
    user_ids.append(users[i]["id"])

In [None]:
user_ids

## Username Extraction

In [None]:
# extract just the usernames from users and save usernames as a list
usernames = []
# note that for CNN I am skipping the user at index 0 
# because they are not a valid candidate user
for i in range(1, len(users)):
    usernames.append(users[i]["username"])

In [None]:
usernames

## User ID and Username Extraction

In [38]:
# define empty dictionary for key-value pairs
user_identifiers = {}
# extract pair of username and user id
for i in range(1, len(users)):
    user_identifiers[users[i]["username"]] = users[i]["id"]

In [None]:
# check items inside of user_identifiers
# user_identifiers.items()

In [37]:
list(user_identifiers.keys())

['a__alkadi', 'vets_v', 'raz_sanchez', 'dapperprogramer']

In [40]:
usernames = list(user_identifiers.keys())

In [41]:
for i in range(5):
    print(usernames[i])

a__alkadi
vets_v
raz_sanchez
dapperprogramer
Pussayy_Loverr


In [43]:
usernames[1]

'vets_v'

In [44]:
user_identifiers["vets_v"]

'112426341'

In [42]:
user_identifiers[usernames[1]]

'112426341'

## Data Sampling and Language Testing Loop

This section should have everything that is actually needed for validating that a potential candidate user is English speaking. 

In [None]:
temp_list = [user_ids[2]]
temp_list

In [None]:
type(temp_list)

In [57]:
import math 

backoff = 15

usernames = list(user_identifiers.keys())

starting_index = 
ending_index = 

# loop through and sample 100 tweets per candidate user
# create a dictionary to store the language testing results in
for ui in range(len(user_identifiers)):
    data = []
    done = False
    error_count = 0
    print("Iteration started. Currently on user " + str(ui) + " of " + str(len(user_identifiers)) + ".")
    while not done:
        error = False
        try:
            sample_response = my_api.user_timeline(screen_name = usernames[ui], 
                                                   count = 100, 
                                                   tweet_mode = "extended")
        except Exception as e:
            print(f"Error: {e}")
            error = True
        if error: # or (len(sample_response.errors) > 0 and sample_response.meta["result_count"] == 0):
            # print(f"{sample_response.errors[0]} and {len(sample_response)-1} others")
            # presume rate limit
            # retry three times with incremental backoff
            if error_count == 3: 
                print("There are too many errors. Bailing.")
                done = True
                break
            error_count += 1
            print(f"Sleep for {(backoff ** error_count)/60} minutes.")
            time.sleep(backoff ** error_count)
        else:
            error_count = 0
            # save the tweets from the API response to a list
            for tweet in sample_response:
                data.append([tweet.user.screen_name, tweet.full_text])
            # create a dataframe containing the sample tweets
            candidate_df = pd.DataFrame(data, columns = ["User", "Tweet"])
            # write the dataframe to a csv file where the name of the file is the username 
            # of the candidate user 
            candidate_df.to_csv(f"./{user_identifiers[usernames[ui]]}_CNN.csv", encoding = "utf-8", index = False)
            print(f"./{user_identifiers[usernames[ui]]}_CNN.csv was created.")
            done = True
        time.sleep(5)
        print("Sleep finished. New iteration starting. Currently on user " + str(ui) + " of " + str(len(user_identifiers)) + ".")

Iteration started. Currently on user 0 of 100.
./190612142_CNN.csv was created.
Sleep finished. New iteration starting. Currently on user 0 of 100.
Iteration started. Currently on user 1 of 100.
./112426341_CNN.csv was created.
Sleep finished. New iteration starting. Currently on user 1 of 100.
Iteration started. Currently on user 2 of 100.
./115079948_CNN.csv was created.
Sleep finished. New iteration starting. Currently on user 2 of 100.
Iteration started. Currently on user 3 of 100.
./115879528_CNN.csv was created.
Sleep finished. New iteration starting. Currently on user 3 of 100.
Iteration started. Currently on user 4 of 100.
./61071795_CNN.csv was created.
Sleep finished. New iteration starting. Currently on user 4 of 100.
Iteration started. Currently on user 5 of 100.
./128626589_CNN.csv was created.
Sleep finished. New iteration starting. Currently on user 5 of 100.
Iteration started. Currently on user 6 of 100.
./197482877_CNN.csv was created.
Sleep finished. New iteration sta

Error: 401 Unauthorized
Not authorized.
Sleep for 56.25 minutes.
Sleep finished. New iteration starting. Currently on user 53 of 100.
Error: 401 Unauthorized
Not authorized.
There are too many errors. Bailing.
Iteration started. Currently on user 54 of 100.
./134117934_CNN.csv was created.
Sleep finished. New iteration starting. Currently on user 54 of 100.
Iteration started. Currently on user 55 of 100.
./378727617_CNN.csv was created.
Sleep finished. New iteration starting. Currently on user 55 of 100.
Iteration started. Currently on user 56 of 100.
./34282785_CNN.csv was created.
Sleep finished. New iteration starting. Currently on user 56 of 100.
Iteration started. Currently on user 57 of 100.
./17903266_CNN.csv was created.
Sleep finished. New iteration starting. Currently on user 57 of 100.
Iteration started. Currently on user 58 of 100.
./453081155_CNN.csv was created.
Sleep finished. New iteration starting. Currently on user 58 of 100.
Iteration started. Currently on user 59 of

./262884497_CNN.csv was created.
Sleep finished. New iteration starting. Currently on user 105 of 100.
Iteration started. Currently on user 106 of 100.
./42341049_CNN.csv was created.


KeyboardInterrupt: 

CANDIDATE USER 215459654 WAS SKIPPED WHEN THE HOUR LONG SLEEP FINALLY FINISHED AND THE LOOP BAILED BECAUSE OF TOO MANY ERRORS.

USER 253780844 IS THE NEXT USER THAT NEEDS TO BE SAMPLED IN THE CNN CANDIDATE USERS FILE. 

In [None]:
# this is the chunk for reading in a given users sample tweets and testing for English
results = {}
candidate_csv = pd.read_csv(f"./{user_identifiers[usernames[1]]}.csv", encoding = "utf-8")
result = []
tweet_list = candidate_csv.Tweet.tolist()
for tweet in tweet_list:
    tweet = re.sub("\n+", " ", tweet)
    result.append(detect_en(tweet))
results[username] = sum(result)/100

In [None]:
results

In [None]:
# import libraries for English language testing
import sys
import numpy as np
import scipy
import re
import pybind11
import fasttext

In [None]:
# import model for English language testing
model = fasttext.load_model("lid.176.bin")

In [None]:
# define a function to run the language testing model 
def detect_en(x):
    try:
        return model.predict(x)[0][0] == "__label__en"
    except:
        return False

In [None]:
results = {}
result = []
test_user = pd.read_csv("vets_v.csv")

In [None]:
tweet_list = test_user.Tweet.tolist()
for tweet in tweet_list:
    tweet = re.sub("\n+", " ", tweet)
    result.append(detect_en(tweet))
results["vets_v"] = sum(result)/100

In [None]:
results

## Step-by-Step Sample 100 Tweets

This section is just for reference. Small-scale, non-automated approach to pulling 100 tweets for a specific user.

In [52]:
tweets = my_api.user_timeline(screen_name = "vets_v", 
                          count = 100, 
                          tweet_mode = "extended")

In [54]:
type(tweets)

tweepy.models.ResultSet

In [None]:
for tweet in tweets:
    print(tweet.full_text)

In [None]:
# create a dataframe to store the tweets in
columns = ["User", "Tweet"]
data = []
for tweet in tweets:
    data.append([tweet.user.screen_name, tweet.full_text])
    
df = pd.DataFrame(data, columns = columns)

In [None]:
print(df)