In [None]:
%fs ls /FileStore/tables/curated

In [4]:
# -*- coding: utf-8 -*-
import requests, json, datetime, boto3, itertools, tweepy
from multiprocessing.dummy import Pool as ThreadPool
from configparser import ConfigParser
import pyspark.sql.functions as F
from pyspark.sql.session import SparkSession
from pyspark.sql.types import *

In [5]:
# Obtain the Spark session for this notebook: getOrCreate() hands back the
# already-running session if there is one, otherwise it starts a new session
# under the 'twitter_crawler' application name.
spark = (
    SparkSession.builder
    .appName('twitter_crawler')
    .getOrCreate()
)

# Handle on the SparkContext that backs the session above.
sc = spark.sparkContext

In [5]:
# Twitter credentials live in an INI file next to the project rather than in
# the notebook itself, so no secret is hardcoded (or rendered) here.
config = ConfigParser()

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail right away with a clear message instead of
# letting the 'TWITTER' lookups below die with an opaque KeyError.
if not config.read('../config.ini'):
    raise FileNotFoundError(
        "Could not read '../config.ini'; it must exist and contain a "
        "[TWITTER] section with the API credentials."
    )

# twitter keys
CONSUMER_KEY = config['TWITTER']['CONSUMER_KEY']
CONSUMER_SECRET = config['TWITTER']['CONSUMER_SECRET']
ACCESS_TOKEN = config['TWITTER']['ACCESS_TOKEN']
ACCESS_SECRET = config['TWITTER']['ACCESS_SECRET']

In [7]:
# Build the OAuth handler from the application (consumer) credentials and then
# attach the access token pair to it.
auth = tweepy.OAuthHandler(
    CONSUMER_KEY,
    CONSUMER_SECRET,
)
auth.set_access_token(
    ACCESS_TOKEN,
    ACCESS_SECRET,
)

# Authenticated Tweepy client used by the rest of the notebook.
api = tweepy.API(auth)

In [11]:
def get_tweets(twitter_username):
    """Fetch the latest tweets of one account as a list of plain dictionaries.

    Parameters
    ----------
    twitter_username : str or None
        Screen name forwarded to ``api.user_timeline``. ``None`` or an empty
        string yields an empty list without calling the API.

    Returns
    -------
    list of dict
        One dictionary per tweet (retweets included) with the keys:
        ``tweet`` (full text), ``dt_creation`` (the API's ``created_at``
        value, unparsed), ``retweet_count``, ``favorite_count``,
        ``possibly_sensitive`` (``None`` when the API omits the field) and
        ``dt_extraction`` (local ``datetime`` at which this call ran, shared
        by every tweet of the batch).

    Notes
    -----
    Relies on the module-level Tweepy client ``api``. Errors raised by Tweepy
    (unknown or protected account, rate limit, ...) are not caught here.
    """
    # A missing handle must never reach the API: without a screen_name the
    # timeline endpoint answers with the *authenticated* account's tweets,
    # which would then be attributed to the wrong person.
    if not twitter_username:
        return []

    statuses = api.user_timeline(
        screen_name=twitter_username,
        count=100,              # number of tweets requested in this single call
        include_rts=True,
        tweet_mode='extended',  # needed for the untruncated 'full_text' field
    )

    # Single timestamp for the whole batch, taken once instead of per tweet.
    extracted_at = datetime.datetime.now()

    return [
        {
            'tweet': payload['full_text'],
            'dt_creation': payload['created_at'],
            'retweet_count': payload['retweet_count'],
            'favorite_count': payload['favorite_count'],
            # The API only includes this field when the tweet contains a
            # link, so it has to be read defensively.
            'possibly_sensitive': payload.get('possibly_sensitive'),
            'dt_extraction': extracted_at,
        }
        for payload in (status._json for status in statuses)
    ]

In [None]:
# Read the raw account list (the first CSV row holds the column names) and
# cache it, because it is both displayed here and transformed right below.
df_twitter_accounts = spark.read.csv(
    '/FileStore/tables/raw/twitter_accounts.csv',
    header=True,
).cache()
df_twitter_accounts.show(truncate=False)

# Split the TWITTER column on '/' and keep the fourth piece as the account
# handle, then retain only the identifying columns plus that handle.
# NOTE(review): assumes TWITTER values look like 'https://twitter.com/<handle>';
# any other shape yields a wrong or null DC_TWITTER_NICKNAME — confirm.
df_twitter_accounts = (
    df_twitter_accounts
    .withColumn('TWITTER_SPLIT', F.split(F.col('TWITTER'), '/'))
    .withColumn('DC_TWITTER_NICKNAME', F.col('TWITTER_SPLIT').getItem(3))
    .select('ID_DEPUTADO', 'NM_PARLAMENTAR', 'DC_TWITTER_NICKNAME')
)

In [None]:
# Spark schema for ONE dictionary returned by get_tweets(). ArrayType cannot be
# built without an element type, so the structure has to be spelled out; field
# names must match the dictionary keys, since that is how Spark maps a dict
# onto a struct.
TWEET_SCHEMA = StructType([
    StructField('tweet', StringType()),
    StructField('dt_creation', StringType()),        # raw 'created_at' string from the API
    StructField('retweet_count', LongType()),
    StructField('favorite_count', LongType()),
    StructField('possibly_sensitive', BooleanType()),  # null when the API omits the field
    StructField('dt_extraction', TimestampType()),
])

# get_tweets already takes exactly one argument, so it is registered directly
# instead of through a pass-through lambda.
get_tweets_udf = F.udf(get_tweets, ArrayType(TWEET_SCHEMA))

# Adds an array-of-structs column with the tweets of each account. withColumn
# is lazy: the Twitter API is only called once an action is run on the result.
politicians_twitter_df = df_twitter_accounts.withColumn('tweets', get_tweets_udf('DC_TWITTER_NICKNAME'))