# Gather data

In [1]:
import pandas as pd
from multiprocessing import Pool
import time
from twitterscraper.query import query_user_info, query_tweets_from_user

INFO: {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13'}


In [2]:
def get_user_info(twitter_user):
    """
    Look up one Twitter account and flatten its profile into a plain dict.

    :param twitter_user: screen name passed to ``query_user_info``
    :return: dictionary with the keys ``user``, ``fullname``, ``location``,
        ``blog``, ``date_joined``, ``id``, ``num_tweets``, ``following``,
        ``followers``, ``likes`` and ``lists``, each copied from the object
        returned by ``query_user_info``
    """
    profile = query_user_info(user=twitter_user)

    # Two keys differ from the attribute they are read from:
    # full_name -> "fullname" and tweets -> "num_tweets".
    return {
        "user": profile.user,
        "fullname": profile.full_name,
        "location": profile.location,
        "blog": profile.blog,
        "date_joined": profile.date_joined,
        "id": profile.id,
        "num_tweets": profile.tweets,
        "following": profile.following,
        "followers": profile.followers,
        "likes": profile.likes,
        "lists": profile.lists,
    }

In [3]:
%%time

start = time.time()
users = ['ProtasiewiczJ', 'RozeckaPL', 'K_Smiszek', 'TudujKrzysztof', 'SchetynadlaPO']

# One profile dictionary per account, in the same order as `users`.
twitter_user_info = [get_user_info(screen_name) for screen_name in users]

# Columns to keep (and their order); the "user" key of each dictionary is
# not listed because the screen names are used as the row index instead.
cols = [
    'id', 'fullname', 'date_joined', 'location', 'blog',
    'num_tweets', 'following', 'followers', 'likes', 'lists',
]
data_frame = pd.DataFrame(twitter_user_info, index=users, columns=cols).rename_axis("Users")
# Most-followed account first.
data_frame = data_frame.sort_values(by="followers", ascending=False)

# The timer is stopped after the sort so it covers scraping and frame building.
elapsed = time.time() - start
print(f"Elapsed time: {elapsed}")
print(data_frame)

INFO: Using proxy 195.211.174.36:56746


start


INFO: Got user information from username ProtasiewiczJ
INFO: Using proxy 145.239.169.40:1080
INFO: Got user information from username RozeckaPL
INFO: Using proxy 209.90.63.108:80
INFO: Got user information from username K_Smiszek
INFO: Using proxy 178.75.21.109:54137
INFO: Got user information from username TudujKrzysztof
INFO: Using proxy 86.123.166.109:8080
INFO: Got user information from username SchetynadlaPO


Elapsed time: 4.629264831542969
                                id              fullname          date_joined  \
Users                                                                           
SchetynadlaPO           4267404640     Grzegorz Schetyna  12:51 - 24 lis 2015   
ProtasiewiczJ            620182875    Jacek Protasiewicz  08:42 - 27 cze 2012   
K_Smiszek               3025432383     Krzysztof Śmiszek   11:55 - 8 lut 2015   
RozeckaPL               2205387775  M.Stachowiak Różecka  10:30 - 20 lis 2013   
TudujKrzysztof  889250035782885377       Krzysztof Tuduj  15:25 - 23 lip 2017   

                      location                          blog  num_tweets  \
Users                                                                      
SchetynadlaPO                                    schetyna.pl        2046   
ProtasiewiczJ                          jacek-protasiewicz.pl       10397   
K_Smiszek              Wrocław            wiosnabiedronia.pl        5639   
RozeckaPL           

In [17]:
# Scrape every account's timeline. The result holds one entry per name in
# `users`, in the same order, each being whatever query_tweets_from_user
# returned for that account.
tweets_per_user = [query_tweets_from_user(screen_name) for screen_name in users]

INFO: Scraping tweets from https://twitter.com/ProtasiewiczJ
INFO: Using proxy 182.252.252.57:21946
INFO: Scraping tweets from https://twitter.com/i/profiles/show/ProtasiewiczJ/timeline/tweets?include_available_features=1&include_entities=1&max_position=1182597735406542849&reset_error_state=false
INFO: Using proxy 49.236.190.66:22691
INFO: Scraping tweets from https://twitter.com/i/profiles/show/ProtasiewiczJ/timeline/tweets?include_available_features=1&include_entities=1&max_position=1182537325827559424&reset_error_state=false
INFO: Using proxy 49.238.205.158:20375
INFO: Scraping tweets from https://twitter.com/i/profiles/show/ProtasiewiczJ/timeline/tweets?include_available_features=1&include_entities=1&max_position=1182367878961680387&reset_error_state=false
INFO: Using proxy 113.30.90.151:21368
INFO: Scraping tweets from https://twitter.com/i/profiles/show/ProtasiewiczJ/timeline/tweets?include_available_features=1&include_entities=1&max_position=1182074187701207040&reset_error_state

INFO: Using proxy 223.26.191.166:23391
INFO: Scraping tweets from https://twitter.com/i/profiles/show/ProtasiewiczJ/timeline/tweets?include_available_features=1&include_entities=1&max_position=1151614128898924545&reset_error_state=false
INFO: Using proxy 191.7.209.222:50626
INFO: Scraping tweets from https://twitter.com/i/profiles/show/ProtasiewiczJ/timeline/tweets?include_available_features=1&include_entities=1&max_position=1151227329634418688&reset_error_state=false
INFO: Using proxy 182.252.246.67:21940
INFO: Scraping tweets from https://twitter.com/i/profiles/show/ProtasiewiczJ/timeline/tweets?include_available_features=1&include_entities=1&max_position=1149800202922024961&reset_error_state=false
INFO: Using proxy 223.26.240.204:22645
INFO: Scraping tweets from https://twitter.com/i/profiles/show/ProtasiewiczJ/timeline/tweets?include_available_features=1&include_entities=1&max_position=1149678836734447616&reset_error_state=false
INFO: Using proxy 223.26.160.176:22545
INFO: Twitter 

INFO: Using proxy 106.10.70.224:21681
INFO: Scraping tweets from https://twitter.com/i/profiles/show/RozeckaPL/timeline/tweets?include_available_features=1&include_entities=1&max_position=403231115197956096&reset_error_state=false
INFO: Using proxy 113.30.122.55:20552
INFO: Twitter returned : 'has_more_items' 
INFO: Got 631 tweets from username RozeckaPL
INFO: Scraping tweets from https://twitter.com/K_Smiszek
INFO: Using proxy 180.210.115.219:20612
INFO: Scraping tweets from https://twitter.com/i/profiles/show/K_Smiszek/timeline/tweets?include_available_features=1&include_entities=1&max_position=1183639993325105153&reset_error_state=false
INFO: Using proxy 106.10.65.19:21676
INFO: Scraping tweets from https://twitter.com/i/profiles/show/K_Smiszek/timeline/tweets?include_available_features=1&include_entities=1&max_position=1182257320904085505&reset_error_state=false
INFO: Using proxy 110.93.163.194:22867
INFO: Scraping tweets from https://twitter.com/i/profiles/show/K_Smiszek/timeline/

INFO: Using proxy 85.130.11.89:51229
INFO: Scraping tweets from https://twitter.com/i/profiles/show/K_Smiszek/timeline/tweets?include_available_features=1&include_entities=1&max_position=1122425266477969408&reset_error_state=false
INFO: Using proxy 49.238.165.68:23557
INFO: Scraping tweets from https://twitter.com/i/profiles/show/K_Smiszek/timeline/tweets?include_available_features=1&include_entities=1&max_position=1121673515889860609&reset_error_state=false
INFO: Using proxy 106.10.91.8:21305
INFO: Scraping tweets from https://twitter.com/i/profiles/show/K_Smiszek/timeline/tweets?include_available_features=1&include_entities=1&max_position=1119528155012898817&reset_error_state=false
INFO: Using proxy 106.10.127.146:21267
INFO: Scraping tweets from https://twitter.com/i/profiles/show/K_Smiszek/timeline/tweets?include_available_features=1&include_entities=1&max_position=1117034940812668929&reset_error_state=false
INFO: Using proxy 49.238.147.71:23528
INFO: Scraping tweets from https://t

INFO: Using proxy 180.210.119.97:20218
INFO: Scraping tweets from https://twitter.com/i/profiles/show/TudujKrzysztof/timeline/tweets?include_available_features=1&include_entities=1&max_position=1105054172876075008&reset_error_state=false
INFO: Using proxy 182.252.247.182:20743
INFO: Scraping tweets from https://twitter.com/i/profiles/show/TudujKrzysztof/timeline/tweets?include_available_features=1&include_entities=1&max_position=1102689680871571461&reset_error_state=false
INFO: Using proxy 223.26.139.24:22513
INFO: Scraping tweets from https://twitter.com/i/profiles/show/TudujKrzysztof/timeline/tweets?include_available_features=1&include_entities=1&max_position=1101462385376854017&reset_error_state=false
INFO: Using proxy 106.10.4.165:22854
INFO: Scraping tweets from https://twitter.com/i/profiles/show/TudujKrzysztof/timeline/tweets?include_available_features=1&include_entities=1&max_position=1098513936150941697&reset_error_state=false
INFO: Using proxy 182.252.245.94:20343
INFO: Scrap

INFO: Using proxy 191.102.85.221:9000
INFO: Scraping tweets from https://twitter.com/i/profiles/show/SchetynadlaPO/timeline/tweets?include_available_features=1&include_entities=1&max_position=1053386013324857344&reset_error_state=false
INFO: Using proxy 103.205.128.65:43037
INFO: Scraping tweets from https://twitter.com/i/profiles/show/SchetynadlaPO/timeline/tweets?include_available_features=1&include_entities=1&max_position=1052147515578044416&reset_error_state=false
INFO: Using proxy 180.210.114.182:20215
INFO: Scraping tweets from https://twitter.com/i/profiles/show/SchetynadlaPO/timeline/tweets?include_available_features=1&include_entities=1&max_position=1049011947314790401&reset_error_state=false
INFO: Using proxy 223.26.208.84:22613
INFO: Scraping tweets from https://twitter.com/i/profiles/show/SchetynadlaPO/timeline/tweets?include_available_features=1&include_entities=1&max_position=1044923189611122689&reset_error_state=false
INFO: Using proxy 223.26.194.90:22595
INFO: Scraping 

In [18]:
# One scrape result was appended per account, so this should equal len(users).
len(tweets_per_user)

5

In [20]:
# Sanity check per account: how many tweets were scraped and the timestamp
# of the last tweet in the returned list.
for user_tweets in tweets_per_user:
    print(len(user_tweets))
    # A scrape can come back empty; indexing [-1] on an empty list would
    # raise IndexError and abort the summary for the remaining accounts.
    if user_tweets:
        print(user_tweets[-1].timestamp)
    else:
        print("no tweets scraped")

762
2019-07-12 13:56:09
631
2013-11-20 18:39:24
818
2019-04-04 11:41:24
788
2018-11-09 10:13:48
839
2017-11-09 15:28:04


In [25]:
def change_to_list_of_dicts(tweets_of_user):
    """
    Convert tweet objects into plain dictionaries.

    :param tweets_of_user: iterable of tweet objects exposing every attribute
        named in ``fields`` below
    :return: list with one dictionary per tweet, in input order; each
        dictionary maps a field name to the tweet attribute of the same name
    """
    # Order matters: it becomes the key order of every resulting dictionary.
    fields = (
        'username',
        'fullname',
        'user_id',
        'tweet_id',
        'tweet_url',
        'timestamp',
        'timestamp_epochs',
        'replies',
        'retweets',
        'likes',
        'is_retweet',
        'retweeter_username',
        'retweeter_userid',
        'retweet_id',
        'text',
        'html',
    )
    return [
        {field: getattr(tweet, field) for field in fields}
        for tweet in tweets_of_user
    ]

In [26]:
# Flatten each account's tweet objects into dictionaries, keeping the
# per-account grouping and order of `tweets_per_user`.
tweets_per_user_dicts = [
    change_to_list_of_dicts(user_tweets) for user_tweets in tweets_per_user
]

In [35]:
# Column order shared by every per-user tweet frame. It is defined once,
# outside the loop, and under its own name so the `cols` list used for the
# user-info frame is not overwritten.
tweet_cols = [
    'username', 'fullname', 'user_id', 'tweet_id', 'tweet_url', 'timestamp',
    'timestamp_epochs', 'replies', 'retweets', 'likes', 'is_retweet',
    'retweeter_username', 'retweeter_userid', 'retweet_id', 'text', 'html',
]

# One DataFrame per account, in the order of `tweets_per_user_dicts`, each
# sorted by timestamp with the most recent tweet first. The original row
# labels are kept (the index is not reset). Building the frames in a
# comprehension also avoids rebinding `data_frame`, which holds the
# user-info table created earlier in the notebook.
tweets_per_user_df = [
    pd.DataFrame(user_tweets, columns=tweet_cols).sort_values(by="timestamp", ascending=False)
    for user_tweets in tweets_per_user_dicts
]

In [38]:
# Preview the first rows of the second frame (position 1, i.e. the tweets
# scraped for users[1]).
tweets_per_user_df[1].head()

Unnamed: 0,username,fullname,user_id,tweet_id,tweet_url,timestamp,timestamp_epochs,replies,retweets,likes,is_retweet,retweeter_username,retweeter_userid,retweet_id,text,html
0,RozeckaPL,M.Stachowiak Różecka,2205387775,1184095480583806976,/RozeckaPL/status/1184095480583806976,2019-10-15 13:15:36,1571145336,47,80,644,0,,,,Dziękuję! \n#DobryCzasPL #DobryCzasDlaPolski #...,"<p class=""TweetTextSize TweetTextSize--normal ..."
1,RozeckaPL,M.Stachowiak Różecka,2205387775,1182611413531004928,/RozeckaPL/status/1182611413531004928,2019-10-11 10:58:27,1570791507,38,161,410,0,,,,13 października proszę o Państwa głosy \nLista...,"<p class=""TweetTextSize TweetTextSize--normal ..."
2,jbrudzinski,Joachim Brudziński,114159548,1181618130709286912,/jbrudzinski/status/1181618130709286912,2019-10-08 17:11:30,1570554690,49,523,773,1,RozeckaPL,2205387775.0,1.1818183486880276e+18,Podajcie proszę dalej. Walczymy i mobilizujemy...,"<p class=""TweetTextSize TweetTextSize--normal ..."
3,tvp_info,portal tvp.info 🇵🇱,47295451,1180552173945528320,/tvp_info/status/1180552173945528320,2019-10-05 18:35:46,1570300546,25,85,241,1,RozeckaPL,2205387775.0,1.1808013508017316e+18,.@RozeckaPL w #GośćWiadomości o #TaśmyNeumanna...,"<p class=""TweetTextSize TweetTextSize--normal ..."
4,RozeckaPL,M.Stachowiak Różecka,2205387775,1180488867847249921,/RozeckaPL/status/1180488867847249921,2019-10-05 14:24:13,1570285453,9,9,50,0,,,,Dziś będę gościem programu „Gość wiadomości” @...,"<p class=""TweetTextSize TweetTextSize--normal ..."


In [39]:
import pickle

# Persist the list of per-user DataFrames to disk. The file is written in
# binary mode, as pickle requires. NOTE: only unpickle this file from a
# trusted location, since loading a pickle can execute arbitrary code.
with open('tweets.pkl', 'wb') as pickle_file:
    pickle.dump(tweets_per_user_df, pickle_file)