In [123]:
import json
import codecs
import pandas as pd
import numpy as np

In [124]:
with codecs.open("data/RepAbraham.json", 'r', encoding='utf8') as rf: 
    read_tweets = json.load(rf)
    read_tweets = [json.loads(t) for t in read_tweets]

In [125]:
len(read_tweets)

785

In [126]:
allKeys = set(read_tweets[3].keys())
for i,tweet in enumerate(read_tweets):
    for key in tweet.keys():
        if key not in allKeys:
            print(i, key)
            allKeys.add(key)

0 retweeted_status
4 media
52 in_reply_to_screen_name
52 in_reply_to_status_id
52 in_reply_to_user_id


In [127]:
allKeys

{'created_at',
 'favorite_count',
 'full_text',
 'hashtags',
 'id',
 'id_str',
 'in_reply_to_screen_name',
 'in_reply_to_status_id',
 'in_reply_to_user_id',
 'lang',
 'media',
 'quoted_status',
 'quoted_status_id',
 'quoted_status_id_str',
 'retweet_count',
 'retweeted_status',
 'source',
 'urls',
 'user',
 'user_mentions'}

In [128]:
read_tweets[0]["id_str"]

'1082284323548655616'

In [129]:
read_tweets[0]["id"]

1082284323548655616

In [130]:
keysICareAbout = ("created_at", "full_text", "retweet_count", "retweeted_status", "id_str")
keysICareAbout

('created_at', 'full_text', 'retweet_count', 'retweeted_status', 'id_str')

In [131]:
def getKey(tweet,key):
    try:
        return tweet[key]
    except KeyError:
        return np.nan

In [132]:
twoDTweets = [[getName(tweet,name) for name in keysICareAbout] for tweet in read_tweets]

In [133]:
df = pd.DataFrame(twoDTweets)

In [134]:
df.columns = list(keysICareAbout)

In [135]:
df["twitter_account"] = "RepAbraham"

In [136]:
df

Unnamed: 0,created_at,full_text,retweet_count,retweeted_status,id_str,twitter_account
0,Mon Jan 07 14:34:25 +0000 2019,RT @SteveScalise: Hi @AOC. Happy to continue t...,9447.0,{'created_at': 'Sun Jan 06 05:53:58 +0000 2019...,1082284323548655616,RepAbraham
1,Sat Jan 05 00:57:34 +0000 2019,My heart is so heavy tonight. Dianne and I are...,7.0,,1081353982344335361,RepAbraham
2,Fri Jan 04 20:47:34 +0000 2019,Just sent out my first newsletter of the 116th...,2.0,,1081291065204838408,RepAbraham
3,Thu Jan 03 19:37:26 +0000 2019,First day on the job and Democrats propose a f...,36.0,,1080911030283190273,RepAbraham
4,Wed Dec 26 18:35:00 +0000 2018,Today I announced #LA05's Congressional App Ch...,1.0,,1077996214149238784,RepAbraham
5,Tue Dec 25 15:20:00 +0000 2018,Merry Christmas from my family to yours! I hop...,3.0,,1077584753023479808,RepAbraham
6,Fri Dec 21 01:09:28 +0000 2018,I voted to fund the wall bc it is vital to nat...,16.0,,1075921159311552512,RepAbraham
7,Thu Dec 20 20:52:34 +0000 2018,RT @GregHilburn1: BREAKING: @realDonaldTrump s...,5.0,{'created_at': 'Thu Dec 20 20:51:29 +0000 2018...,1075856505604325377,RepAbraham
8,Thu Dec 20 20:15:39 +0000 2018,The Steve Gleason Congressional Gold Medal Act...,28.0,,1075847217028702209,RepAbraham
9,Wed Dec 19 17:13:02 +0000 2018,“Democrats feign concern for taxpayers who wou...,1.0,,1075438870471176192,RepAbraham


## Get tweets from all json files

In [137]:
dataDir = "/Volumes/GoogleDrive/My Drive/Yale/4/spring/Research/PolySpeech/WIL Twitter Data/"

In [138]:
twitter_accounts = [re.sub(".*/(.*)\\.json", "\\1", filepath) for filepath in glb]

In [139]:
def getcreated_at(tweet):
    return getName(tweet, "created_at")
def getfull_text(tweet):
    return getName(tweet, "full_text")
def getid_str(tweet):
    return getName(tweet, "id_str")
def getretweet_count(tweet):
    return getName(tweet, "retweet_count")
def getretweeted_status(tweet):
    if getName(tweet, "retweeted_status") is np.nan:
        return False
    return True
keyFuncs = (getcreated_at, getfull_text, getid_str, getretweet_count, getretweeted_status)

In [140]:
from progress import ProgressTracker

In [141]:
class TweetExtractor(object):
    def __init__(self, keys, keyFuncs, dataDir):
        self.keys = keys
        self.keyFuncs = keyFuncs
        self.tweetsDF = pd.DataFrame(columns=list(keysICareAbout))
        self.dataDir = dataDir
        
    def primeUSPoliticiansAccounts(self, maxNum=None):
        self.dfPol = pd.read_csv("data/Copy of US politicians_md.csv")
        np.random.seed(10)
        men = self.dfPol.loc[self.dfPol.loc[:,"gender"] == 'M',:].sample(int(maxNum/2))
        women = self.dfPol.loc[self.dfPol.loc[:,"gender"] == 'F',:].sample(int(maxNum/2))
        self.metaDatMin = pd.concat([women,men], sort=False)
        self.accts = self.metaDatMin.loc[:,"twitter_account"].dropna()
        
    def getTweetsFromTwitterAccounts(self, maxTweetNum=None):
        progress = ProgressTracker(len(self.accts))
        df_tot = pd.DataFrame(columns=self.keys)
        for i, acct in enumerate(self.accts):
            filepath = self.dataDir + acct + ".json"
            try:
                with codecs.open(filepath, 'r', encoding='utf8') as rf: 
                    read_tweets = json.load(rf)[:maxTweetNum]
                    read_tweets = [json.loads(t) for t in read_tweets]
            except FileNotFoundError:
                # Explains why groupedDF in IBM-Feed is not size 100; not everyone has collected tweets.
                # print(acct, "not found")
                continue
            twoDTweets = [[func(tweet) for func in self.keyFuncs] for tweet in read_tweets]
            df = pd.DataFrame(twoDTweets)
            df.columns = self.keys
            df["twitter_account"] = acct
            self.tweetsDF = pd.concat([self.tweetsDF, df], sort=False)
            progress.update(i)
        print("Finished")
        
    def writeOut(self, outfile="data/tweets.csv"):
        self.metaDatMin.to_csv("data/metaDatMin.csv", index=False)
        self.tweetsDF.to_csv(outfile, index=False)

In [142]:
ext = TweetExtractor(keysICareAbout, keyFuncs, dataDir)
ext.primeUSPoliticiansAccounts(100)
ext.getTweetsFromTwitterAccounts(100)
ext.writeOut()

5 percent done
10 percent done
15 percent done
20 percent done
25 percent done
30 percent done
35 percent done
40 percent done
45 percent done
50 percent done
55 percent done
60 percent done
65 percent done
70 percent done
75 percent done
80 percent done
85 percent done
90 percent done
95 percent done
Finished


In [143]:
tweetsDF = pd.read_csv("data/tweets.csv")

In [144]:
tweetsDF

Unnamed: 0,created_at,full_text,retweet_count,retweeted_status,id_str,twitter_account
0,Mon Jan 07 23:20:11 +0000 2019,Thank you! https://t.co/QHR3So39yV,1082416637033267203,5.0,False,RepSylviaGarcia
1,Mon Jan 07 20:30:29 +0000 2019,"#TrumpShutdown not just about federal workers,...",1082373932701941760,21.0,False,RepSylviaGarcia
2,Mon Jan 07 14:21:29 +0000 2019,And bring the agents responsible for the two c...,1082281069204963329,12.0,False,RepSylviaGarcia
3,Sun Jan 06 22:28:05 +0000 2019,"For Texans, the president’s demand for a wall ...",1082041138163392513,11.0,False,RepSylviaGarcia
4,Sun Jan 06 16:39:38 +0000 2019,RT @LeaderHoyer: Joined @MeetThePress to discu...,1081953447929364481,70.0,True,RepSylviaGarcia
5,Sun Jan 06 14:49:09 +0000 2019,You won’t get a tax refund until the #TrumpShu...,1081925645616246784,12.0,False,RepSylviaGarcia
6,Sun Jan 06 14:29:51 +0000 2019,RT @HouseDemocrats: House Democrats were busy ...,1081920789237194752,2151.0,True,RepSylviaGarcia
7,Sat Jan 05 22:04:51 +0000 2019,"'Si, se puede': “Rep. Sylvia Garcia, D-Texas, ...",1081672904763473920,6.0,False,RepSylviaGarcia
8,Sat Jan 05 21:05:37 +0000 2019,We must ensure the man who shot and killed Jaz...,1081657998005727235,36.0,False,RepSylviaGarcia
9,Sat Jan 05 19:45:59 +0000 2019,"#nobabyjails “Southwest Key Programs, faces mo...",1081637958422351873,11.0,False,RepSylviaGarcia


In [17]:
import glob

In [18]:
glb = glob.glob(dataDir+"*")

In [19]:
import re

In [20]:
name1 = glob.glob(dataDir)[0]

In [85]:
ext = TweetExtractor(keysICareAbout, keyFuncs, dataDir)
ext.primeUSPoliticiansAccounts()

In [86]:
ext.dfPol.loc[]

Unnamed: 0.1,Unnamed: 0,api_uri,at_large,contact_form,crp_id,cspan_id,date_of_birth,district,dw_nominate,facebook_account,...,state_rank,suffix,title,total_present,total_votes,twitter_account,url,votes_with_party_pct,votesmart_id,youtube_account
0,0,https://api.propublica.org/congress/v1/members...,False,,N00036633,76236.0,1954-09-16 00:00:00,5,0.497,CongressmanRalphAbraham,...,,,Representative,0.0,976.0,RepAbraham,https://abraham.house.gov,97.11,155414.0,
1,1,https://api.propublica.org/congress/v1/members...,False,,N00035451,76386.0,1946-05-27 00:00:00,12,-0.465,CongresswomanAdams,...,,,Representative,0.0,976.0,RepAdams,https://adams.house.gov,95.68,5935.0,
2,2,https://api.propublica.org/congress/v1/members...,False,,N00003028,45516.0,1965-07-22 00:00:00,4,0.360,RobertAderholt,...,,,Representative,0.0,976.0,Robert_Aderholt,https://aderholt.house.gov,96.62,441.0,RobertAderholt
3,3,https://api.propublica.org/congress/v1/members...,False,,N00033997,79994.0,1979-06-19 00:00:00,31,-0.284,reppeteaguilar,...,,,Representative,0.0,976.0,reppeteaguilar,https://aguilar.house.gov,92.13,70114.0,
4,4,https://api.propublica.org/congress/v1/members...,False,,N00033720,62545.0,1951-11-07 00:00:00,12,0.626,CongressmanRickAllen,...,,,Representative,0.0,976.0,reprickallen,https://allen.house.gov,97.83,136062.0,
5,5,https://api.propublica.org/congress/v1/members...,False,,N00031938,1033767.0,1980-04-18 00:00:00,3,0.658,repjustinamash,...,,,Representative,1.0,976.0,,https://amash.house.gov,69.95,105566.0,repjustinamash
6,6,https://api.propublica.org/congress/v1/members...,False,,N00031177,62817.0,1958-06-12 00:00:00,2,0.378,MarkAmodeiNV2,...,,,Representative,2.0,976.0,MarkAmodeiNV2,https://amodei.house.gov,95.36,12537.0,markamodeinv2
7,7,https://api.propublica.org/congress/v1/members...,False,,N00038285,1016482.0,1972-03-09 00:00:00,19,0.555,JodeyArrington,...,,,Representative,0.0,976.0,RepArrington,https://arrington.house.gov,98.45,155685.0,
8,8,https://api.propublica.org/congress/v1/members...,False,,N00005736,44883.0,1948-03-23 00:00:00,36,0.717,RepBrianBabin,...,,,Representative,0.0,976.0,RepBrianBabin,https://babin.house.gov,95.20,360.0,
9,9,https://api.propublica.org/congress/v1/members...,False,,N00037049,103442.0,1963-08-16 00:00:00,2,0.442,,...,,,Representative,0.0,976.0,RepDonBacon,https://bacon.house.gov,96.00,166299.0,


In [115]:
pd.DataFrame.to_csv?

In [73]:
ext.tweetsDF

Unnamed: 0,created_at,full_text,retweet_count,retweeted_status,id_str,twitter_account
0,Mon Jan 07 14:34:25 +0000 2019,RT @SteveScalise: Hi @AOC. Happy to continue t...,1082284323548655616,9447.0,True,RepAbraham
1,Sat Jan 05 00:57:34 +0000 2019,My heart is so heavy tonight. Dianne and I are...,1081353982344335361,7.0,False,RepAbraham
2,Fri Jan 04 20:47:34 +0000 2019,Just sent out my first newsletter of the 116th...,1081291065204838408,2.0,False,RepAbraham
3,Thu Jan 03 19:37:26 +0000 2019,First day on the job and Democrats propose a f...,1080911030283190273,36.0,False,RepAbraham
4,Wed Dec 26 18:35:00 +0000 2018,Today I announced #LA05's Congressional App Ch...,1077996214149238784,1.0,False,RepAbraham
5,Tue Dec 25 15:20:00 +0000 2018,Merry Christmas from my family to yours! I hop...,1077584753023479808,3.0,False,RepAbraham
6,Fri Dec 21 01:09:28 +0000 2018,I voted to fund the wall bc it is vital to nat...,1075921159311552512,16.0,False,RepAbraham
7,Thu Dec 20 20:52:34 +0000 2018,RT @GregHilburn1: BREAKING: @realDonaldTrump s...,1075856505604325377,5.0,True,RepAbraham
8,Thu Dec 20 20:15:39 +0000 2018,The Steve Gleason Congressional Gold Medal Act...,1075847217028702209,28.0,False,RepAbraham
9,Wed Dec 19 17:13:02 +0000 2018,“Democrats feign concern for taxpayers who wou...,1075438870471176192,1.0,False,RepAbraham


In [121]:
tweetsDF

Unnamed: 0,created_at,full_text,retweet_count,retweeted_status,id_str,twitter_account
0,Mon Jan 07 23:20:11 +0000 2019,Thank you! https://t.co/QHR3So39yV,1082416637033267203,5.0,False,RepSylviaGarcia
1,Mon Jan 07 20:30:29 +0000 2019,"#TrumpShutdown not just about federal workers,...",1082373932701941760,21.0,False,RepSylviaGarcia
2,Mon Jan 07 14:21:29 +0000 2019,And bring the agents responsible for the two c...,1082281069204963329,12.0,False,RepSylviaGarcia
3,Sun Jan 06 22:28:05 +0000 2019,"For Texans, the president’s demand for a wall ...",1082041138163392513,11.0,False,RepSylviaGarcia
4,Sun Jan 06 16:39:38 +0000 2019,RT @LeaderHoyer: Joined @MeetThePress to discu...,1081953447929364481,70.0,True,RepSylviaGarcia
5,Sun Jan 06 14:49:09 +0000 2019,You won’t get a tax refund until the #TrumpShu...,1081925645616246784,12.0,False,RepSylviaGarcia
6,Sun Jan 06 14:29:51 +0000 2019,RT @HouseDemocrats: House Democrats were busy ...,1081920789237194752,2151.0,True,RepSylviaGarcia
7,Sat Jan 05 22:04:51 +0000 2019,"'Si, se puede': “Rep. Sylvia Garcia, D-Texas, ...",1081672904763473920,6.0,False,RepSylviaGarcia
8,Sat Jan 05 21:05:37 +0000 2019,We must ensure the man who shot and killed Jaz...,1081657998005727235,36.0,False,RepSylviaGarcia
9,Sat Jan 05 19:45:59 +0000 2019,"#nobabyjails “Southwest Key Programs, faces mo...",1081637958422351873,11.0,False,RepSylviaGarcia


0.0 % completed -- 
4.84027105517909 % completed -- 
9.68054211035818 % completed -- 
14.52081316553727 % completed -- 
19.36108422071636 % completed -- 
24.20135527589545 % completed -- 
29.04162633107454 % completed -- 
33.88189738625363 % completed -- 
38.72216844143272 % completed -- 
43.56243949661181 % completed -- 
48.4027105517909 % completed -- 
53.24298160696999 % completed -- 
58.08325266214908 % completed -- 
62.92352371732817 % completed -- 
67.76379477250725 % completed -- 
72.60406582768636 % completed -- 
77.44433688286544 % completed -- 
82.28460793804453 % completed -- 
87.12487899322362 % completed -- 
91.9651500484027 % completed -- 
96.8054211035818 % completed -- 


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [142]:
len(df_tot)

2933524

In [143]:
df_tot.to_csv("data/tweetsDF.csv", index=False)
df1 = pd.read_csv("data/tweetsDF.csv")

ParserError: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.


In [148]:
minDF = df_tot.loc[:,("full_text","twitter_account")]

In [150]:
minDF.to_csv("data/tweetsDFMin.csv")

In [151]:
pd.read_csv("data/tweetsDFMin.csv")

ParserError: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.


In [152]:
import csv
with open(r"data/tweetsDFMin.csv", 'rb') as f:
    reader = csv.reader(f)
    linenumber = 1
    try:
        for row in reader:
            linenumber += 1
    except Exception as e:
        print (("Error line %d: %s %s" % (linenumber, str(type(e)), e.message)))

AttributeError: 'Error' object has no attribute 'message'

In [144]:
len(df1)

8407

In [132]:
import time
import sys

toolbar_width = 40

# setup toolbar
sys.stdout.write("[%s]" % (" " * toolbar_width))
sys.stdout.flush()
sys.stdout.write("\b" * (toolbar_width+1)) # return to start of line, after '['

for i in range(toolbar_width):
    time.sleep(0.1) # do real work here
    # update the bar
    sys.stdout.write("-")
    sys.stdout.flush()

sys.stdout.write("\n")

[                                        ----------------------------------------


In [34]:
pd.DataFrame.from_dict({"a":[1],"b":[2]})

Unnamed: 0,a,b
0,1,2
