# Imports

In [11]:
import time
import string
import re
import nltk
import gensim
import gensim.corpora as corpora
import ujson as json
import pandas as pd
import collections
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import warnings
import ast
%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)

## Preparation

In [12]:
trump_df = pd.read_csv('tweets_01-08-2021.csv')
# Parse the raw date strings into pandas timestamps so they can be compared and grouped.
trump_df['date'] = pd.to_datetime(trump_df['date'])
# Keep every tweet from 2020-01-01 onwards. There is no upper bound, so tweets from
# January 2021 are retained as well. The bound is inclusive so that a tweet stamped
# exactly at midnight on New Year's Day is not dropped.
trump_df = trump_df[trump_df['date'] >= pd.Timestamp('2020-01-01')]

In [13]:
# Sanity check: frame dimensions, then the earliest and latest timestamps that remain.
print(trump_df.shape, trump_df['date'].min(), trump_df['date'].max(), sep='\n')

(12392, 9)
2020-01-01 00:09:42
2021-01-08 15:44:28


In [14]:
# Drop every tweet whose text contains a link (the whole row is removed, not just the URL).
# The alternation is wrapped in a NON-capturing group: with a capturing group,
# `str.contains` warns that the pattern "has match groups".
URL_PATTERN = (
    r'(?:https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
    r'|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}'
    r'|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}'
    r'|www\.[a-zA-Z0-9]+\.[^\s]{2,})'
)
# na=False treats a missing text as "no match" so the boolean mask never contains NaN.
filt = trump_df['text'].str.contains(URL_PATTERN, regex=True, na=False)
trump_df = trump_df[~filt]
# Drop retweets. The match is anchored to a leading "RT" token; an unanchored 'RT'
# would also discard ordinary tweets that merely contain those letters
# (e.g. "SUPPORT", "PARTY", "COURT" in all-caps tweets).
# NOTE(review): the frame also has an `isRetweet` column; filtering on it may be
# more reliable once its true/false encoding is confirmed.
filt = trump_df['text'].str.contains(r'^RT\b', regex=True, na=False)
trump_df = trump_df[~filt]

  return func(self, *args, **kwargs)


In [15]:
# Preview the first two rows left after the link and retweet filters.
trump_df.iloc[:2]

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
3,1304875170860015617,The Unsolicited Mail In Ballot Scam is a major...,f,f,Twitter for iPhone,80527,23502,2020-09-12 20:10:58,f
19,1325884977112883200,The threshold identification of Ballots is tur...,f,f,Twitter for iPhone,493076,100609,2020-11-09 19:36:26,f


In [16]:
# Keep only tweets shorter than 100 characters, so that the reader has enough time
# to read each one in the visualization.
is_short = trump_df['text'].map(lambda tweet: len(str(tweet)) < 100)
trump_df = trump_df.loc[is_short]

In [17]:
# List the remaining tweets in two-week buckets (id, timestamp, text) so that
# candidates for the visualization can be picked by eye; a dashed line followed by
# blank lines separates consecutive buckets.
biweekly = pd.Grouper(key='date', freq='2W')
for _, bucket in trump_df.groupby(biweekly):
    for tweet_id, posted_at, body in zip(bucket['id'], bucket['date'], bucket['text']):
        print(tweet_id, posted_at, body)
    print('---------', '\n'*4)

1212184310389850119 2020-01-01 01:30:35 HAPPY NEW YEAR!
1213078681750573056 2020-01-03 12:44:30 Iran never won a war, but never lost a negotiation!
1213474475250987009 2020-01-04 14:57:14 95% Approval Rating in the Republican Party. Thank you!
--------- 




1214197038063243266 2020-01-06 14:48:27 IRAN WILL NEVER HAVE A NUCLEAR WEAPON!
1215249474043482113 2020-01-09 12:30:27 PRESIDENTIAL HARASSMENT!
1215405367229739008 2020-01-09 22:49:55 95% Approval Rating in the Republican Party. Thank you!
1215778392697720832 2020-01-10 23:32:11 Will be interviewed tonight by Laura @IngrahamAngle at 10pmE on @FoxNews. Enjoy!
1216001273725685761 2020-01-11 14:17:50 Nancy Pelosi will go down as the absolute worst Speaker of the House in U.S. history!
1217909231946477575 2020-01-16 20:39:23 I JUST GOT IMPEACHED FOR MAKING A PERFECT PHONE CALL!
1218523265821544450 2020-01-18 13:19:20 Tremendous surge in new housing construction in December, 16.9%, biggest in many years!
1218527317116575744 2020-01-18 1

1313075159721537537 2020-10-05 11:14:48 PEACE THROUGH STRENGTH (BRING OUR SOLDIERS HOME). VOTE!
1313220863735533570 2020-10-05 20:53:47 STOCK MARKET UP BIG, 466 Points!  28,149. Great News for America. Jobs, Jobs, Jobs!
1313243541959737349 2020-10-05 22:23:53 Will be back on the Campaign Trail soon!!! The Fake News only shows the Fake Polls.
1313489465139134464 2020-10-06 14:41:06 FEELING GREAT!
1313511340124917760 2020-10-06 16:08:02 REPEAL SECTION 230!!!
1313630105831383041 2020-10-06 23:59:57 Republicans need to get smart and confirm Nate Simington to the FCC ASAP! @SenatorWicker Thank you!
1314021500941369354 2020-10-08 01:55:13 Mike Pence is doing GREAT! She is a gaffe machine.
1314037081392525312 2020-10-08 02:57:08 Mike Pence WON BIG!
1314362808281321472 2020-10-09 00:31:27 I will be interviewed by @seanhannity tonight at 9:00 P.M. @FoxNews  Enjoy!
1314593632733847552 2020-10-09 15:48:40 Covid Relief Negotiations are moving along. Go Big!
1314594979440979968 2020-10-09 15:54:01 

# Tweet selection

In [18]:
# Ids of the hand-picked tweets to export, listed in ascending id order.
selected_tweets = [
    1213078681750573056,
    1215405367229739008,
    1220742093250932736,
    1227258461106425857,
    1230280522363895809,
    1237004509156642816,
    1242092738973249536,
    1249132374547464193,
    1252331100367663104,
    1259155620713975810,
    1264329603939696640,
    1265608389905784834,
    1273405198698975232,
    1276629230617604096,
    1280116392990253056,
    1288933202812456968,
    1293513735533924352,
    1300171130326716418,
    1302969740374298624,
    1310217730218430465,
    1310740567803006977,
    1320045327102152706,
    1322065847893020672,
    1327978710809661440,
    1334858852337070083,
    1338145849784872965,
    1343328708963299338,
    1346685023861272580,
]

In [21]:
# Pull out the hand-picked tweets and order them chronologically for the export.
is_selected = trump_df['id'].isin(selected_tweets)
df1 = trump_df.loc[is_selected].sort_values('date')
df1

Unnamed: 0,id,text,isRetweet,isDeleted,device,favorites,retweets,date,isFlagged
56555,1213078681750573056,"Iran never won a war, but never lost a negotia...",f,f,Twitter for iPhone,303007,57253,2020-01-03 12:44:30,f
3706,1215405367229739008,95% Approval Rating in the Republican Party. T...,f,f,Twitter for iPhone,204774,23628,2020-01-09 22:49:55,f
7711,1220742093250932736,READ THE TRANSCRIPTS!,f,f,Twitter for iPhone,89041,16072,2020-01-24 16:16:09,f
5819,1227258461106425857,"New Stock Market RECORD. Congratulations, spen...",f,f,Twitter for iPhone,112444,20236,2020-02-11 15:49:53,f
8408,1230280522363895809,"Heading to Arizona, big rally and big crowd! #...",f,f,Twitter for iPhone,56119,9783,2020-02-19 23:58:28,f
3816,1237004509156642816,So much FAKE NEWS!,f,f,Twitter for iPhone,138432,26914,2020-03-09 13:17:12,f
9847,1242092738973249536,THIS IS WHY WE NEED BORDERS!,f,f,Twitter for iPhone,367221,59389,2020-03-23 14:16:00,f
7967,1249132374547464193,Mail in ballots substantially increases the ri...,f,f,Twitter for iPhone,193334,37966,2020-04-12 00:29:00,f
10761,1252331100367663104,I will be having a White House Press Conferenc...,f,f,Twitter for iPhone,124285,17025,2020-04-20 20:19:36,f
5880,1259155620713975810,CA25 is a Rigged Election. Trying to steal it ...,f,f,Twitter for iPhone,69281,20662,2020-05-09 16:17:48,f


# Export the tweets as records (row by row)

In [20]:
# Write the selected tweets to 'tweet.json' as a JSON array with one object per row.
# pandas serialises straight to the file, so there is no need to parse the JSON
# string back into Python objects and dump it again, nor to re-import `json` here
# (which would rebind the `json` alias set up in the imports cell).
# Timestamps are written in pandas' default form (epoch milliseconds).
# NOTE(review): the tweet ids are larger than 2**53, so a JavaScript consumer would
# lose precision when parsing them as numbers; confirm whether they should be
# exported as strings instead.
df1.to_json('tweet.json', orient='records')
