In [1]:
import pandas as pd
from random import choice
from twython import Twython
import seaborn as sns
import matplotlib.pyplot as plt

**Table for intro:**

In [2]:
# Each news outlet paired with its Twitter handle: (display name, handle).
outlets = [
    ('The New York Times', 'nytimes'),
    ('CNN', 'CNN'),
    ('BBC (World)', 'bbcworld'),
    ('The Economist', 'theeconomist'),
    ('Reuters', 'reuters'),
    ('The Wall Street Journal', 'WSJ'),
    ('TIME', 'TIME'),
    ('ABC News', 'ABC'),
    ('The Washington Post', 'washingtonpost'),
    ('The Associated Press', 'AP'),
]

# Parallel lists, in the same order as `outlets`.
sources_full = [full_name for full_name, _ in outlets]
sources = [handle for _, handle in outlets]

# Empty frame indexed by the outlets' display names; columns are added to it later.
df = pd.DataFrame(index = sources_full)

In [63]:
#producing the table for the intro section
# Column name -> values, listed in the same row order as the index of `df`.
intro_columns = {
    'Twitter_Handle' : sources,
    'Country' : ['US', 'US', 'UK', 'UK', 'UK', 'US', 'US', 'US', 'US', 'US'],
    'Num_Followers (M)' : [46.8, 48.5, 28.2, 24.8, 22, 17.8, 17.2, 15.7, 15.9, 14.2],
}
for column_name, column_values in intro_columns.items():
    df[column_name] = column_values

# Combined follower count across all listed accounts (values are in millions).
total_followers = df['Num_Followers (M)'].sum()
print('Total number of followers:', total_followers, 'million')
df

Total number of followers: 251.1 million


Unnamed: 0,Twitter_Handle,Country,Num_Followers (M)
The New York Times,nytimes,US,46.8
CNN,CNN,US,48.5
BBC (World),bbcworld,UK,28.2
The Economist,theeconomist,UK,24.8
Reuters,reuters,UK,22.0
The Wall Street Journal,WSJ,US,17.8
TIME,TIME,US,17.2
ABC News,ABC,US,15.7
The Washington Post,washingtonpost,US,15.9
The Associated Press,AP,US,14.2


**Getting example negative, neutral, and positive tweets for methods section:**

In [8]:
#getting subsets
# Load the sentiment-annotated tweets of every outlet into a single frame indexed by
# tweet ID. pd.concat replaces the former DataFrame.append loop: append was deprecated
# in pandas 1.4 and removed in pandas 2.0, and it re-copied the accumulated frame on
# every iteration.
analysis_cols = ['ID', 'User', 'Tweet_Text', 'Sentiment']
dfA = pd.concat([pd.read_csv('DATA_W_ANALYSIS/' + x + '_FINAL.csv', index_col = 'ID',
                             usecols = analysis_cols)
                 for x in sources])

# True for rows whose text does not start with 'RT' (presumably the retweet prefix).
# Missing text is treated as '' so the mask is always boolean. Computed once and
# shared by all five subsets below.
not_rt = ~(dfA['Tweet_Text'].fillna('').str.startswith('RT'))

# One subset per sentiment value, restricted to the non-'RT' rows.
pos_sub = dfA[(dfA['Sentiment'] == 1) & not_rt]
part_pos_sub = dfA[(dfA['Sentiment'] == 0.5) & not_rt]
neg_sub = dfA[(dfA['Sentiment'] == -1) & not_rt]
part_neg_sub = dfA[(dfA['Sentiment'] == -0.5) & not_rt]
neu_sub = dfA[(dfA['Sentiment'] == 0) & not_rt]

In [49]:
#picking out example tweets
# Draw one random tweet ID from each sentiment subset. The draw order
# (positive, partly positive, negative, partly negative, neutral) is kept
# as-is because it determines which random numbers each subset receives.
c1 = choice(pos_sub.index)
c2 = choice(part_pos_sub.index)
c3 = choice(neg_sub.index)
c4 = choice(part_neg_sub.index)
c5 = choice(neu_sub.index)

# Subsets and their drawn IDs, ordered from most positive to most negative.
dfs = [pos_sub, part_pos_sub, neu_sub, part_neg_sub, neg_sub]
choices = [c1, c2, c5, c4, c3]

# Look up each drawn row once, then copy the wanted fields into a fresh table.
picked_rows = [subset.loc[tweet_id] for subset, tweet_id in zip(dfs, choices)]

df = pd.DataFrame()
for source_col, target_col in [('Sentiment', 'Polarity'),
                               ('Tweet_Text', 'Tweet_Text'),
                               ('User', 'User')]:
    df[target_col] = [row.loc[source_col] for row in picked_rows]

In [50]:
# Show the full tweet text instead of truncating long cells. None is the supported
# way to disable truncation; the former spelling -1 has been deprecated since
# pandas 1.0 and is rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

# Strip the leading '@' from each handle and map the three handles whose casing
# differs to their all-lowercase spelling, then use the handle as the row index.
df['User'] = df['User'].str.lstrip('@').replace({'BBCWorld' : 'bbcworld', 'Reuters' : 'reuters', 'TheEconomist' : 'theeconomist'})
df.set_index('User', inplace = True)

In [51]:
# Render the example-tweet table (the cell's last expression is its output).
df

Unnamed: 0_level_0,Polarity,Tweet_Text
User,Unnamed: 1_level_1,Unnamed: 2_level_1
CNN,1.0,"Legendary jazz trumpeter Wallace Roney has died of complications from the novel coronavirus, his publicist said. He was 59."
reuters,0.5,ICYMI: Scientists are deriving grease from insects to replace butter which they say is more sustainable than dairy production
washingtonpost,0.0,"With NFL front offices working from home, the draft process looks a lot different this year"
WSJ,-0.5,The Dow is on pace to register one of its worst weeks ever as governments and central banks announced measures to cushion impact of the coronavirus
ABC,-1.0,Coronavirus and 2020 campaigns: Race looks grim as states continue to delay primaries.


In [39]:
#checking that the SA example tweets are still up on Twitter
# tInfo.txt holds one credential per line, in the order listed below.
with open('tInfo.txt', 'r') as cred_file:
    cred_lines = cred_file.read().split('\n')

creds = {
    'Consumer_Key' : cred_lines[0],
    'Consumer_Secret' : cred_lines[1],
    'Access_Key' : cred_lines[2],
    'Access_Secret' : cred_lines[3],
}

# Only the consumer key/secret pair is passed to the client.
py_tweets = Twython(creds['Consumer_Key'], creds['Consumer_Secret'])

# Fetch every selected tweet by ID and keep its untruncated text, keyed by tweet ID.
full = {
    tweet_id : py_tweets.show_status(id = tweet_id, tweet_mode = 'extended')['full_text']
    for tweet_id in choices
}

full

{1238568393755832320: 'US stocks finished a turbulent week with gains on Friday, logging their best day since October 2008 https://t.co/Tbs034erlB',
 1237533613312184320: 'MORE: Among those looking instead for one who “can bring needed change” (35% of voters), Sanders prevailed, 53-44%, per preliminary exit poll results. https://t.co/A7HACTJT9A',
 1243293934375317504: 'Big banks reassure staff about potential job cuts https://t.co/dZ0QCLXnYc https://t.co/ojOQtPQbSu',
 1242126731034669057: 'Opinion: The GOP just smuggled another awful provision into the big stimulus bill https://t.co/kju8xZphDF',
 1243124099083354112: "Japan says virus has made economy's condition 'severe', worst view in seven years https://t.co/zIWlOgdfj0 https://t.co/5ryagun91B"}

**Methods section table:**

In [69]:
# Per-outlet descriptive statistics with the columns listed below left out of the display.
hidden_columns = ['Unique_Tweets', 'Num_Retweets', 'Num_Truncated', 'Percent_Truncated']
(
    pd.read_csv('CUT_DATA/descriptive_dataset.csv', index_col = 0)
      .drop(columns = hidden_columns)
)

Unnamed: 0,Num_Tweets,Percent_Unique,Percent_Retweets
nytimes,3207,0.963,0.198
CNN,4615,0.796,0.039
bbcworld,1186,0.995,0.317
theeconomist,3179,0.681,0.022
reuters,12030,0.796,0.034
WSJ,2633,0.93,0.009
TIME,2398,0.626,0.047
ABC,4649,0.81,0.051
washingtonpost,4130,0.959,0.055
AP,1764,0.966,0.472


**Making dehydrated datasets:**

In [3]:
#the data used for the paper
# For every outlet, write the tweet IDs (the frame's index) to a text file, one ID per line.
for s in sources:
    csv_path = 'CUT_DATA/' + s + '.csv'
    txt_path = 'dehydrated_data/' + s + '.txt'
    df = pd.read_csv(csv_path, index_col = 'ID')
    with open(txt_path, 'w') as id_file:
        id_file.writelines(str(tweet_id) + '\n' for tweet_id in df.index)

In [4]:
#the extended data set - all tweets that were collected
# For every outlet, write the tweet IDs (the frame's index) to a text file, one ID per line.
for s in sources:
    csv_path = 'Final_Data/' + s + '.csv'
    txt_path = 'full_dehydrated_data/' + s + '.txt'
    df = pd.read_csv(csv_path, index_col = 'ID')
    with open(txt_path, 'w') as id_file:
        id_file.writelines(str(tweet_id) + '\n' for tweet_id in df.index)