# Analysing Twitter's reaction using the Twitter API, Sentiment analysis and survey tech

### First, install packages and import libraries

In [1]:
!pip install datasmoothie-tally-client -q
!pip install vaderSentiment -q

In [2]:
import tally
import os
import requests
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import json
import time
import pandas as pd
pd.set_option('display.max_colwidth', None)
import datetime


### Create the Tally dataset object and get the required tokens 

We store the tockens as environment variables, as these should never be shared in public repositories

In [3]:
# we store the tally key in an environment variable, get in touch to get your own key
dataset = tally.DataSet(api_key=os.environ.get('TALLY_API_KEY'))

# our twitter API access token
bearer_token = os.environ.get('TWITTER_BEARER_TOKEN')

headers = {"Authorization": "Bearer {}".format(bearer_token),"Content-Type": "application/json"}
search_api_path = 'https://api.twitter.com/2/tweets/search/recent'

# Part I: Run this section once for every date you are interested in

### Set the dates requested so that Twitter fetches results from a specific interval

This needs to be changhed every time the loop is run. For example, to get two dates worth of Tweets, changes this cell two times, with different dates, and run the loop below after each change.

In [4]:
required_date = datetime.datetime(2022, 12, 6, 18)
day_after = required_date.astimezone(datetime.timezone.utc) + datetime.timedelta(1)
start_time = required_date.astimezone(datetime.timezone.utc).isoformat()
end_time = day_after.isoformat()
(start_time, end_time)

('2022-12-06T18:00:00+00:00', '2022-12-07T18:00:00+00:00')

### Run a loop to fetch multiple pages of results, set the max at 5,000

We run this code each time we change the dates. We could parameterise it of course, but this is good for now.

In [5]:
analyzer = SentimentIntensityAnalyzer()

def run_twitter_search(search_term):
    search_params = {
        'query':'{} lang:en'.format(search_term), 
        'max_results':100, 
        'start_time':start_time,
        'end_time':end_time
    }
    search_result = requests.get(search_api_path,search_params,headers=headers)
    result_json = json.loads(search_result.content)

    result_pages = []
    if 'meta' not in result_json:
        print(result_json.keys())
        print(result_json)
    next_token = result_json['meta']['next_token']
    for i in range(0,50):
        time.sleep(0.1)
        next_page = requests.get(
            search_api_path,{
                'query':'{} lang:en'.format(search_term), 
                'max_results':100,
                'start_time':start_time,
                'end_time': end_time,
                'next_token':next_token},
            headers=headers)
        page_json = json.loads(next_page.content)
        result_pages.append(page_json['data'])
        if 'next_token' in page_json['meta']:
            next_token = page_json['meta']['next_token']
        else:
            return result_pages
    
    return result_pages


### Fetch twitter results for our three search terms

In [6]:
pages = run_twitter_search("#chatgpt -is:retweet -has:media")

Twitter returns json data objects with the history IDs, tweet ID and the tweet content.

In [9]:
pages[0][:3]

[{'edit_history_tweet_ids': ['1600540896210354176'],
  'id': '1600540896210354176',
  'text': '@RichardMCNgo https://t.co/1B4fnpTldH\nhttps://t.co/HorzIGI2XW\nhttps://t.co/dCcAY24Fvl\nthese domains are for sale！\n#OpenAI #ChatGPT'},
 {'edit_history_tweet_ids': ['1600540813074878465'],
  'id': '1600540813074878465',
  'text': '😅😂🤣💀\n\nThis is great. #ChatGPT prompted to have Donald Trump explain #bitcoin: https://t.co/BFYlZZiD4o'},
 {'edit_history_tweet_ids': ['1600540657722220550'],
  'id': '1600540657722220550',
  'text': '@ThankYourNiceAI @Miles_Brundage https://t.co/1B4fnpTldH\nhttps://t.co/HorzIGI2XW\nhttps://t.co/dCcAY24Fvl\nthese domains are for sale！\n#OpenAI #ChatGPT'}]

### Extract the results into one big dataframe, which we then clean up a bit

In [10]:
dfs = []
for page in pages:
    df = pd.read_json(json.dumps(page))
    dfs.append(df)
data = pd.concat(dfs)

# Tally doesn't allow varible names called "text"
data = data.rename(columns={'text':'tweet'})
data.drop('edit_history_tweet_ids', axis=1).head()

Unnamed: 0,id,tweet
0,1600540896210354176,@RichardMCNgo https://t.co/1B4fnpTldH\nhttps://t.co/HorzIGI2XW\nhttps://t.co/dCcAY24Fvl\nthese domains are for sale！\n#OpenAI #ChatGPT
1,1600540813074878464,😅😂🤣💀\n\nThis is great. #ChatGPT prompted to have Donald Trump explain #bitcoin: https://t.co/BFYlZZiD4o
2,1600540657722220544,@ThankYourNiceAI @Miles_Brundage https://t.co/1B4fnpTldH\nhttps://t.co/HorzIGI2XW\nhttps://t.co/dCcAY24Fvl\nthese domains are for sale！\n#OpenAI #ChatGPT
3,1600540507910082560,"I‘m participating in the #Pisces #AIGC Campaign to win $300 and #Freemint #NFT, thanks to @PiscesBaishui ’s #giveaway! #ChatGPT #OpenAI"
4,1600540463248789504,"Thanks to #ChatGPT, it truly feels like we're on the verge of a ""Quartz Crisis"" in tech"


### Run the sentiment analysis

In [11]:
data['sentiment'] = data['tweet'].apply(lambda tweet: analyzer.polarity_scores(tweet)['compound']*100)

Our data now has an extra column with the VADER compound score

In [13]:
data.head()

Unnamed: 0,edit_history_tweet_ids,id,tweet,sentiment
0,[1600540896210354176],1600540896210354176,@RichardMCNgo https://t.co/1B4fnpTldH\nhttps://t.co/HorzIGI2XW\nhttps://t.co/dCcAY24Fvl\nthese domains are for sale！\n#OpenAI #ChatGPT,0.0
1,[1600540813074878465],1600540813074878464,😅😂🤣💀\n\nThis is great. #ChatGPT prompted to have Donald Trump explain #bitcoin: https://t.co/BFYlZZiD4o,91.36
2,[1600540657722220550],1600540657722220544,@ThankYourNiceAI @Miles_Brundage https://t.co/1B4fnpTldH\nhttps://t.co/HorzIGI2XW\nhttps://t.co/dCcAY24Fvl\nthese domains are for sale！\n#OpenAI #ChatGPT,0.0
3,[1600540507910082560],1600540507910082560,"I‘m participating in the #Pisces #AIGC Campaign to win $300 and #Freemint #NFT, thanks to @PiscesBaishui ’s #giveaway! #ChatGPT #OpenAI",79.01
4,[1600540463248789505],1600540463248789504,"Thanks to #ChatGPT, it truly feels like we're on the verge of a ""Quartz Crisis"" in tech",49.39


### Save this date's data as CSV

In [12]:
data.to_csv('data/twitter-data/twitter_data-chatgpt-nomedia-{}-plus-one-day.csv'.format(start_time))

# Part II: Gather the results from Part I and analyse

Once Part I has been run once for every date you need in the dataset, combine the files here to run the calclations and sig-testing

### Read the files for different dates

We should have one file for each time we ran Part I of the notebook.


In [17]:
location = ('data/twitter-data/twitter_data-chatgpt-nomedia-2022-',  ':00:00+00:00-plus-one-day.csv')
data30nov = pd.read_csv('11-30T18'.join(location))
data30nov['date'] = 'Nov 30'
data1des = pd.read_csv('12-01T18'.join(location))
data1des['date'] = 'Dec 1'
data2des = pd.read_csv('12-02T18'.join(location))
data2des['date'] = 'Dec 2'
data3des = pd.read_csv('12-03T18'.join(location))
data3des['date'] = 'Dec 3'
data4des = pd.read_csv('12-04T18'.join(location))
data4des['date'] = 'Dec 4'
data5des = pd.read_csv('12-05T18'.join(location))
data5des['date'] = 'Dec 5'
data6des = pd.read_csv('12-06T14'.join(location))
data6des['date'] = 'Dec 6'


In [18]:
data = pd.concat([data30nov, data1des, data2des, data3des, data4des, data5des, data6des])

### Remove unnecessary columns and save as CSV

We don't need the tweet itself anymore, so we drop it and only keep the sentiment score.

We then write our results as a CSV, for our survey data crunching tool to import.

In [21]:
data = data.drop(['tweet', 'edit_history_tweet_ids'], axis=1)
data['sentiment'] = data['sentiment'].round(0)
data.to_csv('data/twitter-data/twitter_chatgpt_30nov_6dec.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,id,sentiment,date
0,0,1598332657032536064,0.0,Nov 30
1,1,1598332343600943104,0.0,Nov 30
2,2,1598330641594175488,65.0,Nov 30
3,3,1598330384424390656,-38.0,Nov 30
4,4,1598329285932191744,0.0,Nov 30


### Load our CSV file into our Tally dataset

In [24]:
dataset.use_csv('data/twitter-data/twitter_chatgpt_30nov_6dec.csv')

In [26]:
meta = json.loads(dataset.qp_meta)
mylist = meta['columns']['date']['values']
mylist.insert(0, mylist.pop())
meta['columns']['date']['values'] = mylist
dataset.qp_meta = json.dumps(meta)

### Create our new, derived variable

We want a new variable where the sentiment is stored as positive, negative and neutral.

In [73]:
cond_map = [
    [1, "Negative", {'sentiment':list(range(-100,-5))}],
    [2, "Neutral", {'sentiment':list(range(-5, 6))}],
    [3, "Positive", {'sentiment':list(range(6, 100))}]
]

### Create our crosstabs to run our analysis

In [75]:
mean_result = dataset.crosstab(x='sentiment', y='date', ci=[], stats=['mean'], sig_level=[0.05])

In [79]:
mean_result.replace('null', '')

Unnamed: 0_level_0,Question,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0
Unnamed: 0_level_1,Values,Nov 30,Dec 1,Dec 2,Dec 3,Dec 4,Dec 5,Dec 6
Unnamed: 0_level_2,Test-IDs,A,B,C,D,E,F,G
Question,Values,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
,Base,301.0,942.0,1321.0,1570.0,3054.0,3557.0,3137.0
,Mean,31.61794,26.3,26.7,25.6,25.8,25.9,26.8
,0.05,D.E.F,,,,,,


In [77]:
dataset.derive(name='positive_negative', label="Positive or negative", cond_maps=cond_map, qtype='single')

In [88]:
result = dataset.crosstab(x='positive_negative', y='date', ci=['c%'], sig_level=[0.05], decimals=0)

In [89]:
result.replace('null', '')

Unnamed: 0_level_0,Question,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0
Unnamed: 0_level_1,Values,Nov 30,Dec 1,Dec 2,Dec 3,Dec 4,Dec 5,Dec 6
Unnamed: 0_level_2,Test-IDs,A,B,C,D,E,F,G
Question,Values,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3
Positive or negative,Base,301.0,942.0,1321.0,1570.0,3054.0,3557.0,3137.0
Positive or negative,Negative,11.0,16.0,17.0,17.0,16.0,17.0,15.0
Positive or negative,0.05,,A,A,A,A,A,A
Positive or negative,Neutral,29.0,25.0,23.0,25.0,26.0,26.0,25.0
Positive or negative,0.05,C,,,,,C,
Positive or negative,Positive,60.0,59.0,60.0,58.0,58.0,57.0,59.0
Positive or negative,0.05,,,,,,,
