In [1]:
!pip install datasmoothie-tally-client -q
!pip install vaderSentiment -q

In [2]:
import tally
import os
import requests
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import json
import time
import pandas as pd
pd.set_option('display.max_colwidth', None)
import datetime


## Create the Tally dataset object and get the required tokens 

In [3]:
# we store the tally key in an environment variable, get in touch to get your own key
dataset = tally.DataSet(api_key=os.environ.get('TALLY_API_KEY'))

# our twitter API access token
bearer_token = os.environ.get('TWITTER_BEARER_TOKEN')

headers = {"Authorization": "Bearer {}".format(bearer_token),"Content-Type": "application/json"}
search_api_path = 'https://api.twitter.com/2/tweets/search/recent'

## Set the dates requested so that Twitter fetches results from a specific interval

In [4]:
required_date = datetime.datetime(2022, 11, 6, 18)
day_after = required_date.astimezone(datetime.timezone.utc) + datetime.timedelta(1)
start_time = required_date.astimezone(datetime.timezone.utc).isoformat()
end_time = day_after.isoformat()
(start_time, end_time)

('2022-11-06T18:00:00+00:00', '2022-11-07T18:00:00+00:00')

## Create a loop so we can fetch and clean Twitter data - we only get 100 results per request

In [5]:
analyzer = SentimentIntensityAnalyzer()

def run_twitter_search(search_term):
    search_params = {
        'query':'{} lang:en'.format(search_term), 
        'max_results':100, 
        'start_time':start_time,
        'end_time':end_time
    }
    search_result = requests.get(search_api_path,search_params,headers=headers)
    result_json = json.loads(search_result.content)

    result_pages = []
    if 'meta' not in result_json:
        print(result_json.keys())
        print(result_json)
    next_token = result_json['meta']['next_token']
    for i in range(0,15):
        time.sleep(0.5)
        next_page = requests.get(
            search_api_path,{
                'query':'{} lang:en'.format(search_term), 
                'max_results':100,
                'start_time':start_time,
                'end_time': end_time,
                'next_token':next_token},
            headers=headers)
        page_json = json.loads(next_page.content)
        result_pages.append(page_json['data'])
        next_token = page_json['meta']['next_token']
    
    data = [] 
    tweet_id = []
    for page in result_pages:
        data = data + [j['text'] for j in page]
        tweet_id = tweet_id + [j['id'] for j in page]

    data = [i.replace("RT","").strip() for i in data]
    data = [i.replace(""," ").strip() for i in data]
    data = [re.sub('\B@\w+', "", i).replace(":", "").strip() for i in data]

    sentiment = [analyzer.polarity_scores(d)['compound']*100 for d in data]
    s = pd.Series(data=sentiment, name='sentiment').round(0)
    tweet_id = pd.Series(data=tweet_id, name='tweet_id')
    text = pd.Series(data=data, name='tweet')
    term = pd.Series(data=[search_term]*len(data), name='search term')
    df = pd.concat([term, tweet_id, s, text], axis=1)
    return df


## Fetch twitter results for our three search terms

In [13]:
first_data = run_twitter_search("rishi sunak")
second_data = run_twitter_search("keir starmer")
third_data = run_twitter_search("matt hancock")


In [None]:
data = pd.concat([first_data, second_data, third_data])

## Save data as CSV and load the CSV file into Tally

In [None]:
data.to_csv('data/twitter-data/twitter_data-{}-plus-one-day.csv'.format(start_time))

In [None]:
dataset.use_csv('data/twitter-data/twitter_data-{}-plus-one-day.csv'.format(start_time))

## Examples of how VADER works

In [14]:
print(analyzer.polarity_scores("I love Geir because he is clever."))

{'neg': 0.0, 'neu': 0.41, 'pos': 0.59, 'compound': 0.802}


In [15]:
print(analyzer.polarity_scores("Geir is getting on my nerves."))

{'neg': 0.219, 'neu': 0.781, 'pos': 0.0, 'compound': -0.1027}


In [16]:
print(analyzer.polarity_scores("Geir is the author of this post."))

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}


## Create our new, derived variable

In [51]:
cond_map = [
    [1, "Negative", {'sentiment':list(range(-100,-5))}],
    [2, "Neutral", {'sentiment':list(range(-5, 6))}],
    [3, "Positive", {'sentiment':list(range(6, 100))}]
]

## Create our crosstabs to run our analysis

In [23]:
dataset.crosstab(x='sentiment', y='search term', ci=[], stats=['mean'], sig_level=[0.05])

Unnamed: 0_level_0,Question,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0
Unnamed: 0_level_1,Values,keir starmer,matt hancock,rishi sunak
Unnamed: 0_level_2,Test-IDs,A,B,C
Question,Values,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3
,Base,496.0,499.0,496.0
,Mean,-4.939516,-5.132265,-12.4
,0.05,C,C,


In [58]:
dataset.set_variable_text(name='search term', new_text='Search term')

In [52]:
dataset.derive(name='positive_negaive', label="Positive or negative", cond_maps=cond_map, qtype='single')

In [61]:
result = dataset.crosstab(x='positive_negaive', y='search term', ci=['c%'], stats=['mean'], sig_level=[])

<table border="1" class="dataframe">  <thead>    <tr>      <th></th>      <th>Question</th>      <th colspan="3" halign="left">Search term</th>    </tr>    <tr>      <th></th>      <th>Values</th>      <th>keir starmer</th>      <th>matt hancock</th>      <th>rishi sunak</th>    </tr>    <tr>      <th></th>      <th>Test-IDs</th>      <th>A</th>      <th>B</th>      <th>C</th>    </tr>    <tr>      <th>Question</th>      <th>Values</th>      <th></th>      <th></th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th rowspan="9" valign="top">Positive or negative</th>      <th>Base</th>      <td>496.0</td>      <td>499.0</td>      <td>496.0</td>    </tr>    <tr>      <th>Negative</th>      <td>38.508065</td>      <td>38.076152</td>      <td>56.25</td>    </tr>    <tr>      <th></th>      <td></td>      <td></td>      <td>A.B</td>    </tr>    <tr>      <th>Neutral</th>      <td>26.612903</td>      <td>40.280561</td>      <td>14.717742</td>    </tr>    <tr>      <th></th>      <td>C</td>      <td>A.C</td>      <td></td>    </tr>    <tr>      <th>Positive</th>      <td>34.879032</td>      <td>21.643287</td>      <td>29.032258</td>    </tr>    <tr>      <th></th>      <td>B.C</td>      <td></td>      <td>B</td>    </tr>    <tr>      <th>Mean</th>      <td>1.96371</td>      <td>1.835671</td>      <td>1.727823</td>    </tr>    <tr>      <th></th>      <td>B.C</td>      <td>C</td>      <td></td>    </tr>  </tbody></table>