In [1]:
import pandas as pd
import numpy as np
import tweepy
import nltk
import re
from time import sleep
import io
import json

In [2]:
# no tweets will be found for a date older than one week.
# https://docs.tweepy.org/en/stable/api.html#search-tweets


# this one needs academic access to twitter apiu
#https://docs.tweepy.org/en/stable/client.html#search-tweets


 

### Parameters

In [2]:

# Twitter stuff

credentials_path = './credentials.json'

with io.open(credentials_path) as f_in:
    credentials = json.load(f_in)


access_token = credentials["access_token"]
access_token_secret = credentials["access_token_secret"]


api_key = credentials["api_key"]
api_secret = credentials["api_secret"]
bearer_token = credentials["bearer_token"]

consumer_key = api_key
consumer_secret = api_secret

df_path = './tweets.csv'

In [4]:
tweet_queries_en = [
    "We should stop subsidizing meat",
    "We should subsidize green nutrition",
    "We have to increase production of meat",
    "We should ban unsustainable food production",
    "We should add meat quotas",
    "We should add labels to sustainable food produce",
    "We should exempt meat production from carbon taxes",
    "We should tax unsustainable farming practices",
    "We should add a carbon tax to food production",
] 

tweet_queries_da = [
    "Vi bør stoppe med at subsidiere kød",
    "Vi bør subsidiere grøn ernæring",
    "Vi skal øge produktionen af kød",
    "Vi bør forbyde uholdbar fødevareproduktion",
    "Vi bør tilføje kvoter på kød",
    "Vi bør tilføje mærker til bæredygtige fødevarer",
    "Vi bør fritage kød produktion fra co2 afgifter",
    "Vi bør beskatte ikke bæredygtige landbrugs metoder",
    "Vi bør tilføje en co2 afgift til fødevare produktionen",
] 

filters = '-filter:retweets -filter:links -filter:quote -filter:videos' 
num_tweets_per_query = 1000
min_tweet_length = 10
tweet_lang = 'da'#'en' # 'da'
token_mention = '<MENTION>'

In [13]:
from nltk.corpus import stopwords # Import the stop word list
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
import itertools
nltk.download('punkt')
nltk.download('stopwords')

def generateQueries(queries, lang):
    nQueries = []
    stemmer = SnowballStemmer('english' if lang == 'en' else 'danish')
    stops = set(stopwords.words('english' if lang == 'en' else 'danish'))
    for q in queries:
        words = word_tokenize(q.lower())                        
        meaningful_words = [w for w in words if not w in stops]   
        words = ([stemmer.stem(w) for w in meaningful_words])
        
        for comb in itertools.combinations(meaningful_words, 2):
            nQueries.append((' '.join(comb), q))
        
    return nQueries

queries_da = generateQueries(tweet_queries_da, 'da')
queries_en = generateQueries(tweet_queries_en, 'en')

[nltk_data] Downloading package punkt to /home/docker/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/docker/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit = True)

In [7]:
def scrape_tweets(api, queries, lang):    
    tweet_dicts = []
    
    for (query, topic) in queries:
        print("="*100)
        print('Scraping tweets for:', query, topic)
        res = tweepy.Cursor(api.search_tweets, q = query + ' ' + filters, lang = lang, count = num_tweets_per_query, tweet_mode = 'extended').items(num_tweets_per_query)
        curLen = len(tweet_dicts)
        for i, t in enumerate(res):
            tweet_dicts.append({
                "tweet": re.sub('@[^\s]+', token_mention, t.full_text),
                "id": t.id,
                "topic": topic,
                "lang": t.lang,
            })
        print('Scraped',len(tweet_dicts) - curLen, 'tweets')
        
        # The 12 seconds could maybe be calculated depeding on number of queries
        #sleep(12*(i+1-num_tweets_per_query)/num_tweets_per_query) # Sleep for 12 seconds, so that we don't do more than 180 request over 15 mins

    return pd.DataFrame(tweet_dicts)

In [14]:
res_da = scrape_tweets(api, queries_da, 'da')
res_en = scrape_tweets(api, queries_en, 'en')

Scraping tweets for: stop subsidizing We should stop subsidizing meat
Scraped 325 tweets
Scraping tweets for: stop meat We should stop subsidizing meat
Scraped 1000 tweets
Scraping tweets for: subsidizing meat We should stop subsidizing meat
Scraped 11 tweets
Scraping tweets for: subsidize green We should subsidize green nutrition
Scraped 43 tweets
Scraping tweets for: subsidize nutrition We should subsidize green nutrition
Scraped 0 tweets
Scraping tweets for: green nutrition We should subsidize green nutrition
Scraped 32 tweets
Scraping tweets for: increase production We have to increase production of meat
Scraped 1000 tweets
Scraping tweets for: increase meat We have to increase production of meat
Scraped 230 tweets
Scraping tweets for: production meat We have to increase production of meat
Scraped 361 tweets
Scraping tweets for: ban unsustainable We should ban unsustainable food production
Scraped 13 tweets
Scraping tweets for: ban food We should ban unsustainable food production
S

Rate limit reached. Sleeping for: 504


Scraped 1000 tweets
Scraping tweets for: add meat We should add meat quotas
Scraped 823 tweets
Scraping tweets for: add quotas We should add meat quotas
Scraped 10 tweets
Scraping tweets for: meat quotas We should add meat quotas
Scraped 1 tweets
Scraping tweets for: add labels We should add labels to sustainable food produce
Scraped 150 tweets
Scraping tweets for: add sustainable We should add labels to sustainable food produce
Scraped 119 tweets
Scraping tweets for: add food We should add labels to sustainable food produce
Scraped 1000 tweets
Scraping tweets for: add produce We should add labels to sustainable food produce
Scraped 285 tweets
Scraping tweets for: labels sustainable We should add labels to sustainable food produce
Scraped 8 tweets
Scraping tweets for: labels food We should add labels to sustainable food produce
Scraped 280 tweets
Scraping tweets for: labels produce We should add labels to sustainable food produce
Scraped 37 tweets
Scraping tweets for: sustainable food 

Rate limit reached. Sleeping for: 765


Scraped 647 tweets
Scraping tweets for: carbon production We should add a carbon tax to food production
Scraped 526 tweets
Scraping tweets for: tax food We should add a carbon tax to food production
Scraped 1000 tweets
Scraping tweets for: tax production We should add a carbon tax to food production
Scraped 717 tweets
Scraping tweets for: food production We should add a carbon tax to food production
Scraped 1000 tweets


In [21]:
def clean_df(df):
    df = df.drop_duplicates(subset = ['tweet', 'id'])
    df = df[df.tweet.str.split().str.len() > min_tweet_length]
    return df

res_da = clean_df(res_da)
res_en = clean_df(res_en)
res = pd.concat([res_da, res_en])
res.to_csv('scraped_tweets_new.csv', index=False)

In [22]:
res['score'] = [0]*len(res)

In [24]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

def pre(string, lang):
    stemmer = SnowballStemmer('english' if lang == 'en' else 'danish')
    stops = set(stopwords.words('english' if lang == 'en' else 'danish'))
    
    words = word_tokenize(re.sub('[^a-zA-Z]', ' ', string.lower().strip()))                        
    meaningful_words = [w for w in words if not w in stops]   
    return ' '.join([stemmer.stem(w) for w in meaningful_words])


def score(df):
    df = df.copy()
    for topic in df.topic.unique():
        print('Calculating scores for:', topic)
        lang = df[df.topic == topic].lang.values[0]
        corpus = list(map(lambda x: pre(x, lang),df[df.topic == topic].tweet.values))

        vectorizer = TfidfVectorizer()
        tfidf_mtx = vectorizer.fit_transform(corpus)
        topic_mtx = vectorizer.transform([pre(topic, lang)])
        
        cos_score = cosine_similarity(tfidf_mtx, topic_mtx)
        
        df.loc[df.topic == topic, ['score']] = cos_score
    print('Done')
    return df

scored = score(res)

        

Calculating scores for: Vi bør stoppe med at subsidiere kød
Calculating scores for: Vi bør subsidiere grøn ernæring
Calculating scores for: Vi skal øge produktionen af kød
Calculating scores for: Vi bør forbyde uholdbar fødevareproduktion
Calculating scores for: Vi bør tilføje kvoter på kød
Calculating scores for: Vi bør tilføje mærker til bæredygtige fødevarer
Calculating scores for: Vi bør fritage kød produktion fra co2 afgifter
Calculating scores for: Vi bør beskatte ikke bæredygtige landbrugs metoder
Calculating scores for: Vi bør tilføje en co2 afgift til fødevare produktionen
Calculating scores for: We should stop subsidizing meat
Calculating scores for: We should subsidize green nutrition
Calculating scores for: We have to increase production of meat
Calculating scores for: We should ban unsustainable food production
Calculating scores for: We should add meat quotas
Calculating scores for: We should add labels to sustainable food produce
Calculating scores for: We should exempt 

In [25]:
top = scored.sort_values(by=['score'], ascending=False)

In [34]:
i = 0
for r in top.head(i+10).iloc[i:].iloc:
    print('Topic:', r.topic, 'Tweet:', r.tweet)
    print('='*120)

Topic: We should exempt meat production from carbon taxes Tweet: <MENTION> <MENTION> Substitute meat or artificial meat should be provided
reduce carbon emissions
make processed meat
Topic: We should add a carbon tax to food production Tweet: <MENTION> <MENTION> Add in cutting of production / estimates on products.
Topic: Vi skal øge produktionen af kød Tweet: <MENTION> <MENTION> <MENTION> Det har intet med krigen at gøre - afgifter og politiske beslutninger burde give en mere plantebaseret produktion nu og øge forskningen i kultiveret kød. Det vil også øge fødevaresikkerheden, da begge løsninger er og vil blive langt mere effektive både ift. energi og areal.
Topic: We should stop subsidizing meat Tweet: <MENTION> <MENTION> How about stop subsidizing oil and start subsidizing electric cars so they’re affordable.
Topic: We should add labels to sustainable food produce Tweet: <MENTION> Or you could just eat food that has no labels
Topic: We should stop subsidizing meat Tweet: <MENTION> T