In [1]:
import pandas as pd
import numpy as np
import tweepy
import nltk
import re
from time import sleep
import io
import json

In [2]:
# no tweets will be found for a date older than one week.
# https://docs.tweepy.org/en/stable/api.html#search-tweets


# this one needs academic access to twitter apiu
#https://docs.tweepy.org/en/stable/client.html#search-tweets


### Parameters

In [3]:

# Twitter stuff

credentials_path = './credentials.json'

with io.open(credentials_path) as f_in:
    credentials = json.load(f_in)


access_token = credentials["access_token"]
access_token_secret = credentials["access_token_secret"]


api_key = credentials["api_key"]
api_secret = credentials["api_secret"]
bearer_token = credentials["bearer_token"]

consumer_key = api_key
consumer_secret = api_secret

df_path = './tweets.csv'

In [4]:
tweet_queries_en = [
    "We should stop subsidizing meat",
    "We should subsidize green nutrition",
    "We have to increase production of meat",
    "We should exempt meat production from carbon taxes",
    "We should add a carbon tax to food production",
    "Meat alternatives should be encouraged",
    "Plant based should be encouraged",
    "Meat alternatives should be invested in",
    "Plant based should be invested in",
    "Meat alternatives should be subsidized",
    "Plant based should be subsidized",
    "We should introduce meatless mondays",
    "Vegetarian and vegan diets should be encouraged",
    "Vegetarian and vegan diets should be discouraged",
    "We should subsidize fruits and vegetables",
    "We should encourage more fruits and vegetable consumption",
    "We should discourage fruits and vegetable consumption",
] 

tweet_queries_da = [
    "Vi bør stoppe med at subsidiere kød",
    "Vi bør subsidiere grøn ernæring",
    "Vi skal øge produktionen af kød",
    "Vi bør fritage kød produktion fra co2 afgifter",
    "Vi bør tilføje en co2 afgift til fødevare produktionen",
    "Alternativer til kød bør fremmes og støttes",
    "Plantebaseret bør fremmes og støttes",
    "Der bør investeres i kødalternativer",
    "Der bør investeres i plantebaseret",
    "Alternativer til kød bør subsidieres",
    "Plantebaseret bør subsidieres",
    "Der bør opfordres til vegetarisk og vegansk kost",
    "Vegetarisk og vegansk kost bør frarådes",
    "Vi bør subsidiere frugt og grøntsager",
    "Vi bør fremme forbruget af frugt og grøntsager",
    "Vi bør fraråde forbrug af frugt og grøntsager"
]

key_words_en = [
    "healthy food",
    "food",
    "green food",
    "veganism",
    "vegetable",
    "good recipe",
    "climate friendly recipe",
    "climate friendly diet",
    "healthy recipe",
    "sustainable diet",
    "green diet",
    "diet with vegetable",
    "vegetables are healthy",
    "fruit and vegetable",
    "fruit",
    "vegetarian",
    "vegan",
    "good vegan recipe",
    "good vegetarian recipe",
    "organic",
    "plant food is great",
    "fresh and organic is good",
    "varied and balanced diet",
    "beans",
    "sustainable meat",
    "legumes",
    "whole grains",
    "local farmers market",
    "plant based",
    "meat alternative",
    "plant based diet",
    "green food is really good",
    "animals are not ingredients",
    "eat healthy food",
    "raw food diet",
    "whole foods",
    "flexitarian",
    "raw foodism",
    "rawism",
]

key_words_da = [
    "sund mad",
    "mad",
    "grøn mad",
    "veganism",
    "vegansk",
    "vegetar",
    "veganer",
    "god opskrift",
    "klima venlig opskrift",
    "klima venlig kost",
    "sund opskrift",
    "grøntsags kost",
    "frugt",
    "frugt og grønt",
    "god vegetarisk opskrift",
    "god vegansk opskrift",
    "økologisk",
    "platebaseret er godt",
    "frisk økologisk mad",
    "bønner",
    "bælgplanter",
    "bælgfrugter",
    "kød alternativer",
    "dyr er ikke mad",
    "fuldkost",
]

filters = '-filter:retweets -filter:links -filter:quote -filter:videos' 
num_tweets_per_query = 1000
min_tweet_length = 3
token_mention = '<MENTION>'

In [5]:
from nltk.corpus import stopwords # Import the stop word list
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
import itertools
nltk.download('punkt')
nltk.download('stopwords')

def generateQueries(queries, lang):
    nQueries = []
    stemmer = SnowballStemmer('english' if lang == 'en' else 'danish')
    stops = set(stopwords.words('english' if lang == 'en' else 'danish'))
    stops.update(['bør'])
    for q in queries:
        words = word_tokenize(q.lower())                        
        meaningful_words = [w for w in words if not w in stops]   
        words = ([stemmer.stem(w) for w in meaningful_words])
        
        for comb in itertools.combinations(meaningful_words, 2):
            nQueries.append(' '.join(comb))
        
    return nQueries

queries_da = [*generateQueries(tweet_queries_da, 'da'), *key_words_da]
queries_en = [*generateQueries(tweet_queries_en, 'en'), *key_words_en]

[nltk_data] Downloading package punkt to /home/mah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/mah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth, wait_on_rate_limit = True)

In [7]:
def scrape_tweets(api, queries, lang):    
    tweet_dicts = []
    
    for query in queries:
        print("="*100)
        print('Scraping tweets for:', query)
        res = tweepy.Cursor(api.search_tweets, q = query + ' ' + filters, lang = lang, count = num_tweets_per_query, tweet_mode = 'extended').items(num_tweets_per_query)
        curLen = len(tweet_dicts)
        for i, t in enumerate(res):
            tweet_dicts.append({
                "tweet": re.sub('@[^\s]+', token_mention, t.full_text),
                "id": t.id,
                #"topic": topic,
                "lang": t.lang,
            })
        print('Scraped',len(tweet_dicts) - curLen, 'tweets')
        
        # The 12 seconds could maybe be calculated depeding on number of queries
        #sleep(12*(i+1-num_tweets_per_query)/num_tweets_per_query) # Sleep for 12 seconds, so that we don't do more than 180 request over 15 mins

    return pd.DataFrame(tweet_dicts)

In [8]:
res_da = scrape_tweets(api, queries_da, 'da')
res_en = scrape_tweets(api, queries_en, 'en')

Scraping tweets for: stoppe subsidiere


Rate limit reached. Sleeping for: 14


Scraped 0 tweets
Scraping tweets for: stoppe kød
Scraped 1 tweets
Scraping tweets for: subsidiere kød
Scraped 0 tweets
Scraping tweets for: subsidiere grøn
Scraped 0 tweets
Scraping tweets for: subsidiere ernæring
Scraped 0 tweets
Scraping tweets for: grøn ernæring
Scraped 0 tweets
Scraping tweets for: øge produktionen
Scraped 0 tweets
Scraping tweets for: øge kød
Scraped 1 tweets
Scraping tweets for: produktionen kød
Scraped 1 tweets
Scraping tweets for: fritage kød
Scraped 0 tweets
Scraping tweets for: fritage produktion
Scraped 0 tweets
Scraping tweets for: fritage co2
Scraped 0 tweets
Scraping tweets for: fritage afgifter
Scraped 0 tweets
Scraping tweets for: kød produktion
Scraped 2 tweets
Scraping tweets for: kød co2
Scraped 5 tweets
Scraping tweets for: kød afgifter
Scraped 4 tweets
Scraping tweets for: produktion co2
Scraped 8 tweets
Scraping tweets for: produktion afgifter
Scraped 1 tweets
Scraping tweets for: co2 afgifter
Scraped 1 tweets
Scraping tweets for: tilføje co2
Scra

Rate limit reached. Sleeping for: 830


Scraped 301 tweets
Scraping tweets for: meat production
Scraped 364 tweets
Scraping tweets for: meat carbon
Scraped 157 tweets
Scraping tweets for: meat taxes
Scraped 72 tweets
Scraping tweets for: production carbon
Scraped 516 tweets
Scraping tweets for: production taxes
Scraped 372 tweets
Scraping tweets for: carbon taxes
Scraped 1000 tweets
Scraping tweets for: add carbon
Scraped 269 tweets
Scraping tweets for: add tax
Scraped 1000 tweets
Scraping tweets for: add food
Scraped 1000 tweets
Scraping tweets for: add production
Scraped 570 tweets
Scraping tweets for: carbon tax
Scraped 1000 tweets
Scraping tweets for: carbon food
Scraped 626 tweets
Scraping tweets for: carbon production
Scraped 516 tweets
Scraping tweets for: tax food
Scraped 1000 tweets
Scraping tweets for: tax production
Scraped 718 tweets
Scraping tweets for: food production
Scraped 1000 tweets
Scraping tweets for: meat alternatives
Scraped 260 tweets
Scraping tweets for: meat encouraged
Scraped 27 tweets
Scraping twe

Rate limit reached. Sleeping for: 796


Scraped 12 tweets
Scraping tweets for: alternatives subsidized
Scraped 3 tweets
Scraping tweets for: plant based
Scraped 1000 tweets
Scraping tweets for: plant subsidized
Scraped 9 tweets
Scraping tweets for: based subsidized
Scraped 52 tweets
Scraping tweets for: introduce meatless
Scraped 1 tweets
Scraping tweets for: introduce mondays
Scraped 2 tweets
Scraping tweets for: meatless mondays
Scraped 18 tweets
Scraping tweets for: vegetarian vegan
Scraped 1000 tweets
Scraping tweets for: vegetarian diets
Scraped 70 tweets
Scraping tweets for: vegetarian encouraged
Scraped 7 tweets
Scraping tweets for: vegan diets
Scraped 262 tweets
Scraping tweets for: vegan encouraged
Scraped 10 tweets
Scraping tweets for: diets encouraged
Scraped 3 tweets
Scraping tweets for: vegetarian vegan
Scraped 1000 tweets
Scraping tweets for: vegetarian diets
Scraped 70 tweets
Scraping tweets for: vegetarian discouraged
Scraped 1 tweets
Scraping tweets for: vegan diets
Scraped 262 tweets
Scraping tweets for: ve

Rate limit reached. Sleeping for: 816


Scraped 271 tweets
Scraping tweets for: diet with vegetable
Scraped 26 tweets
Scraping tweets for: vegetables are healthy
Scraped 104 tweets
Scraping tweets for: fruit and vegetable
Scraped 367 tweets
Scraping tweets for: fruit
Scraped 1000 tweets
Scraping tweets for: vegetarian
Scraped 1000 tweets
Scraping tweets for: vegan
Scraped 1000 tweets
Scraping tweets for: good vegan recipe
Scraped 43 tweets
Scraping tweets for: good vegetarian recipe
Scraped 7 tweets
Scraping tweets for: organic
Scraped 1000 tweets
Scraping tweets for: plant food is great
Scraped 19 tweets
Scraping tweets for: fresh and organic is good
Scraped 8 tweets
Scraping tweets for: varied and balanced diet
Scraped 7 tweets
Scraping tweets for: beans
Scraped 1000 tweets
Scraping tweets for: sustainable meat
Scraped 167 tweets
Scraping tweets for: legumes
Scraped 626 tweets
Scraping tweets for: whole grains
Scraped 288 tweets
Scraping tweets for: local farmers market
Scraped 182 tweets
Scraping tweets for: plant based
S

In [9]:
def clean_df(df):
    df = df.drop_duplicates(subset = ['tweet', 'id'])
    df = df[df.tweet.str.split().str.len() > min_tweet_length]
    return df

res_da = clean_df(res_da)
res_en = clean_df(res_en)
res = pd.concat([res_da, res_en])
res.to_csv('scraped_tweets_new.csv', index=False)

In [12]:
res

Unnamed: 0,tweet,id,lang,score
0,"<MENTION> <MENTION> Eneste grund til, at man i...",1509762775526125637,da,0
1,<MENTION> <MENTION> <MENTION> Det har intet me...,1511046007685062662,da,0
2,<MENTION> <MENTION> <MENTION> Forsimplet versi...,1511250737770213378,da,0
4,<MENTION> Man kunne også bare skære ned på kød...,1509829822821941266,da,0
5,<MENTION> <MENTION> Det bør vi i min optik ogs...,1511970210428952577,da,0
...,...,...,...,...
40135,Enquiring minds want to know! Which one are yo...,1509200044775272448,en,0
40136,Got asked if I'm a veggie/vegan yesterday and ...,1509167950976147471,en,0
40137,Today's flexitarian menu energetically flaunts...,1509156884988628994,en,0
40138,<MENTION> I went flexitarian- it’s much easier...,1509144494737870848,en,0


In [11]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

def pre(string, lang):
    stemmer = SnowballStemmer('english' if lang == 'en' else 'danish')
    stops = set(stopwords.words('english' if lang == 'en' else 'danish'))
    
    words = word_tokenize(re.sub('[^a-zA-Z]', ' ', string.lower().strip()))                        
    meaningful_words = [w for w in words if not w in stops]   
    return ' '.join([stemmer.stem(w) for w in meaningful_words])


def score(df):
    df = df.copy()
    for topic in df.topic.unique():
        print('Calculating scores for:', topic)
        lang = df[df.topic == topic].lang.values[0]
        corpus = list(map(lambda x: pre(x, lang),df[df.topic == topic].tweet.values))

        vectorizer = TfidfVectorizer()
        tfidf_mtx = vectorizer.fit_transform(corpus)
        topic_mtx = vectorizer.transform([pre(topic, lang)])
        
        cos_score = cosine_similarity(tfidf_mtx, topic_mtx)
        
        df.loc[df.topic == topic, ['score']] = cos_score
    print('Done')
    return df

scored = score(res)

        

AttributeError: 'DataFrame' object has no attribute 'topic'

In [None]:
top = scored.sort_values(by=['score'], ascending=False)

In [None]:
i = 0
for r in top.head(i+10).iloc[i:].iloc:
    print('Topic:', r.topic, 'Tweet:', r.tweet)
    print('='*120)