In [None]:
import pandas as pd
import numpy as np
import tweepy
import nltk
import re
from time import sleep
import io
import json

In [None]:
# The standard search endpoint (API.search_tweets) only returns tweets from roughly the last 7 days:
# https://docs.tweepy.org/en/stable/api.html#search-tweets


# This alternative (tweepy.Client search) requires Academic Research access to the Twitter API:
#https://docs.tweepy.org/en/stable/client.html#search-tweets


### Parameters

In [None]:

# Twitter API credentials and output location

credentials_path = './credentials.json'

# The JSON file must contain every key looked up below.
with io.open(credentials_path) as f_in:
    credentials = json.load(f_in)

# User-context tokens
access_token = credentials["access_token"]
access_token_secret = credentials["access_token_secret"]

# App-level key pair and bearer token
api_key, api_secret = credentials["api_key"], credentials["api_secret"]
bearer_token = credentials["bearer_token"]

# The same key pair again, under the "consumer" names
consumer_key, consumer_secret = api_key, api_secret

df_path = './tweets.csv'

In [None]:
# English policy statements about meat, plant-based food and diets.
# They are not searched verbatim: generateQueries() turns each one into
# two-word search queries.
tweet_queries_en = [
    "We should stop subsidizing meat",
    "We should subsidize green nutrition",
    "We have to increase production of meat",
    "We should exempt meat production from carbon taxes",
    "We should add a carbon tax to food production",
    "Meat alternatives should be encouraged",
    "Plant based should be encouraged",
    "Meat alternatives should be invested in",
    "Plant based should be invested in",
    "Meat alternatives should be subsidized",
    "Plant based should be subsidized",
    "We should introduce meatless mondays",
    "Vegetarian and vegan diets should be encouraged",
    "Vegetarian and vegan diets should be discouraged",
    "We should subsidize fruits and vegetables",
    "We should encourage more fruits and vegetable consumption",
    "We should discourage fruits and vegetable consumption",
] 

# Danish policy statements on the same themes as tweet_queries_en.
# generateQueries() turns each one into two-word search queries.
tweet_queries_da = [
    "Vi bør stoppe med at subsidiere kød",
    "Vi bør subsidiere grøn ernæring",
    "Vi skal øge produktionen af kød",
    "Vi bør fritage kød produktion fra co2 afgifter",
    "Vi bør tilføje en co2 afgift til fødevare produktionen",
    "Alternativer til kød bør fremmes og støttes",
    "Plantebaseret bør fremmes og støttes",
    "Der bør investeres i kødalternativer",
    "Der bør investeres i plantebaseret",
    "Alternativer til kød bør subsidieres",
    "Plantebaseret bør subsidieres",
    "Der bør opfordres til vegetarisk og vegansk kost",
    "Vegetarisk og vegansk kost bør frarådes",
    "Vi bør subsidiere frugt og grøntsager",
    "Vi bør fremme forbruget af frugt og grøntsager",
    "Vi bør fraråde forbrug af frugt og grøntsager"
]

# English keywords and short phrases that are used as search queries as-is
# (they are appended unchanged to the generated English queries).
key_words_en = [
    "healthy food",
    "food",
    "green food",
    "veganism",
    "vegetable",
    "good recipe",
    "climate friendly recipe",
    "climate friendly diet",
    "healthy recipe",
    "sustainable diet",
    "green diet",
    "diet with vegetable",
    "vegetables are healthy",
    "fruit and vegetable",
    "fruit",
    "vegetarian",
    "vegan",
    "good vegan recipe",
    "good vegetarian recipe",
    "organic",
    "plant food is great",
    "fresh and organic is good",
    "varied and balanced diet",
    "beans",
    "sustainable meat",
    "legumes",
    "whole grains",
    "local farmers market",
    "plant based",
    "meat alternative",
    "plant based diet",
    "green food is really good",
    "animals are not ingredients",
    "eat healthy food",
    "raw food diet",
    "whole foods",
    "flexitarian",
    "raw foodism",
    "rawism",
]

# Danish keywords and short phrases that are used as search queries as-is
# (they are appended unchanged to the generated Danish queries).
key_words_da = [
    "sund mad",
    "mad",
    "grøn mad",
    "veganism",
    "vegansk",
    "vegetar",
    "veganer",
    "god opskrift",
    "klima venlig opskrift",
    "klima venlig kost",
    "sund opskrift",
    "grøntsags kost",
    "frugt",
    "frugt og grønt",
    "god vegetarisk opskrift",
    "god vegansk opskrift",
    "økologisk",
    # Was misspelled "platebaseret", which does not search for plant-based content.
    "plantebaseret er godt",
    "frisk økologisk mad",
    "bønner",
    "bælgplanter",
    "bælgfrugter",
    "kød alternativer",
    "dyr er ikke mad",
    "fuldkost",
]

# Search operators appended to every query string in scrape_tweets().
filters = '-filter:retweets -filter:links -filter:quote -filter:videos' 
# Upper bound on tweets collected per query (also passed as the page `count`).
num_tweets_per_query = 1000
# clean_df() keeps only tweets with MORE than this many whitespace-separated words.
min_tweet_length = 3
# Placeholder that replaces every @mention in the scraped tweet text.
token_mention = '<MENTION>'

In [None]:
from nltk.corpus import stopwords # Import the stop word list
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
import itertools
# Resources needed by word_tokenize() and stopwords.words() in this notebook.
# Recent NLTK releases (3.9+) load the tokenizer models from 'punkt_tab';
# 'punkt' is kept so that older releases keep working as before.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

def generateQueries(queries, lang):
    """Expand full-sentence statements into two-word search queries.

    Each statement is lower-cased, tokenised and stripped of stop words.
    Every pair of the remaining words (kept in sentence order) becomes one
    query string. The words are deliberately left unstemmed.

    Parameters
    ----------
    queries : iterable of str
        Statements to expand.
    lang : str
        ``'en'`` selects the English stop-word list; any other value selects
        the Danish one.

    Returns
    -------
    list of str
        Unique two-word queries in order of first appearance. Pairs that
        several statements share are emitted once, so the same search is not
        sent repeatedly.
    """
    stops = set(stopwords.words('english' if lang == 'en' else 'danish'))
    # Also drop the Danish modal 'bør', exactly as the stop words are dropped.
    stops.add('bør')

    # A dict keeps insertion order and silently drops repeated pairs.
    unique_queries = {}
    for q in queries:
        meaningful_words = [w for w in word_tokenize(q.lower()) if w not in stops]
        for comb in itertools.combinations(meaningful_words, 2):
            unique_queries[' '.join(comb)] = None

    return list(unique_queries)

# Search phrases per language: generated word pairs first, then the fixed keywords.
queries_da = generateQueries(tweet_queries_da, 'da') + key_words_da
queries_en = generateQueries(tweet_queries_en, 'en') + key_words_en

In [None]:
# Build the OAuth handler from the consumer key pair, attach the user access
# token, and create the API client used for all searches.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# wait_on_rate_limit presumably makes the client pause when the rate limit is
# hit instead of raising — confirm against the installed tweepy version.
api = tweepy.API(auth, wait_on_rate_limit = True)

In [None]:
def scrape_tweets(api, queries, lang):
    """Search Twitter for every query and collect the matching tweets.

    Parameters
    ----------
    api : tweepy.API
        Authenticated client; its ``search_tweets`` method is paged through.
        Rate limiting is left to this client.
    queries : iterable of str
        Search phrases. The module-level ``filters`` string is appended to
        each of them.
    lang : str
        Language code passed to the search endpoint (e.g. ``'en'``, ``'da'``).

    Returns
    -------
    pandas.DataFrame
        One row per returned tweet with the columns ``tweet`` (full text with
        every @mention replaced by ``token_mention``), ``id``, ``topic`` (the
        query that found the tweet) and ``lang``. The columns are present
        even when no tweet was found.
    """
    columns = ["tweet", "id", "topic", "lang"]
    # The standard search endpoint returns at most 100 tweets per page; the
    # cursor keeps paging until `num_tweets_per_query` items were yielded.
    page_size = min(num_tweets_per_query, 100)
    tweet_dicts = []

    for query in queries:
        print("="*100)
        print('Scraping tweets for:', query)
        res = tweepy.Cursor(
            api.search_tweets,
            q = query + ' ' + filters,
            lang = lang,
            count = page_size,
            tweet_mode = 'extended',
        ).items(num_tweets_per_query)
        curLen = len(tweet_dicts)
        for t in res:
            tweet_dicts.append({
                # Raw string: '\s' in a normal literal is an invalid escape sequence.
                "tweet": re.sub(r'@[^\s]+', token_mention, t.full_text),
                "id": t.id,
                # The query doubles as the topic the tweet is later scored against.
                "topic": query,
                "lang": t.lang,
            })
        print('Scraped', len(tweet_dicts) - curLen, 'tweets')

    # Explicit columns keep the frame usable downstream even when it is empty.
    return pd.DataFrame(tweet_dicts, columns=columns)

In [None]:
# Run the scrape once per language: Danish first, then English.
res_da = scrape_tweets(api=api, queries=queries_da, lang='da')
res_en = scrape_tweets(api=api, queries=queries_en, lang='en')

In [None]:
def clean_df(df):
    """Drop repeated tweets and tweets that are too short.

    Rows sharing both ``tweet`` and ``id`` are collapsed to their first
    occurrence. After that, only tweets with more than ``min_tweet_length``
    whitespace-separated words are kept. The input frame is not modified.
    """
    deduplicated = df.drop_duplicates(subset = ['tweet', 'id'])
    word_counts = deduplicated['tweet'].str.split().str.len()
    long_enough = word_counts > min_tweet_length
    return deduplicated[long_enough]

# Clean each language separately, then stack both into one frame and save it.
res_da = clean_df(res_da)
res_en = clean_df(res_en)
# ignore_index gives the combined frame unique row labels; without it the
# Danish and English rows share labels, which makes label-based access ambiguous.
res = pd.concat([res_da, res_en], ignore_index=True)
res.to_csv('scraped_tweets_new.csv', index=False)

In [None]:
# Show the combined, cleaned tweets as this cell's output.
res

In [None]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# NLTK language name -> (stemmer, stop-word set); filled lazily by _pre_resources().
_PRE_RESOURCES = {}


def _pre_resources(lang):
    """Return the (stemmer, stop-word set) pair for ``lang``, built once per language."""
    language = 'english' if lang == 'en' else 'danish'
    if language not in _PRE_RESOURCES:
        _PRE_RESOURCES[language] = (
            SnowballStemmer(language),
            set(stopwords.words(language)),
        )
    return _PRE_RESOURCES[language]


def pre(string, lang):
    """Normalise a text for TF-IDF: letters only, no stop words, stemmed.

    Parameters
    ----------
    string : str
        Raw text (a tweet or a topic phrase).
    lang : str
        ``'en'`` selects English resources; any other value selects Danish.

    Returns
    -------
    str
        The stemmed, lower-cased, non-stop-word tokens joined by single spaces.
    """
    stemmer, stops = _pre_resources(lang)

    # [\W\d_] matches every character that is not a letter. Unlike the ASCII
    # class [^a-zA-Z] it keeps letters such as æ, ø and å, so Danish words are
    # no longer split apart.
    letters_only = re.sub(r'[\W\d_]', ' ', string.lower().strip())
    meaningful_words = [w for w in word_tokenize(letters_only) if w not in stops]
    return ' '.join(stemmer.stem(w) for w in meaningful_words)


def score(df):
    """Score every tweet by TF-IDF cosine similarity to its own topic.

    For each distinct ``topic`` a TF-IDF model is fitted on the preprocessed
    tweets of that topic. The preprocessed topic text is projected into the
    same space, and each tweet gets its cosine similarity to the topic.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``tweet``, ``topic`` and ``lang``. The
        language of a topic is taken from its first row.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with an added float column ``score``. The input
        frame is left untouched.
    """
    df = df.copy()
    # Scores are collected positionally and attached once at the end. This
    # avoids assigning a 2-D array to a not-yet-existing column and does not
    # depend on the index labels being unique.
    scores = np.full(len(df), np.nan)

    for topic in df['topic'].unique():
        print('Calculating scores for:', topic)
        in_topic = (df['topic'] == topic).to_numpy()
        topic_rows = df[in_topic]
        lang = topic_rows['lang'].iloc[0]
        corpus = [pre(text, lang) for text in topic_rows['tweet']]

        vectorizer = TfidfVectorizer()
        tfidf_mtx = vectorizer.fit_transform(corpus)
        topic_mtx = vectorizer.transform([pre(topic, lang)])

        # cosine_similarity yields one row per tweet and a single column.
        cos_score = cosine_similarity(tfidf_mtx, topic_mtx)
        scores[in_topic] = np.asarray(cos_score).ravel()

    df['score'] = scores
    print('Done')
    return df

# Attach a similarity score to every scraped tweet.
scored = score(df=res)

        

In [None]:
# Rank the tweets from most to least similar to their topic.
top = scored.sort_values('score', ascending=False)

In [None]:
# Print ten of the highest-scoring tweets, starting at offset `i`
# (raise `i` in steps of 10 to page through the ranking).
i = 0
# itertuples() walks the rows directly; iterating over the `.iloc` indexer
# object only worked through Python's legacy __getitem__ iteration fallback.
for r in top.iloc[i:i+10].itertuples(index=False):
    print('Topic:', r.topic, 'Tweet:', r.tweet)
    print('='*120)