In [1]:
import pandas as pd
import numpy as np
import tweepy
import nltk
import re
from time import sleep
import io
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.corpus import stopwords # Import the stop word list
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
import itertools
from sentence_transformers import SentenceTransformer, util



In [12]:
# filter_tweets keeps only tweets with MORE than this many whitespace-separated words.
min_tweet_length = 10


# English topic statements that tweets are scored against.
topics_en = [
    "We should stop subsidizing meat",
    "We should subsidize green nutrition",
    "We have to increase production of meat",
    "We should exempt meat production from carbon taxes",
    "We should add a carbon tax to food production",
    "Meat alternatives should be encouraged",
    "Plant based should be encouraged",
    "Meat alternatives should be invested in",
    "Plant based should be invested in",
    "Meat alternatives should be subsidized",
    "Plant based should be subsidized",
    "We should introduce meatless mondays",
    "Vegetarian and vegan diets should be encouraged",
    "Vegetarian and vegan diets should be discouraged",
    "We should subsidize fruits and vegetables",
    "We should encourage more fruits and vegetable consumption",
    "We should discourage fruits and vegetable consumption",
] 

# Danish-language topic statements.
# NOTE(review): this list has 16 entries while topics_en has 17 -- there is no
# Danish entry matching "We should introduce meatless mondays", so the two
# lists do not line up index-by-index from position 11 on. TODO confirm
# whether the omission is intentional.
topics_da = [
    "Vi bør stoppe med at subsidiere kød",
    "Vi bør subsidiere grøn ernæring",
    "Vi skal øge produktionen af kød",
    "Vi bør fritage kød produktion fra co2 afgifter",
    "Vi bør tilføje en co2 afgift til fødevare produktionen",
    "Alternativer til kød bør fremmes og støttes",
    "Plantebaseret bør fremmes og støttes",
    "Der bør investeres i kødalternativer",
    "Der bør investeres i plantebaseret",
    "Alternativer til kød bør subsidieres",
    "Plantebaseret bør subsidieres",
    "Der bør opfordres til vegetarisk og vegansk kost",
    "Vegetarisk og vegansk kost bør frarådes",
    "Vi bør subsidiere frugt og grøntsager",
    "Vi bør fremme forbruget af frugt og grøntsager",
    "Vi bør fraråde forbrug af frugt og grøntsager"
]

# Scraped tweets; the `tweet` and `lang` columns are read by the code below.
tweet_df = pd.read_csv('scraped_tweets_new.csv')
# Previously scored tweets; print_df_scores reads their `topic`, `tweet` and `score` columns.
top_df = pd.read_csv('scored_top_100_tweets.csv')

In [8]:
def pre(string, lang):
    """Normalise a text for TF-IDF scoring.

    The text is lower-cased, every non-letter is replaced by a space, stop
    words are removed and the remaining tokens are stemmed.

    Parameters
    ----------
    string : str
        Raw text (a tweet or a topic statement).
    lang : str
        ``'en'`` selects the English stemmer and stop words; any other value
        selects the Danish ones.

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces.
    """
    is_english = lang == 'en'
    nltk_lang = 'english' if is_english else 'danish'
    stemmer = SnowballStemmer(nltk_lang)
    stops = set(stopwords.words(nltk_lang))

    # Danish uses the letters æ, ø and å. Filtering on a-z only would cut a
    # word such as "kød" into the fragments "k" and "d", which breaks both the
    # stop-word lookup and the stemmer, so those letters are kept for Danish.
    non_letters = '[^a-zA-Z]' if is_english else '[^a-zA-ZæøåÆØÅ]'
    cleaned = re.sub(non_letters, ' ', string.lower().strip())

    words = word_tokenize(cleaned)
    meaningful_words = [w for w in words if w not in stops]
    return ' '.join(stemmer.stem(w) for w in meaningful_words)

def filter_tweets(df, min_length=None):
    """Keep only the rows whose tweet has more than ``min_length`` words.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column of strings.
    min_length : int, optional
        Word-count threshold (exclusive). When omitted, the notebook-level
        ``min_tweet_length`` setting is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose whitespace-split tweet has more than
        ``min_length`` parts. Rows with a missing tweet are dropped because
        their word count is NaN and the comparison is therefore False.
    """
    if min_length is None:
        # Fall back to the notebook-wide setting so existing callers are unaffected.
        min_length = min_tweet_length
    word_counts = df.tweet.str.split().str.len()
    return df[word_counts > min_length]

def score_tfidf(df, topic, lang):
    """Score tweets of one language against a topic with TF-IDF cosine similarity.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``tweet`` and ``lang`` columns.
    topic : str
        Topic statement to compare every tweet with.
    lang : str
        Language code; only rows whose ``lang`` equals this value are scored.

    Returns
    -------
    pandas.DataFrame
        A copy of the length-filtered input with an added float ``score``
        column. Rows in other languages keep a score of 0.0.
    """
    df = filter_tweets(df).copy()
    # Start from float zeros so the cosine scores can be written into the
    # column without forcing an int -> float dtype change.
    df['score'] = 0.0

    in_lang = df.lang == lang
    corpus = [pre(tweet, lang) for tweet in df.loc[in_lang, 'tweet'].values]

    # With no documents, or only empty ones after preprocessing, the
    # vectorizer has no vocabulary to fit and raises ValueError; there is
    # nothing to score in that case, so return the all-zero frame.
    if not any(corpus):
        return df

    vectorizer = TfidfVectorizer()
    tfidf_mtx = vectorizer.fit_transform(corpus)
    topic_mtx = vectorizer.transform([pre(topic, lang)])

    # cosine_similarity returns one row per tweet and a single column (the
    # topic); flatten it so it lines up with the selected rows.
    cos_score = cosine_similarity(tfidf_mtx, topic_mtx).ravel()

    df.loc[in_lang, 'score'] = cos_score
    return df

#scored = score_tfidf(tweet_df, topics_en[9], 'en')

#top = scored.sort_values(by=['score'], ascending=False)

In [9]:
def print_df_scores(df, start=0, count=10):
    """Print a window of the highest-scoring rows, one block per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``topic``, ``tweet`` and ``score`` columns.
    start : int, optional
        Position (in descending score order) of the first row to print.
        Defaults to 0, i.e. the best-scoring row.
    count : int, optional
        Maximum number of rows to print. Defaults to 10.
    """
    top = df.sort_values(by=['score'], ascending=False)
    window = top.iloc[start:start + count]
    # Restrict to the three printed columns so other column names (for
    # example "Unnamed: 0") cannot interfere with the tuple field names.
    for row in window[['topic', 'tweet', 'score']].itertuples(index=False):
        print('Topic:', row.topic, 'Tweet:', row.tweet, 'Score:', row.score)
        print('='*120)

In [10]:

def qa_model_score(df, topics, dot_score = True):
    """Score every tweet against every topic with a sentence-embedding model.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column.
    topics : iterable of str
        Topic statements used as queries.
    dot_score : bool, optional
        If True (default) use the dot product of the embeddings, otherwise
        use cosine similarity.

    Returns
    -------
    list of pandas.DataFrame
        One copy of ``df`` per topic, in the order of ``topics``, each with a
        ``score`` column (similarity of the tweet to the topic) and a
        ``topic`` column (the topic statement). The input frame is not modified.
    """
    #Load the model
    model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

    # The tweets are embedded once and the embeddings reused for every topic.
    docs = df.tweet.values
    doc_emb = model.encode(docs)

    score_dfs = []
    for topic in topics:
        #Encode the query
        query_emb = model.encode(topic)

        if dot_score:
            #Compute dot score between query and all document embeddings
            scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()
        else:
            # Encoding a single string gives a 1-D vector, but sklearn's
            # cosine_similarity only accepts 2-D input, so add a row axis.
            scores = cosine_similarity(query_emb.reshape(1, -1), doc_emb)[0].tolist()

        # assign() builds a fresh frame per topic; `score` is listed before
        # `topic` to keep the column order the per-topic frames had before.
        score_dfs.append(df.assign(score=scores, topic=topic))
    return score_dfs
        


In [6]:
# One scored frame per English topic, in the order of topics_en.
scores = qa_model_score(df=tweet_df, topics=topics_en)

In [11]:
def top_tweets(dfs, top = 100):
    """Stack the ``top`` best-scoring rows of each frame into one frame.

    Each frame in ``dfs`` is ordered by descending ``score`` and cut to its
    first ``top`` rows; the pieces are concatenated in input order with their
    original index labels.
    """
    best_per_frame = [
        frame.sort_values(by=['score'], ascending=False).iloc[:top]
        for frame in dfs
    ]
    return pd.concat(best_per_frame)

In [14]:
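# Disabled export step: uncomment to write the top-100 rows per topic to
# 'scored_top_100_tweets.csv' (the same file name that is read into top_df).
#top_tweets(scores).to_csv('scored_top_100_tweets.csv')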

In [13]:
# Show the ten best-scoring rows of the previously scored tweets.
print_df_scores(df=top_df)

Topic: Meat alternatives should be encouraged Tweet: in all seriousness meat alternatives are great and should be used within moderation 👍🏻 Score: 0.9079667925834656
Topic: We should introduce meatless mondays Tweet: <MENTION> expand meatless Mondays Score: 0.8971047401428223
Topic: We should introduce meatless mondays Tweet: <MENTION> <MENTION> <MENTION> Meatless Mondays! 😂🔥 Score: 0.8724989295005798
Topic: Meat alternatives should be encouraged Tweet: <MENTION> Where can I start seeing about meat alternatives? Score: 0.8672293424606323
Topic: Vegetarian and vegan diets should be encouraged Tweet: <MENTION> Vegetarian with vegan options. Score: 0.8469443917274475
Topic: Vegetarian and vegan diets should be discouraged Tweet: <MENTION> not vegan VEGETARIAN Score: 0.8436784744262695
Topic: Vegetarian and vegan diets should be encouraged Tweet: <MENTION> not vegan VEGETARIAN Score: 0.8409231901168823
Topic: Meat alternatives should be invested in Tweet: <MENTION> Where can I start seeing

In [14]:
# Draw 25 English tweets; the fixed seed makes the draw identical on every run.
sample = tweet_df[tweet_df.lang == 'en'].sample(25, random_state=42)

In [15]:
# Pair each sampled tweet with a randomly chosen English topic; a seeded
# generator keeps the assignment reproducible across runs.
sample['topic'] = np.random.default_rng(42).choice(topics_en, len(sample))
# These rows were never scored; use a float zero so the column has the same
# dtype as real similarity scores.
sample['score'] = 0.0

In [16]:
def random_top_samples(dfs, samples, top = 100, random_state = None):
    """Randomly pick rows from the pooled top-scoring rows of several frames.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Frames with a ``score`` column.
    samples : int
        Number of rows to draw from the pooled rows (without replacement).
    top : int, optional
        How many of the best-scoring rows of each frame enter the pool.
    random_state : int or numpy.random.RandomState, optional
        Seed forwarded to ``DataFrame.sample``. The default ``None`` keeps
        the draw unseeded, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        ``samples`` rows drawn from the pool.
    """
    pool = pd.concat(
        [df.sort_values(by=['score'], ascending=False)[:top] for df in dfs]
    )
    return pool.sample(samples, random_state=random_state)

In [23]:
# Append 25 previously scored tweets to the zero-score rows; the fixed seed
# makes the selection reproducible.
# NOTE: this cell rebinds `sample`, so running it again without first
# re-running the cells that create `sample` appends another 25 rows.
sample = pd.concat([sample, top_df.sample(25, random_state=42)])
# Drop any column whose name starts with "Unnamed".
sample = sample.loc[:, ~sample.columns.str.contains('^Unnamed')]
sample

Unnamed: 0,tweet,id,lang,topic,score
20015,"When u rent shit out, u still need permission ...",1.511039e+18,en,We should exempt meat production from carbon t...,0.000000
4610,<MENTION> <MENTION> 2/2 the cattle after all a...,1.511298e+18,en,We should exempt meat production from carbon t...,0.000000
3703,<MENTION> Could you share the literature which...,1.511066e+18,en,We should encourage more fruits and vegetable ...,0.000000
32725,<MENTION> <MENTION> <MENTION> <MENTION> Lol - ...,1.511469e+18,en,Meat alternatives should be encouraged,0.000000
29357,Finished 3 hours of condensing Mediterranean D...,1.510103e+18,en,We should introduce meatless mondays,0.000000
...,...,...,...,...,...
1667,<MENTION> Why are their fruits on a vegetable ...,1.511770e+18,en,We should discourage fruits and vegetable cons...,0.619539
1265,<MENTION> <MENTION> How did the topic of veget...,1.511408e+18,en,Vegetarian and vegan diets should be encouraged,0.746736
552,gonna buy some meat alternatives at the vegan ...,1.510255e+18,en,Meat alternatives should be encouraged,0.724277
565,<MENTION> <MENTION> <MENTION> I mostly get mea...,1.511749e+18,en,Meat alternatives should be encouraged,0.711584


In [25]:
# Write the combined sample to disk, leaving out the DataFrame index.
sample.to_csv(path_or_buf='sample.csv', index=False)