In [2]:
import pandas as pd
import numpy as np
import tweepy
import nltk
import re
from time import sleep
import io
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.corpus import stopwords # Import the stop word list
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
import itertools
from sentence_transformers import SentenceTransformer, util



In [26]:
min_tweet_length = 10


topics_en = [
    "We should stop subsidizing meat",
    "We should subsidize green nutrition",
    "We have to increase production of meat",
    "We should exempt meat production from carbon taxes",
    "We should add a carbon tax to food production",
    "Meat alternatives should be encouraged",
    "Plant based diets should be encouraged",
    "Meat alternatives should be invested in",
    "We should invest in more plant based food production",
    "Meat alternatives should be subsidized",
    "Plant based food should be subsidized",
    "We should introduce meatless mondays",
    "Vegetarian and vegan diets should be encouraged",
    "Vegetarian and vegan diets should be discouraged",
    "We should subsidize fruits and vegetables",
    "We should encourage more fruits and vegetable consumption",
    "We should discourage fruits and vegetable consumption",
] 

topics_da = [
    "Vi b√∏r stoppe med at subsidiere k√∏d",
    "Vi b√∏r subsidiere gr√∏n ern√¶ring",
    "Vi skal √∏ge produktionen af k√∏d",
    "Vi b√∏r fritage k√∏d produktion fra co2 afgifter",
    "Vi b√∏r tilf√∏je en co2 afgift til f√∏devare produktionen",
    "Alternativer til k√∏d b√∏r fremmes og st√∏ttes",
    "Plantebaseret b√∏r fremmes og st√∏ttes",
    "Der b√∏r investeres i k√∏dalternativer",
    "Der b√∏r investeres i plantebaseret",
    "Alternativer til k√∏d b√∏r subsidieres",
    "Plantebaseret b√∏r subsidieres",
    "Der b√∏r opfordres til vegetarisk og vegansk kost",
    "Vegetarisk og vegansk kost b√∏r frar√•des",
    "Vi b√∏r subsidiere frugt og gr√∏ntsager",
    "Vi b√∏r fremme forbruget af frugt og gr√∏ntsager",
    "Vi b√∏r frar√•de forbrug af frugt og gr√∏ntsager"
]

tweet_df = pd.read_csv('scraped_tweets_new.csv')
tweet_en_df = tweet_df[tweet_df.lang == 'en']                
                
annotated_samples = pd.read_csv('sample_annotated-v4.csv')



tweet_en_df = tweet_en_df.loc[~tweet_en_df.id.isin(annotated_samples.id)]


In [27]:
def pre(string, lang):
    stemmer = SnowballStemmer('english' if lang == 'en' else 'danish')
    stops = set(stopwords.words('english' if lang == 'en' else 'danish'))
    
    words = word_tokenize(re.sub('[^a-zA-Z]', ' ', string.lower().strip()))                        
    meaningful_words = [w for w in words if not w in stops]   
    return ' '.join([stemmer.stem(w) for w in meaningful_words])

def filter_tweets(df):
    return df[df.tweet.str.split().str.len() > min_tweet_length]

def score_tfidf(df, topic, lang):
    df = filter_tweets(df).copy()
    df['score'] = [0]*len(df) 

    corpus = list(map(lambda x: pre(x, lang), df[df.lang == lang].tweet.values))

    vectorizer = TfidfVectorizer()
    tfidf_mtx = vectorizer.fit_transform(corpus)
    topic_mtx = vectorizer.transform([pre(topic, lang)])

    cos_score = cosine_similarity(tfidf_mtx, topic_mtx)

    df.loc[df.lang == lang, ['score']] = cos_score
    return df

#scored = score_tfidf(tweet_df, topics_en[9], 'en')

#top = scored.sort_values(by=['score'], ascending=False)

In [28]:
def print_df_scores(df):
    top = df.sort_values(by=['score'], ascending=False)
    i = 0
    for r in top.head(i+10).iloc[i:].iloc:
        print('Topic:', r.topic, 'Tweet:', r.tweet, 'Score:', r.score)
        print('='*120)

In [29]:

def qa_model_score(df, topics, dot_score = True):    
    df = df.copy()
    df['score'] = [0]*len(df) 
    
    #Load the model
    model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

    docs = df.tweet.values
    doc_emb = model.encode(docs)

    
    score_dfs = [] 
    for topic in topics:
    
        df['score'] = [0]*len(df) 
        df['topic'] = [topic]*len(df)
    
        #Encode query and documents
        query_emb = model.encode(topic)
        
        if dot_score:
            #Compute dot score between query and all document embeddings
            scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()
        else:
            scores = cosine_similarity(query_emb, doc_emb)[0].tolist()
            
        #Combine docs & scores
        #doc_score_pairs = list(zip(docs, scores))
        df['score'] = scores
        score_dfs.append(df.copy())

        #Sort by decreasing score
        #doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
    return score_dfs
        


In [30]:
scores = qa_model_score(tweet_en_df, topics_en)

In [31]:
def top_tweets(dfs, top = 100):
    top_dfs = []
    for df in dfs:
        top_dfs.append(df.sort_values(by=['score'], ascending=False)[:top])
    return pd.concat(top_dfs)

In [32]:
top_df = top_tweets(scores)

In [33]:
print_df_scores(top_df)

Topic: Meat alternatives should be encouraged Tweet: in all seriousness meat alternatives are great and should be used within moderation üëçüèª Score: 0.9079667925834656
Topic: Plant based diets should be encouraged Tweet: Plant based diet really is the way Score: 0.901177704334259
Topic: We should introduce meatless mondays Tweet: <MENTION> expand meatless Mondays Score: 0.8971047401428223
Topic: We should introduce meatless mondays Tweet: <MENTION> <MENTION> <MENTION> Meatless Mondays! üòÇüî• Score: 0.8724989295005798
Topic: Meat alternatives should be encouraged Tweet: <MENTION> Where can I start seeing about meat alternatives? Score: 0.8672292828559875
Topic: Plant based diets should be encouraged Tweet: <MENTION> GO PLANT BASED DIET‚Ä¶..HEALTHIER Score: 0.8573483824729919
Topic: Plant based diets should be encouraged Tweet: Definitely interested in a plant based diet Score: 0.8481646776199341
Topic: Vegetarian and vegan diets should be encouraged Tweet: <MENTION> Vegetarian wi

In [47]:
sample = tweet_en_df.sample(25)

In [48]:
sample['topic'] = np.random.choice(topics_en, len(sample))
sample['score'] = [0]*len(sample)

In [49]:
sample = pd.concat([sample, top_df.sample(100)])
sample = sample.loc[:, ~sample.columns.str.contains('^Unnamed')]
sample

Unnamed: 0,tweet,id,lang,topic,score
11424,<MENTION> <MENTION> <MENTION> The major food g...,1.511022e+18,en,We should subsidize fruits and vegetables,0.000000
22262,"Ok Paula's ""recipe"" for cooking fried chicken ...",1.511974e+18,en,We should subsidize green nutrition,0.000000
17520,<MENTION> When my parents force-fed me healthy...,1.511999e+18,en,Vegetarian and vegan diets should be encouraged,0.000000
4292,<MENTION> why? havent you hear about meat rais...,1.511450e+18,en,Plant based diets should be encouraged,0.000000
10180,<MENTION> <MENTION> Anybody have any idea what...,1.510963e+18,en,Plant based food should be subsidized,0.000000
...,...,...,...,...,...
16699,I don‚Äôt like fruits and vegetables in general,1.511504e+18,en,We should subsidize fruits and vegetables,0.553105
1978,<MENTION> <MENTION> Yep. I like meat but want ...,1.511401e+18,en,We should introduce meatless mondays,0.486321
13118,dude there are so many good and legitimate rea...,1.510837e+18,en,Meat alternatives should be encouraged,0.710040
30781,I have no problem with one choosing a plant ba...,1.509767e+18,en,Plant based food should be subsidized,0.571025


In [50]:
def add_labels(df):
    sLength = len(df)
    df = df.assign(argumentative=pd.Series(['0']*sLength).values)
    df = df.assign(evidence=pd.Series(np.zeros(sLength)).values)
    df = df.assign(claim=pd.Series(np.zeros(sLength)).values)
    df = df.assign(procon=pd.Series(np.zeros(sLength)).values)
    df = df.assign(arg_type=pd.Series(['']*sLength).values)
    df = df.assign(evidence_type=pd.Series(['']*sLength).values)
    df = df.reset_index()
    return df

sample = add_labels(sample)
sample

Unnamed: 0,index,tweet,id,lang,topic,score,argumentative,evidence,claim,procon,arg_type,evidence_type
0,11424,<MENTION> <MENTION> <MENTION> The major food g...,1.511022e+18,en,We should subsidize fruits and vegetables,0.000000,0,0.0,0.0,0.0,,
1,22262,"Ok Paula's ""recipe"" for cooking fried chicken ...",1.511974e+18,en,We should subsidize green nutrition,0.000000,0,0.0,0.0,0.0,,
2,17520,<MENTION> When my parents force-fed me healthy...,1.511999e+18,en,Vegetarian and vegan diets should be encouraged,0.000000,0,0.0,0.0,0.0,,
3,4292,<MENTION> why? havent you hear about meat rais...,1.511450e+18,en,Plant based diets should be encouraged,0.000000,0,0.0,0.0,0.0,,
4,10180,<MENTION> <MENTION> Anybody have any idea what...,1.510963e+18,en,Plant based food should be subsidized,0.000000,0,0.0,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
120,16699,I don‚Äôt like fruits and vegetables in general,1.511504e+18,en,We should subsidize fruits and vegetables,0.553105,0,0.0,0.0,0.0,,
121,1978,<MENTION> <MENTION> Yep. I like meat but want ...,1.511401e+18,en,We should introduce meatless mondays,0.486321,0,0.0,0.0,0.0,,
122,13118,dude there are so many good and legitimate rea...,1.510837e+18,en,Meat alternatives should be encouraged,0.710040,0,0.0,0.0,0.0,,
123,30781,I have no problem with one choosing a plant ba...,1.509767e+18,en,Plant based food should be subsidized,0.571025,0,0.0,0.0,0.0,,


In [60]:
sample_full = pd.concat([sample, annotated_samples])

In [61]:
sample_full[sample_full.duplicated(subset='tweet')]

Unnamed: 0,index,tweet,id,lang,topic,score,argumentative,evidence,claim,procon,arg_type,evidence_type
97,3618,<MENTION> <MENTION> <MENTION> But no one of us...,1.511637e+18,en,We should stop subsidizing meat,0.671986,0,0.0,0.0,0.0,,
38,38,<MENTION> <MENTION> <MENTION> <MENTION> <MENTI...,1.511652e+18,en,Meat alternatives should be invested in,0.746783,1,1.0,1.0,1.0,implicit,fact
48,48,<MENTION> why? havent you hear about meat rais...,1.51145e+18,en,We have to increase production of meat,0.708149,1,1.0,1.0,-1.0,implicit,fact
51,51,<MENTION> Great reason to switch to vegetarian.,1.511911e+18,en,Vegetarian and vegan diets should be discouraged,0.744728,lacks context,0.0,0.0,0.0,,
57,57,<MENTION> <MENTION> <MENTION> Switch to plant ...,1.51166e+18,en,We should encourage more fruits and vegetable ...,0.633157,1,1.0,1.0,1.0,explicit,fact
70,70,<MENTION> Why are their fruits on a vegetable ...,1.51177e+18,en,We should discourage fruits and vegetable cons...,0.619539,0,0.0,0.0,0.0,,


In [62]:
# Sample bit shuffles the set
sample_full.sample(frac=1).to_csv('full_sample.csv',index=False)