In [1]:
import pandas as pd
import numpy as np
import tweepy
import nltk
import re
from time import sleep
import io
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.corpus import stopwords # Import the stop word list
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
import itertools
from sentence_transformers import SentenceTransformer, util



In [2]:
# Word-count threshold read by filter_tweets: only tweets with strictly more than
# this many whitespace-separated tokens are kept.
min_tweet_length = 10


# English topic statements (17 entries) about meat, plant-based food and diets.
topics_en = [
    "We should stop subsidizing meat",
    "We should subsidize green nutrition",
    "We have to increase production of meat",
    "We should exempt meat production from carbon taxes",
    "We should add a carbon tax to food production",
    "Meat alternatives should be encouraged",
    "Plant based diets should be encouraged",
    "Meat alternatives should be invested in",
    "We should invest in more plant based food production",
    "Meat alternatives should be subsidized",
    "Plant based food should be subsidized",
    "We should introduce meatless mondays",
    "Vegetarian and vegan diets should be encouraged",
    "Vegetarian and vegan diets should be discouraged",
    "We should subsidize fruits and vegetables",
    "We should encourage more fruits and vegetable consumption",
    "We should discourage fruits and vegetable consumption",
] 

# Condensed set of four English topic statements; this is the list that is
# passed to qa_model_score further down in the notebook.
topics_en_v2 = [
    "We should reduce the consumption of meat",
    "Plant based food should be encouraged",
    "Meat alternatives should be encouraged",
    "Vegan and vegetarian diets should be encouraged",
]

# Danish topic statements. NOTE: this list has 16 entries while topics_en has 17
# (there is no Danish entry matching "We should introduce meatless mondays"), so
# the two lists are not index-aligned from position 11 onwards.
topics_da = [
    "Vi bør stoppe med at subsidiere kød",
    "Vi bør subsidiere grøn ernæring",
    "Vi skal øge produktionen af kød",
    "Vi bør fritage kød produktion fra co2 afgifter",
    "Vi bør tilføje en co2 afgift til fødevare produktionen",
    "Alternativer til kød bør fremmes og støttes",
    "Plantebaseret bør fremmes og støttes",
    "Der bør investeres i kødalternativer",
    "Der bør investeres i plantebaseret",
    "Alternativer til kød bør subsidieres",
    "Plantebaseret bør subsidieres",
    "Der bør opfordres til vegetarisk og vegansk kost",
    "Vegetarisk og vegansk kost bør frarådes",
    "Vi bør subsidiere frugt og grøntsager",
    "Vi bør fremme forbruget af frugt og grøntsager",
    "Vi bør fraråde forbrug af frugt og grøntsager"
]

# Load all scraped tweets and the samples that were annotated previously.
tweet_df = pd.read_csv('scraped_tweets_new.csv')
annotated_samples = pd.read_csv('sample_annotated-v4.csv')

# Candidate pool: English tweets whose id is not among the annotated samples.
# NOTE(review): `id` renders as a float (e.g. 1.511762e+18) in the outputs further
# down; if the ids are parsed as float64, distinct tweet ids could collide in this
# `isin` check -- confirm the dtype of the id column in both CSV files.
tweet_en_df = (
    tweet_df
    .loc[lambda frame: frame['lang'] == 'en']
    .loc[lambda frame: ~frame['id'].isin(annotated_samples['id'])]
)


In [3]:
# Per-language preprocessing resources, built on first use and then reused.
_PRE_RESOURCES = {}


def _pre_resources(lang):
    """Return ``(stemmer, stop-word set, non-letter regex)`` for ``lang``, cached per language."""
    if lang not in _PRE_RESOURCES:
        if lang == 'en':
            nltk_name, non_letters = 'english', '[^a-zA-Z]'
        else:
            # Danish words contain æ/ø/å. Treating them as non-letters would split
            # e.g. "kød" into "k d" before stop-word removal and stemming.
            nltk_name, non_letters = 'danish', '[^a-zA-ZæøåÆØÅ]'
        _PRE_RESOURCES[lang] = (
            SnowballStemmer(nltk_name),
            set(stopwords.words(nltk_name)),
            re.compile(non_letters),
        )
    return _PRE_RESOURCES[lang]


def pre(string, lang):
    """Normalise a text for TF-IDF matching.

    The text is lower-cased and stripped, every non-letter character is replaced
    by a space, the result is tokenised, stop words are removed and the remaining
    tokens are stemmed.

    Parameters
    ----------
    string : str
        Raw text (a tweet or a topic statement).
    lang : str
        ``'en'`` selects the English stemmer and stop words; any other value
        selects the Danish ones (and keeps the letters æ, ø and å).

    Returns
    -------
    str
        The stemmed, stop-word-free tokens joined by single spaces.
    """
    stemmer, stops, non_letters = _pre_resources(lang)

    words = word_tokenize(non_letters.sub(' ', string.lower().strip()))
    return ' '.join(stemmer.stem(w) for w in words if w not in stops)

def filter_tweets(df, min_length=None):
    """Keep only the rows whose tweet has strictly more than ``min_length`` words.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column ``tweet``.
    min_length : int, optional
        Word-count threshold. When omitted, the notebook-level
        ``min_tweet_length`` is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose whitespace-separated token count exceeds the
        threshold. Rows with a missing tweet are dropped, because their count
        is NaN and the comparison is False.
    """
    if min_length is None:
        min_length = min_tweet_length
    word_counts = df.tweet.str.split().str.len()
    return df[word_counts > min_length]

def score_tfidf(df, topic, lang):
    """Score tweets of one language against a topic using TF-IDF cosine similarity.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``tweet`` and ``lang``.
    topic : str
        Topic statement to compare every tweet with.
    lang : str
        Language code; only rows with ``df.lang == lang`` are scored.

    Returns
    -------
    pandas.DataFrame
        A copy of the length-filtered frame (see ``filter_tweets``) with a float
        ``score`` column. Rows of other languages keep a score of 0.0.
    """
    df = filter_tweets(df).copy()
    # Start with a float column: the similarities written below are floats, and
    # assigning them into an integer column is an incompatible-dtype assignment.
    df['score'] = 0.0

    lang_mask = df.lang == lang
    if not lang_mask.any():
        # Nothing to fit the vectoriser on; it would raise on an empty corpus.
        return df

    corpus = [pre(text, lang) for text in df.loc[lang_mask, 'tweet']]

    vectorizer = TfidfVectorizer()
    tfidf_mtx = vectorizer.fit_transform(corpus)
    topic_mtx = vectorizer.transform([pre(topic, lang)])

    # cosine_similarity returns an (n_tweets, 1) matrix; flatten it to one score per row.
    cos_score = cosine_similarity(tfidf_mtx, topic_mtx)

    df.loc[lang_mask, 'score'] = cos_score.ravel()
    return df

#scored = score_tfidf(tweet_df, topics_en[9], 'en')

#top = scored.sort_values(by=['score'], ascending=False)

In [4]:
def print_df_scores(df, start=0, n=10):
    """Print a window of the highest-scoring rows, one tweet per entry.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``topic``, ``tweet`` and ``score``.
    start : int, optional
        Rank (0-based, by descending score) of the first row to print.
    n : int, optional
        Number of rows to print. The defaults print the ten best rows.
    """
    top = df.sort_values(by=['score'], ascending=False)
    # itertuples is the supported way to walk rows. Iterating the `.iloc`
    # indexer itself only stops because indexing past the end raises IndexError.
    for r in top.iloc[start:start + n].itertuples(index=False):
        print('Topic:', r.topic, 'Tweet:', r.tweet, 'Score:', r.score)
        print('='*120)

In [5]:

def qa_model_score(df, topics, dot_score = True, model = None):
    """Score every tweet against each topic with a sentence-embedding model.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweet`` column holding the texts to score.
    topics : iterable of str
        Topic statements used as queries.
    dot_score : bool, optional
        If True (default) the score is the dot product of the query and tweet
        embeddings; otherwise it is their cosine similarity.
    model : optional
        An already loaded model exposing ``encode``. When omitted, the
        ``multi-qa-MiniLM-L6-cos-v1`` SentenceTransformer is loaded, as before.

    Returns
    -------
    list of pandas.DataFrame
        One copy of ``df`` per topic, in the order of ``topics``, each with a
        ``score`` column (similarity to that topic) and a ``topic`` column.
    """
    if model is None:
        #Load the model
        model = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-cos-v1')

    # The tweet embeddings do not depend on the topic, so encode them once.
    doc_emb = model.encode(df.tweet.values)

    score_dfs = []
    for topic in topics:
        query_emb = model.encode(topic)

        if dot_score:
            #Compute dot score between query and all document embeddings
            scores = util.dot_score(query_emb, doc_emb)[0].cpu().tolist()
        else:
            # Encoding a single string yields a 1-D vector, but sklearn's
            # cosine_similarity only accepts 2-D input, so add the row axis.
            scores = cosine_similarity(query_emb.reshape(1, -1), doc_emb)[0].tolist()

        # assign returns a new frame, so each topic gets its own independent copy.
        score_dfs.append(df.assign(score=scores, topic=topic))
    return score_dfs
        


In [6]:
# Score the not-yet-annotated English tweets against each topic in topics_en_v2
# (qa_model_score returns one DataFrame per topic).
scores = qa_model_score(tweet_en_df, topics_en_v2)

In [7]:
def top_tweets(dfs, top = 100):
    """Stack the ``top`` highest-scoring rows of every frame in ``dfs``.

    Each frame is ordered by its ``score`` column (descending) and cut to its
    first ``top`` rows; the pieces are concatenated in the order of ``dfs``.
    """
    best_per_frame = [
        frame.sort_values(by='score', ascending=False)[:top]
        for frame in dfs
    ]
    return pd.concat(best_per_frame)

In [8]:
# Keep the 1000 highest-scoring tweets of every topic and stack them into one frame.
top_df = top_tweets(scores, 1000)

In [9]:
# Print the best-scoring (topic, tweet) pairs across all topics for a quick sanity check.
print_df_scores(top_df)

Topic: Meat alternatives should be encouraged Tweet: in all seriousness meat alternatives are great and should be used within moderation 👍🏻 Score: 0.9079667925834656
Topic: Meat alternatives should be encouraged Tweet: <MENTION> Where can I start seeing about meat alternatives? Score: 0.8672292828559875
Topic: Vegan and vegetarian diets should be encouraged Tweet: <MENTION> not vegan VEGETARIAN Score: 0.8405965566635132
Topic: Vegan and vegetarian diets should be encouraged Tweet: <MENTION> Vegetarian with vegan options. Score: 0.8403663635253906
Topic: Vegan and vegetarian diets should be encouraged Tweet: <MENTION> Cos your vegan? Vegetarian? Score: 0.8289363384246826
Topic: Vegan and vegetarian diets should be encouraged Tweet: <MENTION> No vegetarian or vegan option? Score: 0.8258136510848999
Topic: Meat alternatives should be encouraged Tweet: <MENTION> don’t get me wrong, it can be good but the meat alternatives are just not it. Score: 0.8256579637527466
Topic: Vegan and vegetari

In [16]:
# NOTE(review): the output displayed in the next cell has no `topic` column even
# though the drop below is commented out -- presumably it was executed in an earlier
# run. Confirm whether it should be re-enabled so that Restart & Run All reproduces
# the shown output.
#top_df = top_df.drop(columns='topic')
# Keep only the first occurrence of each tweet text (the same tweet can appear in
# the top list of several topics).
top_df = top_df.drop_duplicates(subset=['tweet'])

In [17]:
# Display the top-scoring tweets.
top_df

Unnamed: 0,tweet,id,lang,score
1492,<MENTION> We should all stop eating meat. Pigs...,1.511762e+18,en,0.791134
3706,<MENTION> Don't worry mr.mayor. If the consump...,1.511052e+18,en,0.778970
28880,<MENTION> We should eat meat from sustainable ...,1.511379e+18,en,0.755551
1595,<MENTION> <MENTION> We're not going to persuad...,1.511689e+18,en,0.754210
3776,<MENTION> <MENTION> We waste more than 25% of ...,1.510384e+18,en,0.738780
...,...,...,...,...
25729,<MENTION> <MENTION> very happy (generally) non...,1.511872e+18,en,0.643710
26016,<MENTION> <MENTION> You can be a vegetarian mo...,1.511797e+18,en,0.643620
20841,"<MENTION> <MENTION> I'm genuinely not sure, I'...",1.511590e+18,en,0.643615
15471,<MENTION> <MENTION> Vegetarian/vegan does not ...,1.511272e+18,en,0.643347


In [25]:
# Build the annotation sample: as many randomly drawn tweets (score 0) as there are
# top-scoring tweets, stacked with the top-scoring tweets, de-duplicated on tweet
# text and finally down-sampled (which also shuffles the rows).
SAMPLE_SIZE = 1000   # number of tweets to annotate
SAMPLE_SEED = 42     # fixed seed so that Restart & Run All draws the same sample

# Never ask for more rows than exist; DataFrame.sample raises in that case.
n_random = min(len(top_df), len(tweet_en_df))
sample = tweet_en_df.sample(n_random, random_state=SAMPLE_SEED).assign(score=0)
#sample['topic'] = np.random.choice(topics_en, len(sample))
sample = pd.concat([sample, top_df])
# Drop the index columns that came along from earlier CSV round-trips.
sample = sample.loc[:, ~sample.columns.str.contains('^Unnamed')]
# The first occurrence wins, so a tweet drawn at random keeps its score of 0 even
# if it is also among the top-scoring tweets.
sample = sample.drop_duplicates(subset=['tweet'])
sample = sample.sample(min(SAMPLE_SIZE, len(sample)), random_state=SAMPLE_SEED)
sample

Unnamed: 0,tweet,id,lang,score
2145,"Hindus must do one thing,just stop eating hala...",1.511305e+18,en,0.617665
1743,<MENTION> You can put full stop for eating mea...,1.511574e+18,en,0.656638
16679,<MENTION> Plenty of fruits and vegetables to c...,1.511517e+18,en,0.637276
6847,Dear <MENTION> we are getting 2 tax invoices f...,1.511586e+18,en,0.000000
22616,<MENTION> <MENTION> Haha they taste too good t...,1.511658e+18,en,0.000000
...,...,...,...,...
14162,<MENTION> Meat eating is highly immoral ecolog...,1.511614e+18,en,0.613317
32240,<MENTION> Looks like Whole Foods.,1.511758e+18,en,0.000000
12273,<MENTION> <MENTION> <MENTION> <MENTION> I stil...,1.511729e+18,en,0.515951
30704,<MENTION> That's because a plant-based diet pu...,1.509992e+18,en,0.561473


In [26]:
def add_labels(df):
    """Return a copy of ``df`` extended with blank annotation columns.

    Added columns: ``argumentative`` (the string '0'), ``evidence``, ``claim``
    and ``procon`` (0.0), ``arg_type`` and ``evidence_type`` (empty string).
    The index is then reset, which moves the old index into an ``index`` column.
    """
    n_rows = len(df)
    blank_labels = {
        'argumentative': pd.Series(['0'] * n_rows).values,
        'evidence': np.zeros(n_rows),
        'claim': np.zeros(n_rows),
        'procon': np.zeros(n_rows),
        'arg_type': pd.Series([''] * n_rows).values,
        'evidence_type': pd.Series([''] * n_rows).values,
    }
    return df.assign(**blank_labels).reset_index()

# Attach the blank annotation columns and display the result.
# NOTE: this re-binds `sample`, so re-running the cell feeds add_labels its own output.
sample = add_labels(sample)
sample

Unnamed: 0,index,tweet,id,lang,score,argumentative,evidence,claim,procon,arg_type,evidence_type
0,2145,"Hindus must do one thing,just stop eating hala...",1.511305e+18,en,0.617665,0,0.0,0.0,0.0,,
1,1743,<MENTION> You can put full stop for eating mea...,1.511574e+18,en,0.656638,0,0.0,0.0,0.0,,
2,16679,<MENTION> Plenty of fruits and vegetables to c...,1.511517e+18,en,0.637276,0,0.0,0.0,0.0,,
3,6847,Dear <MENTION> we are getting 2 tax invoices f...,1.511586e+18,en,0.000000,0,0.0,0.0,0.0,,
4,22616,<MENTION> <MENTION> Haha they taste too good t...,1.511658e+18,en,0.000000,0,0.0,0.0,0.0,,
...,...,...,...,...,...,...,...,...,...,...,...
995,14162,<MENTION> Meat eating is highly immoral ecolog...,1.511614e+18,en,0.613317,0,0.0,0.0,0.0,,
996,32240,<MENTION> Looks like Whole Foods.,1.511758e+18,en,0.000000,0,0.0,0.0,0.0,,
997,12273,<MENTION> <MENTION> <MENTION> <MENTION> I stil...,1.511729e+18,en,0.515951,0,0.0,0.0,0.0,,
998,30704,<MENTION> That's because a plant-based diet pu...,1.509992e+18,en,0.561473,0,0.0,0.0,0.0,,


In [27]:
# Write the annotation sheet to disk.
# NOTE(review): the DataFrame index is written as an extra unnamed first column here,
# unlike the `index=False` export of full_sample.csv -- confirm that this is intended.
sample.to_csv('sample_mturk.csv')

In [60]:
# Stack the new sample on top of the previously annotated samples. Row labels of
# both inputs are kept, so labels can repeat in the combined frame.
sample_full = pd.concat([sample, annotated_samples])

In [61]:
# Show the rows whose tweet text already occurred earlier in the combined frame.
sample_full[sample_full.duplicated(subset='tweet')]

Unnamed: 0,index,tweet,id,lang,topic,score,argumentative,evidence,claim,procon,arg_type,evidence_type
97,3618,<MENTION> <MENTION> <MENTION> But no one of us...,1.511637e+18,en,We should stop subsidizing meat,0.671986,0,0.0,0.0,0.0,,
38,38,<MENTION> <MENTION> <MENTION> <MENTION> <MENTI...,1.511652e+18,en,Meat alternatives should be invested in,0.746783,1,1.0,1.0,1.0,implicit,fact
48,48,<MENTION> why? havent you hear about meat rais...,1.51145e+18,en,We have to increase production of meat,0.708149,1,1.0,1.0,-1.0,implicit,fact
51,51,<MENTION> Great reason to switch to vegetarian.,1.511911e+18,en,Vegetarian and vegan diets should be discouraged,0.744728,lacks context,0.0,0.0,0.0,,
57,57,<MENTION> <MENTION> <MENTION> Switch to plant ...,1.51166e+18,en,We should encourage more fruits and vegetable ...,0.633157,1,1.0,1.0,1.0,explicit,fact
70,70,<MENTION> Why are their fruits on a vegetable ...,1.51177e+18,en,We should discourage fruits and vegetable cons...,0.619539,0,0.0,0.0,0.0,,


In [62]:
# sample(frac=1) returns every row in random order, i.e. it shuffles the set.
# The fixed seed keeps the row order of the exported file reproducible across runs.
sample_full.sample(frac=1, random_state=42).to_csv('full_sample.csv',index=False)