In [1]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

from operator import itemgetter

from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

import torch

In [13]:
# load preprocessed data
# `year` selects which per-year quotes file is read; later cells also pass it
# on to name the CSV files they write.
year = 2019
# NOTE(review): absolute, machine-specific Windows directory. The trailing
# separator matters because the file name is appended by plain concatenation.
path = "E:\\PycharmProject\\ada_project\\data\\"
data = pd.read_csv(path + 'processed-quotes-{}.csv'.format(year))
# NOTE(review): the preview shows an 'Unnamed: 0' column, presumably a row
# index written out when the CSV was saved — confirm whether it is needed.
data.head()

Unnamed: 0,quoteID,quotation,speaker,qids,date,numOccurrences,domains,nationality,gender,party,id
0,2019-04-08-048753,It is immoral. It is harmful. It is hurtful.,Donald Trump,Q22686,2019-04-08 16:22:00,44,"['mercedsunstar', 'sacbee', 'mynorthwest', 'la...",United States of America,male,Republican Party,Q22686
1,2019-05-15-053302,It is important for our equine science student...,Sally Johnson,Q42336656,2019-05-15 18:03:22,1,['lanereport'],United States of America,female,,Q42336656
2,2019-02-27-055406,It is important to many Native American tribes...,Rafael Ortega,Q16672061,2019-02-27 00:00:00,1,['kstp'],Mexico,male,,Q16672061
3,2019-12-08-023053,"It is impossible, biologically, truly to `rest...",Barry Lopez,Q809063,2019-12-08 06:00:00,1,['timescolonist'],United States of America,male,,Q809063
4,2019-02-21-000088,Chilton put it on a little tape recorder and...,Sam the Sham,Q1971786,2019-02-21 11:05:34,1,['nashvillescene'],United States of America,male,,Q1971786


In [3]:
# Sentiment classifier setup: tokenizer + sequence-classification model.
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)
# save_pretrained(MODEL) below creates a local directory named like the hub id.
# On the next run from the same working directory, from_pretrained(MODEL)
# resolves to that directory, so the tokenizer files have to be stored there
# as well or the tokenizer load fails.
tokenizer.save_pretrained(MODEL)

# Class names; index i is used as the name of the model's output class i
# (see the argmax lookup in gender_bias_time_series).
labels = ['negative', 'neutral', 'positive']

model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)
# Use the GPU when one is available, otherwise stay on the CPU instead of
# raising on machines without CUDA.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

# Return all document ids whose similarity with the query is at least a threshold
def search_vec_sklearn(query, features, threshold=0.1, vectorizer=None):
    """Retrieve the documents that match ``query``, best match first.

    The query is vectorised and compared against every row of ``features``
    with a linear kernel (dot product). For L2-normalised TF-IDF rows — the
    ``TfidfVectorizer`` default — this is the cosine similarity.

    Parameters
    ----------
    query : str
        Free-text query (a keyword or short phrase).
    features : sparse matrix or array
        Document-term matrix produced by the same vectorizer.
    threshold : float, default 0.1
        Minimum similarity (inclusive) for a document to be returned.
    vectorizer : fitted vectorizer, optional
        Object with a ``transform`` method. Defaults to the notebook-level
        ``tf``, which is looked up when the function is called.

    Returns
    -------
    list of int
        Row positions in ``features`` ordered by decreasing similarity;
        documents with equal similarity keep their original relative order.
    """
    if vectorizer is None:
        # Resolved at call time, so `tf` only has to exist before the first call.
        vectorizer = tf
    query_vec = vectorizer.transform([query])
    cosine_similarities = linear_kernel(query_vec, features).flatten()

    # Filter first, then sort only the survivors: avoids a Python-level sort
    # over every document in the corpus.
    hits = np.flatnonzero(cosine_similarities >= threshold)
    # Stable sort on the negated scores == descending order with ties kept
    # in ascending document-position order.
    ranked = hits[np.argsort(-cosine_similarities[hits], kind='stable')]
    return ranked.tolist()

In [4]:
def _monthly_occurrences(subgroup_data, gender):
    """Sum ``numOccurrences`` per calendar month for the rows of one gender.

    Returns a frame with the columns ``date`` (monthly period),
    ``numOccurrences`` and ``gender``.
    """
    gender_data = subgroup_data[subgroup_data.gender == gender]
    monthly = gender_data.groupby(gender_data['date'].dt.to_period("M")).agg({'numOccurrences': 'sum'})
    monthly = monthly.reset_index()
    monthly['gender'] = gender
    return monthly


def gender_bias_time_series(data, keywords, features, labels, year, topic):
    """Build monthly, per-gender, per-sentiment occurrence counts for a topic.

    For every keyword the matching quotes are retrieved with
    ``search_vec_sklearn``, each quote is classified with the notebook-level
    ``tokenizer`` / ``model``, and ``numOccurrences`` is summed per month for
    male and female speakers within each sentiment class. Rows with any other
    ``gender`` value are not counted.

    Parameters
    ----------
    data : pandas.DataFrame
        Quotes table; the columns ``quotation``, ``date``, ``gender`` and
        ``numOccurrences`` are read. Not modified.
    keywords : iterable of str
        Queries that make up the topic. Must not be empty (``pd.concat``
        raises ``ValueError`` on an empty list).
    features : sparse matrix or array
        Document-term matrix aligned row-by-row with ``data``.
    labels : sequence of str
        Sentiment names; ``labels[i]`` names the model's output class ``i``.
    year : int
        Only used in the output file name.
    topic : str
        Topic name, used as the output file name prefix.

    Returns
    -------
    pandas.DataFrame
        Columns ``date``, ``numOccurrences``, ``gender``, ``sentiment`` and
        ``topic`` (``topic`` holds the individual keyword). The same frame is
        written to ``<topic>-sentiment-<year>.csv`` in the working directory.
    """
    # Send the inputs to wherever the model actually lives rather than
    # assuming a GPU.
    device = model.device
    total_occu = []

    for keyword in keywords:
        print(keyword)
        ret_ids = search_vec_sklearn(keyword, features)
        # Explicit copy so the column assignments below work on an independent
        # frame and never on (a view of) the caller's `data`.
        topic_data = data.iloc[ret_ids].copy()
        # Parse first, then sort chronologically on real timestamps.
        topic_data['date'] = pd.to_datetime(topic_data['date'])
        topic_data = topic_data.sort_values(by='date')

        quote_labels = []
        for quote in tqdm(topic_data['quotation']):
            encoded_input = tokenizer(quote, max_length=512,
                                      return_tensors='pt', truncation=True, add_special_tokens=True).to(device)
            with torch.no_grad():
                output = model(**encoded_input)
            # output[0] holds the logits; [0] picks the single quote in the batch.
            scores = output[0][0].cpu().numpy()
            quote_labels.append(labels[np.argmax(scores)])
        topic_data['sentiment'] = quote_labels

        topic_occu = []
        for label in labels:
            subgroup_data = topic_data[topic_data['sentiment'] == label]
            # Male rows first, then female, one monthly series each.
            subgroup_occu = pd.concat(
                [_monthly_occurrences(subgroup_data, gender) for gender in ('male', 'female')]
            )
            subgroup_occu['sentiment'] = label
            topic_occu.append(subgroup_occu)

        topic_occu = pd.concat(topic_occu)
        topic_occu['topic'] = keyword
        total_occu.append(topic_occu)

    total_occu = pd.concat(total_occu)
    total_occu.to_csv(topic+"-sentiment-{}.csv".format(year))
    return total_occu

In [14]:
# all quotation sentences.
original_documents = data['quotation'].values
# Retrieval model
tf = TfidfVectorizer(analyzer='word', ngram_range=(1,1), min_df=1, stop_words='english')
features = tf.fit_transform(original_documents)

In [None]:
# Sports topic; the result is also written to "sports-sentiment-<year>.csv".
topic = 'sports'
keywords = ['football', 'basketball', 'tennis', 'swimming', 'fitness']
gender_bias_time_series(
    data,
    keywords=keywords,
    features=features,
    labels=labels,
    year=year,
    topic=topic,
)

In [None]:
# People topic; the result is also written to "people-sentiment-<year>.csv".
topic = 'people'
keywords = ['donald trump', 'joe biden', 'nancy pelosi', 'elizabeth warren']
gender_bias_time_series(
    data,
    keywords=keywords,
    features=features,
    labels=labels,
    year=year,
    topic=topic,
)

In [None]:
# Entertainment topic; the result is also written to "entertainment-sentiment-<year>.csv".
topic = 'entertainment'
keywords = ['movie', 'music', 'art', 'book', 'tv']
gender_bias_time_series(
    data,
    keywords=keywords,
    features=features,
    labels=labels,
    year=year,
    topic=topic,
)

In [None]:
# Gender topic; the result is also written to "gender-sentiment-<year>.csv".
topic = 'gender'
keywords = ['child brides', 'sexual harassment', 'feminism', 'gender inequality', 'abortion']
gender_bias_time_series(
    data,
    keywords=keywords,
    features=features,
    labels=labels,
    year=year,
    topic=topic,
)

In [None]:
# Politics topic; the result is also written to "politics-sentiment-<year>.csv".
topic = 'politics'
keywords = ['vote', 'election', 'government', 'party politics', 'law', 'legal']
gender_bias_time_series(
    data,
    keywords=keywords,
    features=features,
    labels=labels,
    year=year,
    topic=topic,
)

In [None]:
# Education topic; the result is also written to "education-sentiment-<year>.csv".
topic = 'education'
keywords = ['school', 'education', 'student', 'teacher', 'tuition']
gender_bias_time_series(
    data,
    keywords=keywords,
    features=features,
    labels=labels,
    year=year,
    topic=topic,
)

In [None]:
# Health topic; the result is also written to "health-sentiment-<year>.csv".
topic = 'health'
keywords = ['health insurance', 'disease', 'medicine', 'hospital', 'doctor', 'nurse']
gender_bias_time_series(
    data,
    keywords=keywords,
    features=features,
    labels=labels,
    year=year,
    topic=topic,
)

In [None]:
# Business topic; the result is also written to "business-sentiment-<year>.csv".
topic = 'business'
keywords = ['market', 'banking', 'investment', 'stock', 'management', 'company', 'business']
gender_bias_time_series(
    data,
    keywords=keywords,
    features=features,
    labels=labels,
    year=year,
    topic=topic,
)

In [None]:
# Lifestyle topic; the result is also written to "lifestyle-sentiment-<year>.csv".
topic = 'lifestyle'
keywords = ['pets', 'fashion', 'clothes', 'food', 'travel', 'shopping']
gender_bias_time_series(
    data,
    keywords=keywords,
    features=features,
    labels=labels,
    year=year,
    topic=topic,
)