In [None]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.corpus import stopwords
import re
import string
import os
import pandas as pd

# Dataset 1a: German → English Argumentation Reviews

1. Train a German GloVe model on the review texts in column 'review'
2. WEAT Analysis
3. Word co-occurrence analysis

In [None]:
# Load the NLTK stop-word lists for both languages. On a machine where the
# 'stopwords' corpus has not been installed, NLTK raises LookupError; in that
# case fetch the corpus once and retry so the notebook runs from a fresh setup.
try:
    german_stop_words = stopwords.words('german')
    eng_stop_words = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    german_stop_words = stopwords.words('german')
    eng_stop_words = stopwords.words('english')

In [None]:
def stop_word_removal(x, stop_words=None):
    """Drop German stop words from ``x`` and replace every '.' with a newline.

    Parameters
    ----------
    x : str
        Text to filter; it is tokenised by splitting on whitespace.
    stop_words : iterable of str, optional
        Words to remove. When omitted, the module-level ``german_stop_words``
        list is used.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, with each '.' character
        turned into a newline character.
    """
    if stop_words is None:
        stop_words = german_stop_words
    # Compare lower-cased forms so capitalised stop words (e.g. at the start
    # of a sentence) are removed too; a set keeps each lookup O(1).
    blocked = {word.lower() for word in stop_words}
    kept = [w for w in x.split() if w.lower() not in blocked]
    return ' '.join(kept).replace('.', '\n')

def eng_stop_word_removal(x, stop_words=None):
    """Drop English stop words from ``x`` and replace every '.' with a newline.

    Parameters
    ----------
    x : str
        Text to filter; it is tokenised by splitting on whitespace.
    stop_words : iterable of str, optional
        Words to remove. When omitted, the module-level ``eng_stop_words``
        list is used.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, with each '.' character
        turned into a newline character.
    """
    if stop_words is None:
        stop_words = eng_stop_words
    # Compare lower-cased forms so capitalised stop words (e.g. "The" at the
    # start of a sentence) are removed too; a set keeps each lookup O(1).
    blocked = {word.lower() for word in stop_words}
    kept = [w for w in x.split() if w.lower() not in blocked]
    return ' '.join(kept).replace('.', '\n')

def clean_text(text):
    """Normalise ``text`` for corpus building.

    Steps, in order: delete all digit runs, delete ASCII punctuation
    (``string.punctuation``), lower-case every character, collapse any run of
    whitespace (including newlines) into a single space, and strip leading and
    trailing whitespace.

    Parameters
    ----------
    text : str
        Text to normalise.

    Returns
    -------
    str
        The normalised text.
    """
    # remove numbers
    text_nonum = re.sub(r'\d+', '', text)
    # remove punctuation and convert characters to lower case; the original
    # only lower-cased here, leaving punctuation attached to the tokens
    text_nopunct = "".join(
        char.lower() for char in text_nonum if char not in string.punctuation
    )
    # substitute multiple whitespace with single whitespace; also removes
    # leading and trailing whitespace (raw string avoids the invalid-escape
    # warning that '\s' triggers in a normal string literal)
    text_no_doublespace = re.sub(r'\s+', ' ', text_nopunct).strip()
    return text_no_doublespace


In [None]:
data_path = 'background/data/'
# Keep only files whose last '_'-separated chunk compares >= '2017' as a
# string. The listing is sorted because os.listdir returns entries in
# arbitrary order, which would make the row order of every list below (and of
# the masks derived from them) differ between runs or machines.
all_files = sorted(i for i in os.listdir(data_path) if i.split('_')[-1] >= '2017')

# One flat list per column, accumulated across all selected files so that the
# same position in every list refers to the same row.
q_constructive = []
q_critical = []
q_quality = []
q_helpful = []
gender = []
age = []
reviewer_id = []
reviewee_id = []
for file in all_files:
    # The chunk after the last '_' and before the first '.' is parsed as the year.
    year = float(file.split('_')[-1].split('.')[0])
    data_file = pd.read_csv(data_path + file)
    gender.extend(data_file['D2'])
    # presumably D1 holds the year of birth, making this the age in `year` — TODO confirm
    age.extend(year - data_file['D1'])
    # NOTE(review): `reviewer_id` is filled from the 'review' column, not from
    # an ID column — confirm whether this list is meant to hold review texts.
    reviewer_id.extend(data_file['review'])
    reviewee_id.extend(data_file['revieweeIDnum'])
    q_constructive.extend(data_file['Q_constructive_suggestions'])
    q_critical.extend(data_file['Q_critical_aspects'])
    q_quality.extend(data_file['Q_high_quality'])
    q_helpful.extend(data_file['Q_was_helpful'])

In [None]:
# Boolean masks over the entries of `gender`, one per gender code.
gender_codes = np.array(gender)
gender_male = gender_codes == 'm'
gender_female = gender_codes == 'f'

# Share of 'f' among the entries coded either 'm' or 'f'; any other code is
# left out of the denominator.
n_female = sum(gender_female)
n_male = sum(gender_male)
n_female / (n_male + n_female)

In [None]:
# Turn the collected ages into an array and drop NaN entries before computing
# the standard deviation. `age` itself is overwritten with the filtered array.
age = np.array(age)
has_age = np.logical_not(np.isnan(age))
age = age[has_age]
age.std()

In [None]:
# Mean of `review_lens`.
# NOTE(review): `review_lens` is not defined in any cell visible in this
# notebook, so this fails on a fresh kernel — confirm where it is created
# (presumably per-review lengths).
np.mean(review_lens)

In [None]:
# A rating counts as "high" when it is strictly greater than this cutoff.
HIGH_RATING_CUTOFF = 5

# One boolean mask per rating column.
high_quality_mask = np.array(q_quality) > HIGH_RATING_CUTOFF
high_critical_mask = np.array(q_critical) > HIGH_RATING_CUTOFF
high_constructive_mask = np.array(q_constructive) > HIGH_RATING_CUTOFF
high_help_mask = np.array(q_helpful) > HIGH_RATING_CUTOFF

# Gender masks, rebuilt here so this cell does not depend on earlier ones.
gender_values = np.array(gender)
gender_male = gender_values == 'm'
gender_female = gender_values == 'f'

def save_to_csv(arr, topic):
    """Write ``arr`` as a single-column CSV file named ``topic`` + '.csv'.

    The values are wrapped in a pandas Series and saved without the index
    column.
    """
    out_path = topic + '.csv'
    as_series = pd.Series(arr)
    as_series.to_csv(out_path, index=False)

# Persist every mask under its own file stem (save_to_csv appends '.csv').
masks_by_topic = [
    ('b_gender_male', gender_male),
    ('b_gender_female', gender_female),
    ('b_quality', high_quality_mask),
    ('b_critical', high_critical_mask),
    ('b_constructive', high_constructive_mask),
    ('b_helpfulness', high_help_mask),
]
for topic, mask in masks_by_topic:
    save_to_csv(mask, topic)

In [None]:
# NOTE(review): `corpus_reviews` is not defined in any cell visible in this
# notebook; it has to support boolean-mask indexing — confirm where it is built.
def _rated_above(scores, cutoff=5):
    """Select the reviews whose entry in ``scores`` is strictly above ``cutoff``."""
    return corpus_reviews[np.array(scores) > cutoff]


# Sub-corpora of highly rated reviews, one per rating column.
corpus_quality = _rated_above(q_quality)
corpus_critical = _rated_above(q_critical)
corpus_constructive = _rated_above(q_constructive)
corpus_helpful = _rated_above(q_helpful)

# Sub-corpora split by the gender masks.
corpus_male = corpus_reviews[gender_male]
corpus_female = corpus_reviews[gender_female]

In [None]:
def clean_corpus(corpus_reviews, filename='corpus_reviews_glove.txt'):
    """Clean every review and write the result to ``filename``, one review per line.

    Each review first goes through ``stop_word_removal`` (German stop words)
    and then through ``clean_text``. Because ``clean_text`` collapses all
    whitespace, each cleaned review ends up on a single line; the reviews are
    then joined with newlines.

    Parameters
    ----------
    corpus_reviews : object with ``.apply`` (presumably a pandas Series of str)
        The reviews to clean.
    filename : str, optional
        Path of the text file to write; an existing file is overwritten.
    """
    without_stop_words = corpus_reviews.apply(stop_word_removal)
    normalised = without_stop_words.apply(clean_text)
    full_review = '\n'.join(normalised)
    # Explicit UTF-8: the platform default encoding is not guaranteed to be
    # able to represent German umlauts / 'ß', and the output should be
    # identical regardless of the machine it was produced on.
    with open(filename, "w", encoding="utf-8") as text_file:
        text_file.write(full_review)

# Write one cleaned corpus file per sub-corpus.
# NOTE(review): corpus_constructive / corpus_helpful are the "score > 5"
# subsets, yet they are written to files named 'low_*' — confirm which is
# intended. corpus_quality and corpus_critical are not written at all.
corpus_outputs = [
    (corpus_male, 'male_corpus.txt'),
    (corpus_female, 'female_corpus.txt'),
    (corpus_constructive, 'low_constructive_corpus.txt'),
    (corpus_helpful, 'low_helpful_corpus.txt'),
]
for subset, out_name in corpus_outputs:
    clean_corpus(subset, out_name)

# Generate GloVe models

In [None]:
# Change into the GloVE directory and run the training shell script from there.
# NOTE(review): %cd persists for the rest of the session, so relative paths in
# later cells resolve from the new directory — confirm that is intended.
%cd GloVE
!bash GloVe_training.sh

# WEAT Co-occurrence Analysis

In [None]:
# Download the spaCy model `de_core_news_sm` via spaCy's download command.
!python -m spacy download de_core_news_sm

In [None]:
# Run the German WEAT co-occurrence analysis script on "reviews_glove.txt",
# writing under output/weat_cooccurrence_analysis, with 7 processing cores and
# tests 1-10 selected.
# NOTE(review): no cell visible here writes "reviews_glove.txt" (clean_corpus
# defaults to 'corpus_reviews_glove.txt') — confirm the input file name.
!python weat_cooccurrence_analysis_german.py \
    --data "reviews_glove.txt" \
    --output "output/weat_cooccurrence_analysis" \
    --processing_cores 7 \
    --tests 1 2 3 4 5 6 7 8 9 10

In [None]:
# Accumulate co-occurrence counts from one specific results file, using the
# German WEAT lexicons.
# NOTE(review): the input file name is hard-coded and carries what looks like a
# run timestamp; presumably a rerun of the analysis writes a differently named
# file — confirm and update the path when rerunning.
!python accumulate_cooccurrence_counts.py \
    --input "output/weat_cooccurrence_analysis/weat-cooccurrence-analysis_results-20220419163143.json" \
    --weat_lexicons "sbeval/tests/weat_tests_german.json"

In [None]:
# Run the conversion shell script (judging by its name, it rewrites the trained
# GloVe vectors in word2vec format — confirm against the script).
!bash convert_glove_to_word2vec_format.sh

In [None]:
import os
# Every trained vector file found on disk, kept under its own name so the
# listing is not silently thrown away.
# NOTE(review): an earlier cell runs `%cd GloVE`; if the working directory is
# still GloVE/ this relative path will not resolve — confirm.
available_vectors = [
    name for name in os.listdir('GloVE/glove_models_trained/') if '_vectors' in name
]
# Only the full-corpus model is evaluated below.
vectors = ['full_corpus_vectors.txt']

In [None]:
import subprocess

# Run the embedding bias evaluation script once per vector file.
# NOTE(review): the model directory is spelled "GloVe" here but "GloVE" in
# other cells; on a case-sensitive filesystem these are different directories
# — confirm which spelling is correct.
for v in vectors:
    # An argument list (no shell) passes the file name verbatim, so names with
    # spaces or shell metacharacters cannot break or alter the command.
    command = [
        'python3', 'embedding_bias_evaluation.py',
        '--embedding_model', 'GloVe/glove_models_trained/' + v,
        '--output', 'output/embedding_model_evaluation',
        '--lowercase',
    ]
    completed = subprocess.run(command)
    # os.system's exit status used to be discarded; report failures but keep
    # going so the remaining models are still evaluated.
    if completed.returncode != 0:
        print('embedding_bias_evaluation.py failed for {} (exit code {})'.format(
            v, completed.returncode))