In [2]:
import pandas as pd
dataset_file = 'comments_2022.csv'
df = pd.read_csv(dataset_file, header=0)
df.head()

Unnamed: 0,id,parent_id,author,subreddit,body
0,hqqvfvb,t1_hqqt3r3,BasicComplexities,VaushV,"\n&gt;. You use French people flippantly, but ..."
1,hqqxbu8,t1_hqqhgr2,RagingAardvark,daddit,Thank you! I was actually recently mulling ove...
2,hqqz28x,t3_rsooda,malarky-b,asianamerican,Maybe I'm just tired but I don't get the comme...
3,hqr027z,t1_hqqz64g,AdventurousAnxiety78,Afghan,"No, the coins are one of many diverse versions..."
4,hqr0j0w,t3_rt7dak,stjeana,TooAfraidToAsk,Cultural appropriation is vs and doesnt have a...


# Dataset clean step 1

In [109]:
# Sort subreddits by number of comments
sub_coms = df.groupby('subreddit')['id'].nunique().to_frame()
sub_coms.columns = ['count']
sub_coms.sort_values(by='count', ascending=False)

Unnamed: 0_level_0,count
subreddit,Unnamed: 1_level_1
witchcraft,4747
AmItheAsshole,2564
TooAfraidToAsk,914
AskReddit,840
unpopularopinion,698
...,...
StonerPhilosophy,1
Stoicism,1
Steam,1
StarWarsCantina,1


In [110]:
# Sort subreddits by number of authors
sub_auth = df.groupby('subreddit')['author'].nunique().to_frame()
sub_auth.columns = ['count']
sub_auth.sort_values(by='count', ascending=False)

Unnamed: 0_level_0,count
subreddit,Unnamed: 1_level_1
AmItheAsshole,2078
TooAfraidToAsk,738
AskReddit,715
unpopularopinion,536
facepalm,382
...,...
Squidbillies,1
Squamish,1
Sprechstunde,1
SpiritAnimals,1


In [111]:
# Too much subreddits
print(f'Number of subreddits original: {len(sub_coms)}')
# Remove all subreddits that has less than 2 comments
df = df.loc[~df['subreddit'].isin(list(sub_coms[sub_coms['count']<2].index))]
print(f'All subreddits with more than 1 comment: {len(df.subreddit.unique())}')
# Remove all subreddits that has less than 2 authors
df = df.loc[~df['subreddit'].isin(list(sub_auth[sub_auth['count']<2].index))]
print(f'All subreddits with more than 2 authors: {len(df.subreddit.unique())}')
# Witchcraft has no cultural appropriation themes, maybe conflicting keywords.
df = df.loc[df['subreddit'] != 'witchcraft']
# Same for KUWTK
df = df.loc[df['subreddit'] != 'KUWTK']

len(df.subreddit.unique())

Number of subreddits original: 3027
All subreddits with more than 1 comment: 1670
All subreddits with more than 2 authors: 1616


1614

In [112]:
# Remove all comments that has less than 4 words
print(f'Number of comments: {len(df.id.unique())}')
df = df[df['body'].str.split().str.len() > 3]
print(f'Number of comments after removal of short: {len(df.id.unique())}')

Number of comments: 24945
Number of comments after removal of short: 24422


In [113]:
# Remove all lines where subreddit or author is deleted
print(f'Number of subreddits: {len(df.subreddit.unique())}')
df = df.loc[(~df['author'].str.contains('deleted')) | (~df['subreddit'].str.contains('deleted'))]
print(f'Number of subreddits after clean 1: {len(df.subreddit.unique())}')

Number of subreddits: 1613
Number of subreddits after clean 1: 1613


# Dataset enrichment with topics

In [114]:
# GPT-3 Ada model for topic extraction
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import json
import re

# Authenticate to OpenAI API
api_key = '<your-api-key-here>''
headers = {'Content-Type': 'application/json',
           'Authorization': f'Bearer {api_key}'}

session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

# Define the function to extract topics
def extract_topics(text):
    #parsed_text = text.replace('\n',' ')
    #parsed_text = re.sub(r'[^\w]', '', parsed_text)
    data = {
        "prompt":f"Comment: {text}\n\nTopics:", 
        "temperature":0.5,
        "max_tokens":60,
        "model": "text-ada-001",
        "stop": ["\n\n"]
    }
    
    # Send request
    response = session.post('https://api.openai.com/v1/completions', headers=headers, data=json.dumps(data))
    
    # Extract topics
    try:
        topics = response.json()['choices'][0]['text'].strip()
        topics = topics.split(', ')
    except:
        topics = []

    return [t.lower() for t in topics if not any(i.isdigit() for i in t)]

# Apply the function to the 'text' column of the dataframe
df['topics'] = df['body'].apply(lambda x: extract_topics(x))
#df_test = df.copy()
#df_test['topics'] = df['body'].iloc[:10].apply(lambda x: extract_topics(x))

In [115]:
df.to_csv('comments_2022_step1.csv', index=False)

# Dataset clean step 2

In [3]:
df = pd.read_csv('comments_2022_step1.csv', header=0)
df['topics'] = df['topics'].apply(lambda x: x.replace('[','').replace(']','').replace("'",'').split(', '))
df.head()

Unnamed: 0,id,parent_id,author,subreddit,body,topics
0,hqqvfvb,t1_hqqt3r3,BasicComplexities,VaushV,"\n&gt;. You use French people flippantly, but ...","[racist slurs, non-racial slurs, words, slurs]"
1,hqqxbu8,t1_hqqhgr2,RagingAardvark,daddit,Thank you! I was actually recently mulling ove...,"[encanto, diversity, cultural appropriation, w..."
2,hqqz28x,t3_rsooda,malarky-b,asianamerican,Maybe I'm just tired but I don't get the comme...,[ad]
3,hqr027z,t1_hqqz64g,AdventurousAnxiety78,Afghan,"No, the coins are one of many diverse versions...","[hazara, traditional dress, turkic people, ara..."
4,hqr0j0w,t3_rt7dak,stjeana,TooAfraidToAsk,Cultural appropriation is vs and doesnt have a...,"[cultural appropriation, cool, respect, culture]"


In [4]:
import re

# Clean topic names
df['topics'] = df['topics'].apply(lambda x: [t.replace('"', '').replace("'",'') for t in x])
# Clean other errors
df['topics'] = df['topics'].apply(lambda x: [t.replace('-',' ').replace('r/','') for t in x])
df['topics'] = df['topics'].apply(lambda x: [t for t in x if '_' not in t])
# Remove too short, too long and question topics
def check_splits(topic):
    to_return = []
    a = re.split(r' |,|\\n|\.|;', topic)
    # Don't add topics with more than 2 words
    if len(a) < 3:
        for x in a:
            if len(x) > 3:
                to_return.append(x)
    return ' '.join(to_return)
df['topics'] = df['topics'].apply(lambda x: [check_splits(t) for t in x])
# Remove all hashtags
df['topics'] = df['topics'].apply(lambda x: [t.replace('#','') for t in x])
# Remove html chars
df['topics'] = df['topics'].apply(lambda x: [t.replace('&gt;','').replace('&lt;','') for t in x])
# Remove links
df['topics'] = df['topics'].apply(lambda x: [t for t in x if "http" not in t])

In [5]:
# Remove all topics that are less than 4 letters
df['topics'] = df['topics'].apply(lambda x: [t for t in x if len(t) > 3])

In [6]:
# Remove all non english topics
import nltk
nltk.download('words')
english_words = set(nltk.corpus.words.words())
def is_english(topic):
    a = topic.split(' ')
    to_return = []
    for x in a:
        if x.lower() in english_words:
            to_return.append(x)
    return ' '.join(to_return)

df['topics'] = df['topics'].apply(lambda x: [is_english(t) for t in x])
# Remove now empty topics
df['topics'] = df['topics'].apply(lambda x: [t for t in x if len(t)>0])

[nltk_data] Error loading words: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     Hostname mismatch, certificate is not valid for
[nltk_data]     'raw.githubusercontent.com'. (_ssl.c:1129)>


In [7]:
import nltk
from nltk.tokenize import word_tokenize

def is_noun(text):
    # Tokenize the text
    words = word_tokenize(text)

    # Part of speech tagging
    tagged_words = nltk.pos_tag(words)

    # Check if at least one word is a noun
    for word, tag in tagged_words:
        if tag == 'NN':
            return True

    return False

df['topics'] = df['topics'].apply(lambda x: [t for t in x if is_noun(t)])

In [8]:
#manual clean
def is_noun(text):
    if len(text.split(" ")) == 1:
        if text.endswith("ent") or text.endswith('er') or text.endswith('est') or text.endswith('ive') or text.endswith('y') or text.endswith("ful") or text.endswith('al') or text.endswith('ung') or text.endswith('th') or text.endswith('ing'):
            return False
    return True

df['topics'] = df['topics'].apply(lambda x: [t for t in x if is_noun(t)])

In [9]:
import nltk
import pandas as pd

nltk.download('averaged_perceptron_tagger')

def remove_verbs(df, column):
    for i, row in df.iterrows():
        strings = row[column]
        strings = [string for string in strings if not contains_verb(string)]
        # Update the row with the list of strings without verbs
        df.at[i, column] = strings
    return df

def contains_verb(string):
    words = nltk.word_tokenize(string)
    tagged_words = nltk.pos_tag(words)
    # Check if the string contains a verb
    return any(tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'] for word, tag in tagged_words)

df = remove_verbs(df, 'topics')

[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify
[nltk_data]     failed: Hostname mismatch, certificate is not valid
[nltk_data]     for 'raw.githubusercontent.com'. (_ssl.c:1129)>


In [10]:
import nltk
from nltk.corpus import wordnet
import pandas as pd

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

def replace_adj_with_noun(df, column):
    for i, row in df.iterrows():
        strings = row[column]
        new_strings = []
        for string in strings:
            words = nltk.word_tokenize(string)
            tagged_words = nltk.pos_tag(words)
            new_string = []
            for word, tag in tagged_words:
                # Replace the adjective with its corresponding noun
                if tag == 'JJ':
                    synsets = wordnet.synsets(word, pos='a')
                    if len(synsets) > 0:
                        lemmas = synsets[0].lemmas()
                        if len(lemmas) > 0:
                            p = lemmas[0].pertainyms()
                            if len(p) > 0:
                                new_word = p[0].name()
                                new_string.append(new_word)
                                continue
                new_string.append(word)
            new_strings.append(' '.join(new_string))
        # Update the row with the list of strings with replaced adjectives
        df.at[i, column] = new_strings
    return df

df = replace_adj_with_noun(df, 'topics')

[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     Hostname mismatch, certificate is not valid for
[nltk_data]     'raw.githubusercontent.com'. (_ssl.c:1129)>
[nltk_data] Error loading averaged_perceptron_tagger: <urlopen error
[nltk_data]     [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify
[nltk_data]     failed: Hostname mismatch, certificate is not valid
[nltk_data]     for 'raw.githubusercontent.com'. (_ssl.c:1129)>


In [11]:
import pandas as pd

def uniform_strings(df, column):
    # Create a dictionary to store the mapping from words to uniformed strings
    mapping = {}
    uniformed_strings = []
    for i, row in df.iterrows():
        strings = row[column]
        for j, string in enumerate(strings):
            # Split the string into words and sort them
            words = string.split()
            words.sort()
            # Join the words back together to create a key for the mapping
            key = " ".join(words)
            # If the key is not in the mapping, add it and the original string
            if key not in mapping:
                mapping[key] = string
                uniformed_strings.append(string)
            # Replace the string with the uniformed string
            strings[j] = mapping[key]
        # Update the row with the uniformed strings
        df.at[i, column] = strings
    return df

df = uniform_strings(df, 'topics')

In [26]:
from itertools import combinations
combi = set([t for x in list(df['topics']) for t in x if len(t.split(" ")) > 1])

def check_combi(strings):
    new_list = strings
    word_combinations = list(combinations(strings, 2))
    word_combinations = [t for t in word_combinations if len(t) < 3]
    for word in word_combinations:
        if ' '.join(word) in combi:
            if word[0] in new_list:
                new_list.remove(word[0])
            if word[1] in new_list:
                new_list.remove(word[1])
            new_list.append(' '.join(word))
    return new_list

df['topics'] = df['topics'].apply(check_combi)

In [28]:
# Remove all comments without topics
print(f'Number of comments: {len(df.id.unique())}')
df = df.loc[df['topics'].apply(lambda x: len(x) > 0)]
print(f'Number of comments after cleaning: {len(df.id.unique())}')

Number of comments: 20752
Number of comments after cleaning: 20752


In [None]:
df.to_csv('comments_2022_step2.csv', index=False)

# Enrichment with sub-type

In [None]:
df = pd.read_csv('comments_2022_step2.csv', header=0)
df['topics'] = df['topics'].apply(lambda x: x.replace('[','').replace(']','').replace("'",'').split(', '))
df.head()

In [82]:
# Remove AutoModerator
df = df.loc[df['author']!='AutoModerator']

In [579]:
a_keywords = [    "symbol", "art", "flag", "history", "tattoo", "mask", "tribe", "ancestral",    "heritage",    "totem",    "headdress",    "regalia",    "jewelry",    "indigenous",    "ceremonial",    "ritual",    "sacred",    "spiritual",    "identity",    "legacy"]

m_keywords = [    "hair", "haircut", "dread", "cloth", "cornrow", "style", "pierc", "wore", "coloniz", "wear", "misappropriation",    "ethnic",    "native",    "dress",    "attire",    "outfit"]

l_keywords = [    "philosoph", "english", "hola", "language", "translate", "interpreter", "name", "linguistic",    "vernacular",    "dialect",    "accent",    "identity",    "heritage",    "expression",    "borrowing"]

s_keywords = [    "blackfac", "whitewash", "song", "sing", "movie", "caricat", "equality", "character", "racis","gatekeep", "born", "stereotyping",    "misrepresentation",    "stereotype",    "portrayal",    "sensitivity",    "awareness",    "diversity",    "respect",    "tolerance"]

i_keywords = [    "ramen", "jew", "pagan", "pizza", "preach", "gospel", "coffee", "book","food", "christ", "cook", "recipe", "siddur", "pray", "myth", "bow", "religion", "insensitivity",    "religious",    "practices",    "respect",    "tolerance",    "identity",    "heritage",    "expression"]

e_keywords = [   "compan", "anime", "commerc", "rap", "exploit", "sexuali", "music", "exploitation",    "commodification",    "profit",    "commercialization",    "tourism",    "revenue"]

In [580]:
def get_subtypes(comment):
    sub_types = []
    for word in a_keywords:
        if word in comment.lower():
            sub_types.append('A')
    for word in m_keywords:
        if word in comment.lower():
            sub_types.append('M')
    for word in l_keywords:
        if word in comment.lower():
            sub_types.append('L')
    for word in s_keywords:
        if word in comment.lower():
            sub_types.append('S')
    for word in i_keywords:
        if word in comment.lower():
            sub_types.append('I')
    for word in e_keywords:
        if word in comment.lower():
            sub_types.append('E')
    return list(set(sub_types))

df['sub-types'] = df['body'].apply(get_subtypes)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sub-types'] = df['body'].apply(get_subtypes)


Unnamed: 0,id,parent_id,author,subreddit,body,topics,sub-types
0,hqqvfvb,t1_hqqt3r3,BasicComplexities,VaushV,"\n&gt;. You use French people flippantly, but ...",[racist],"[L, S, M, A]"
1,hqqxbu8,t1_hqqhgr2,RagingAardvark,daddit,Thank you! I was actually recently mulling ove...,"[culture appropriation, white supremacy, culture]","[L, S, E, M]"
3,hqr027z,t1_hqqz64g,AdventurousAnxiety78,Afghan,"No, the coins are one of many diverse versions...",[traditional dress],"[M, A]"
4,hqr0j0w,t3_rt7dak,stjeana,TooAfraidToAsk,Cultural appropriation is vs and doesnt have a...,"[culture appropriation, cool, respect, culture]","[I, S]"
5,hqr0ska,t3_rt0uqs,kbell2020,Dreadlocks,"Hi,\n\nWhite mom here, straight hair. My son i...",[culture appropriation],[M]
...,...,...,...,...,...,...,...
24417,i6uidyg,t3_ufmjfc,Fafgarth,TooAfraidToAsk,"it's quite easy : \n\nIf you are white, it's A...",[appreciation],[]
24418,i6uiqe4,t1_i6sy5fh,datnoob9113,PoliticalCompassMemes,"You say this but the view ""cultural appropriat...","[culture appropriation, view, term]",[]
24419,i6uisj4,t1_i6uhf2c,KingCrow27,TooAfraidToAsk,Because those are literally the types of peopl...,[culture appropriation],"[L, I, A]"
24420,i6uiv27,t3_ufmjfc,Catbunny123,TooAfraidToAsk,Lol how is Korean beauty appropriative? It's j...,"[cool, fashion, fashion designer, fashion indu...","[S, E, M, I, A]"


In [581]:
df.loc[df['sub-types'].map(len)==0]

Unnamed: 0,id,parent_id,author,subreddit,body,topics,sub-types
8,hqr659v,t3_rt8733,JustMeHere8431,ireland,"Cultural appropriation, we need to cancel him 😂😂","[culture appropriation, rework]",[]
14,hqrk3bg,t1_hqrjvzf,OkScheme625,Genshin_Impact,what are you even talking about the reason nik...,[culture],[]
17,hqrnwut,t1_hqom3d4,e-s-p,AskMen,The internet has given us a disconnect from th...,"[information, culture appropriation]",[]
24,hqryx9u,t3_rtcczg,Jules8432,toronto,Fireworks are cultural appropriation from Chin...,[china],[]
27,hqs5d9x,t3_rtd6hj,Competitive-Comb4574,TrueOffMyChest,'cultural appropriation' is a ridiculous concept.,[culture appropriation],[]
...,...,...,...,...,...,...,...
24404,i6ufir2,t3_ufggrn,KungThulhu,TooAfraidToAsk,cultural appropriation is a twitter thing and ...,[culture appropriation],[]
24405,i6ufl8n,t3_ufggrn,TooBusySaltMining,TooAfraidToAsk,"It's a small, very vocal and very stupid minor...","[style, music, culture fashion, culture fashion]",[]
24408,i6ugbqy,t3_ufggrn,anonym_ami,TooAfraidToAsk,America wouldn’t be the melting pot that it is...,"[culture appropriation, immigration, immigration]",[]
24417,i6uidyg,t3_ufmjfc,Fafgarth,TooAfraidToAsk,"it's quite easy : \n\nIf you are white, it's A...",[appreciation],[]


In [588]:
df = df.loc[df['subreddit']!='[deleted]']
df = df.loc[df['author']!='[deleted]']

In [586]:
df.to_csv('comments_2022_step3.csv', index=False)