In [3]:
import pandas as pd
import re

# Load the raw CSV and pull out the two columns of interest as plain lists
df = pd.read_csv('gender_data.csv', sep=',', encoding='utf-8')
label = df['gender'].to_list()
text = df['description'].to_list()

# Build a two-column frame (label, text), discard rows whose label is
# 'brand' or 'unknown', renumber the index, then drop rows with a missing value
data = pd.DataFrame({'label': label, 'text': text})
mask_1 = data['label'].isin(['brand', 'unknown'])
data = data.loc[~mask_1].reset_index(drop=True).dropna()

In [4]:
# Relabel 'male' as 'm' and 'female' as 'f'

def change_label(label):
    """Return the one-letter code for a gender label.

    'male' maps to 'm' and 'female' maps to 'f'. Any other value is
    returned unchanged.
    """
    if label == 'male':
        return 'm'
    if label == 'female':
        return 'f'
    return label

# Recode every entry of the label column
data['label'] = data['label'].apply(change_label)

In [5]:
# Remove emoji from the text

# Compiled once rather than on every call. Every range is listed explicitly:
# a single catch-all range from U+24C2 to U+1F251 would also match CJK
# ideographs, Kana, Hangul and most other scripts above U+24C2, deleting
# ordinary text together with the emoji.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (regional indicators)
    "\U0001F000-\U0001F251"  # mahjong/domino/cards, enclosed alphanumerics & ideographs
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2700-\u27BF"          # dingbats
    "\u2B00-\u2BFF"          # miscellaneous symbols and arrows
    "\u25AA\u25AB\u25B6\u25C0\u25FB-\u25FE"  # geometric shapes used as emoji
    "\u2934\u2935"           # curved arrows
    "\u3030\u303D\u3297\u3299"  # wavy dash, part alternation mark, circled ideographs
    "\u24C2"                 # circled M
    "\uFE0F"                 # emoji variation selector
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return ``text`` with emoji characters removed.

    Only code points in the explicit emoji ranges of ``_EMOJI_PATTERN`` are
    deleted; letters of any script (including CJK, Kana and Hangul) are kept.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        ``text`` with every run of matched emoji characters replaced by the
        empty string. Surrounding whitespace is left untouched.

    Raises
    ------
    TypeError
        If ``text`` is not a string (for example a float NaN).
    """
    return _EMOJI_PATTERN.sub('', text)

# Strip emoji from every entry of the text column
data['text'] = data['text'].apply(remove_emoji)

In [13]:
# Patterns are compiled once and reused for every row.
_HTML_ENTITIES = ("&gt;", "&lt;", "&amp;")
# Match the placeholders only as whole tokens. A plain substring replace of
# "url" / "@user" would also eat the middle of ordinary words
# ("curly" -> "cy") and the front of real handles ("@username" -> "name").
# NOTE(review): matching stays case-sensitive, as before; confirm whether
# upper-case "URL" / "@USER" placeholders also occur in the data.
_PLACEHOLDER_RE = re.compile(r'\burl\b|@user\b')
# Special characters (including '#' and '@'), punctuation and digits
_SPECIAL_CHARS_RE = re.compile(r'[?$%&()\-_=+\]\[{}!<>^".,;@#\d]')
_WHITESPACE_RE = re.compile(r'\s+')


def _clean_text(text):
    """Normalise one text string.

    Steps, in order:
    1. delete the HTML entities ``&gt;``, ``&lt;`` and ``&amp;``;
    2. delete the stand-alone placeholder tokens ``url`` and ``@user``;
    3. delete special characters, punctuation and digits;
    4. collapse every run of whitespace into a single space.

    Leading or trailing whitespace is collapsed to one space, not stripped.
    """
    for entity in _HTML_ENTITIES:
        text = text.replace(entity, "")
    text = _PLACEHOLDER_RE.sub('', text)
    text = _SPECIAL_CHARS_RE.sub('', text)
    return _WHITESPACE_RE.sub(' ', text)


data['text'] = data['text'].apply(_clean_text)

# Drop rows whose text is empty or a single space after cleaning,
# renumber the index, then drop any row with a missing value
mask_2 = data['text'].isin(['', ' '])
data = data[~mask_2].reset_index(drop=True)
data = data.dropna()

In [14]:
# Write the cleaned frame to disk as a tab-separated file, overwriting any
# existing file and omitting the index column
data.to_csv(
    'ext_data.csv',
    sep='\t',
    index=False,
    mode='w',
)