## Notebook Containing Text Pre-Processing Pipelines
### ACL22 SRW confidential submission
---

In [1]:
import numpy as np
import pandas as pd
from argparse import ArgumentParser
import importlib
import glob
import re
import os

import nltk 
nltk.download('words')
from nltk.tokenize import word_tokenize
nltk.download('punkt')

from helpers import remove_emojis


[nltk_data] Downloading package words to
[nltk_data]     /Users/danielfurman/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/danielfurman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# authorship attribution internal vars

experiment_name = 'v0' #folder name for saving new outputs
chars_valid = ["a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p""q","r","s","t","u","v","w","x","y","z",
               "A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z",
               "1","2","3","4","5","6","7","8","9","0","@","#","<",">"]

write_data_dir = 'acl22-data/intermediate-data/cleaned-data-authorship-attribution/'+experiment_name
if not os.path.isdir(write_data_dir):
    os.mkdir(write_data_dir)

In [3]:
"""
1. authorship attribution twitter
"""

file_list = glob.glob('acl22-data/raw-data/raw-data-authorship-attribution/*-twitter.csv')
len(file_list)
print('\n\n', file_list, '\n\n')
pd_df_list = []
for f in file_list:
    pd_df_list.append(pd.read_csv(f, header=None))

print('\n\n', pd_df_list[0].sample(2), '\n\n')
print('\n\n', pd_df_list[1].sample(2), '\n\n')
print('\n\n', pd_df_list[2].sample(2), '\n\n')

# main pipeline twitter data
for i in range(0, len(pd_df_list)):

    # if there are at least 62 lines:
    if len(pd_df_list[i][1])>=62*2:
        for j in range(0,61*2):
            string=str(pd_df_list[i][1].loc[j])

            # remove emojis 
            string = remove_emojis(string)

            string_list = string.split(' ')
            itr = 0
            for word in string_list:
                if len(word) > 0:

                    #replacing <user>
                    if word[0] == "@":
                        string_list[itr] = '@<user>'

                    #replacing <url>
                    elif word[0:5] == 'https':
                        string_list[itr] = '<url>'
            
                itr+=1

            string = " ".join(string_list)

            #replacing invalid chars
            for word in string_list:
                if len(word) > 0:
                    for ii in range(0, len(word)):
                        if word[ii] not in chars_valid:
                            string = string.replace(word[ii],'')
                            
            # tokenize
            words_tok = word_tokenize(string)
            sent = " ".join(words_tok)
            sent = sent.replace('@ < user >', '@<user>')
            sent = sent.replace('< url >', '<url>')

            # replacing multiple spaces
            sent = sent.replace('  ', ' ')
            sent = sent.replace('   ', ' ')
            sent = sent.replace('    ', ' ')
            sent = sent.replace('     ', ' ')
            sent = sent.replace('      ', ' ')

            # replace breaks
            sent = sent.replace("break", "<break>")
            sent = sent.replace("BREAK", "<break>")

            
            sent = sent.replace("\n", "<break>")
            pd_df_list[i][1].loc[j] = sent
        pd_df_list[i] = pd_df_list[i].loc[0:61*2] 

    # else there are less than 62 lines
    else:
        for j in range(0,len(pd_df_list[i][1])):
            string=str(pd_df_list[i][1].loc[j])

            # remove emojis 
            string = remove_emojis(string)

            string_list = string.split(' ')
            itr = 0
            for word in string_list:
                if len(word) > 0:

                    #replacing <user>
                    if word[0] == "@":
                        string_list[itr] = '@<user>'

                    #replacing <url>
                    elif word[0:5] == 'https':
                        string_list[itr] = '<url>'
            
                itr+=1

            string = " ".join(string_list)

            #replacing invalid chars
            for word in string_list:
                if len(word) > 0:
                    for ii in range(0, len(word)):
                        if word[ii] not in chars_valid:
                            string = string.replace(word[ii],'')
                            
            # tokenize
            words_tok = word_tokenize(string)
            sent = " ".join(words_tok)
            sent = sent.replace('@ < user >', '@<user>')
            sent = sent.replace('< url >', '<url>')

            # replacing multiple spaces
            sent = sent.replace('  ', ' ')
            sent = sent.replace('   ', ' ')
            sent = sent.replace('    ', ' ')
            sent = sent.replace('     ', ' ')
            sent = sent.replace('      ', ' ')

            # replace breaks
            sent = sent.replace("break", "<break>")
            sent = sent.replace("\n", "<break>")
            sent = sent.replace("BREAK", "<break>")
            
            pd_df_list[i][1].loc[j] = sent

        pd_df_list[i] = pd_df_list[i].loc[0:len(pd_df_list[i][1])]

# group into one pandas dataframe
for i in range(0, len(pd_df_list)):
    if i == 0:
        pd_df = pd_df_list[i]
    else:
        pd_df = pd.concat([pd_df, pd_df_list[i]])

pd_df.columns = ['handle', 'post']

pd_df['target'] = pd_df['handle']
pd_df['target'][pd_df['handle']=='KimKardashian'] = 0
pd_df['target'][pd_df['handle']=='KendallJenner'] = 1
pd_df['target'][pd_df['handle']=='KylieJenner'] = 2

pd_df[['post', 'target', 'handle']].to_csv(write_data_dir+'/twitter_celeb_attribution.csv')
pd_df[['post', 'target', 'handle']].tail(5)

# pipeline for mixing twitter data (half the size)
for i in range(0, len(pd_df_list)):

    # if there are at least 62 lines:
    if len(pd_df_list[i][1])>=62:
        for j in range(0,61):
            string=str(pd_df_list[i][1].loc[j])

            # remove emojis 
            string = remove_emojis(string)

            string_list = string.split(' ')
            itr = 0
            for word in string_list:
                if len(word) > 0:

                    #replacing <user>
                    if word[0] == "@":
                        string_list[itr] = '@<user>'

                    #replacing <url>
                    elif word[0:5] == 'https':
                        string_list[itr] = '<url>'
            
                itr+=1

            string = " ".join(string_list)

            #replacing invalid chars
            for word in string_list:
                if len(word) > 0:
                    for ii in range(0, len(word)):
                        if word[ii] not in chars_valid:
                            string = string.replace(word[ii],'')
                            
            # tokenize
            words_tok = word_tokenize(string)
            sent = " ".join(words_tok)
            sent = sent.replace('@ < user >', '@<user>')
            sent = sent.replace('< url >', '<url>')

            # replacing multiple spaces
            sent = sent.replace('  ', ' ')
            sent = sent.replace('   ', ' ')
            sent = sent.replace('    ', ' ')
            sent = sent.replace('     ', ' ')
            sent = sent.replace('      ', ' ')

            # replace breaks
            sent = sent.replace("break", "<break>")
            sent = sent.replace("BREAK", "<break>")

            
            sent = sent.replace("\n", "<break>")
            pd_df_list[i][1].loc[j] = sent
        pd_df_list[i] = pd_df_list[i].loc[0:61] 

    # else there are less than 62 lines
    else:
        for j in range(0,len(pd_df_list[i][1])):
            string=str(pd_df_list[i][1].loc[j])

            # remove emojis 
            string = remove_emojis(string)

            string_list = string.split(' ')
            itr = 0
            for word in string_list:
                if len(word) > 0:

                    #replacing <user>
                    if word[0] == "@":
                        string_list[itr] = '@<user>'

                    #replacing <url>
                    elif word[0:5] == 'https':
                        string_list[itr] = '<url>'
            
                itr+=1

            string = " ".join(string_list)

            #replacing invalid chars
            for word in string_list:
                if len(word) > 0:
                    for ii in range(0, len(word)):
                        if word[ii] not in chars_valid:
                            string = string.replace(word[ii],'')
                            
            # tokenize
            words_tok = word_tokenize(string)
            sent = " ".join(words_tok)
            sent = sent.replace('@ < user >', '@<user>')
            sent = sent.replace('< url >', '<url>')

            # replacing multiple spaces
            sent = sent.replace('  ', ' ')
            sent = sent.replace('   ', ' ')
            sent = sent.replace('    ', ' ')
            sent = sent.replace('     ', ' ')
            sent = sent.replace('      ', ' ')

            # replace breaks
            sent = sent.replace("break", "<break>")
            sent = sent.replace("\n", "<break>")
            sent = sent.replace("BREAK", "<break>")
            
            pd_df_list[i][1].loc[j] = sent

        pd_df_list[i] = pd_df_list[i].loc[0:len(pd_df_list[i][1])]

# group into one pandas dataframe
for i in range(0, len(pd_df_list)):
    if i == 0:
        pd_df = pd_df_list[i]
    else:
        pd_df = pd.concat([pd_df, pd_df_list[i]])

pd_df.columns = ['handle', 'post']

pd_df['target'] = pd_df['handle']
pd_df['target'][pd_df['handle']=='KimKardashian'] = 0
pd_df['target'][pd_df['handle']=='KendallJenner'] = 1
pd_df['target'][pd_df['handle']=='KylieJenner'] = 2

pd_df[['post', 'target', 'handle']].to_csv(write_data_dir+'/twitter_mixing_celeb_attribution.csv')
pd_df[['post', 'target', 'handle']].tail(5)



 ['acl22-data/raw-data/raw-data-authorship-attribution/KendallJenner-raw-scraped-twitter.csv', 'acl22-data/raw-data/raw-data-authorship-attribution/KimKardashian-raw-scraped-twitter.csv', 'acl22-data/raw-data/raw-data-authorship-attribution/KylieJenner-raw-scraped-twitter.csv'] 




                  0                                                  1
151  KendallJenner  NEW @Versace CAMPAIGN BY MERT&amp;MARCUS https...
114  KendallJenner  wearing our KENDALL X KYLIE makeup collab and ... 




                  0                                                  1
25   KimKardashian  💞 Pinkalicious Balenciaga 💞 https://t.co/mbBng...
107  KimKardashian  The final decision will be up to Gov. Kevin St... 




              0                                                  1
5  KylieJenner  one hour to go!!! https://t.co/bDaiohhXCV http...
7  KylieJenner  restocking my @kyliebaby sets, bundles and moi... 




Unnamed: 0,post,target,handle
57,leo season loading <url>,2,KylieJenner
58,theres a lot of eole trying to get on the site...,2,KylieJenner
59,the last eisode of INSIDE KYLIE COSMETICS just...,2,KylieJenner
60,1 hour to go <url> @<user> 9am PST,2,KylieJenner
61,KYLIE COSMETICS OFFICIALLY RELAUNCHES TOMORROW...,2,KylieJenner


In [4]:
"""
2. authorship attribution instagram
"""

file_list = glob.glob('acl22-data/raw-data/raw-data-authorship-attribution/*-instagram.csv')
len(file_list)
print('\n\n', file_list, '\n\n')
pd_df_list = []
for f in file_list:
    pd_df_list.append(pd.read_csv(f, header=None))

print('\n\n', pd_df_list[0].sample(2), '\n\n')
print('\n\n', pd_df_list[1].sample(2), '\n\n')
print('\n\n', pd_df_list[2].sample(2), '\n\n')

# main pipeline
for i in range(0, len(pd_df_list)):

    # if there are at least 62 lines:
    if len(pd_df_list[i][1])>=62:
        for j in range(0,61):
            string=str(pd_df_list[i][1].loc[j])

            # remove emojis 
            string = remove_emojis(string)

            string_list = string.split(' ')
            itr = 0
            for word in string_list:
                if len(word) > 0:

                    #replacing <user>
                    if word[0] == "@":
                        string_list[itr] = '@<user>'

                    #replacing <url>
                    elif word[0:5] == 'https':
                        string_list[itr] = '<url>'
            
                itr+=1

            string = " ".join(string_list)

            #replacing invalid chars
            for word in string_list:
                if len(word) > 0:
                    for ii in range(0, len(word)):
                        if word[ii] not in chars_valid:
                            string = string.replace(word[ii],'')
                            
            # tokenize
            words_tok = word_tokenize(string)
            sent = " ".join(words_tok)
            sent = sent.replace('@ < user >', '@<user>')
            sent = sent.replace('< url >', '<url>')

            # replacing multiple spaces
            sent = sent.replace('  ', ' ')
            sent = sent.replace('   ', ' ')
            sent = sent.replace('    ', ' ')
            sent = sent.replace('     ', ' ')
            sent = sent.replace('      ', ' ')

            # replace breaks
            sent = sent.replace("break", "<break>")
            sent = sent.replace("BREAK", "<break>")

            
            sent = sent.replace("\n", "<break>")
            pd_df_list[i][1].loc[j] = sent
        pd_df_list[i] = pd_df_list[i].loc[0:61] 

    # else there are less than 62 lines
    else:
        for j in range(0,len(pd_df_list[i][1])):
            string=str(pd_df_list[i][1].loc[j])

            # remove emojis 
            string = remove_emojis(string)

            string_list = string.split(' ')
            itr = 0
            for word in string_list:
                if len(word) > 0:

                    #replacing <user>
                    if word[0] == "@":
                        string_list[itr] = '@<user>'

                    #replacing <url>
                    elif word[0:5] == 'https':
                        string_list[itr] = '<url>'
            
                itr+=1

            string = " ".join(string_list)

            #replacing invalid chars
            for word in string_list:
                if len(word) > 0:
                    for ii in range(0, len(word)):
                        if word[ii] not in chars_valid:
                            string = string.replace(word[ii],'')
                            
            # tokenize
            words_tok = word_tokenize(string)
            sent = " ".join(words_tok)
            sent = sent.replace('@ < user >', '@<user>')
            sent = sent.replace('< url >', '<url>')

            # replacing multiple spaces
            sent = sent.replace('  ', ' ')
            sent = sent.replace('   ', ' ')
            sent = sent.replace('    ', ' ')
            sent = sent.replace('     ', ' ')
            sent = sent.replace('      ', ' ')

            # replace breaks
            sent = sent.replace("break", "<break>")
            sent = sent.replace("\n", "<break>")
            sent = sent.replace("BREAK", "<break>")
            
            pd_df_list[i][1].loc[j] = sent

        pd_df_list[i] = pd_df_list[i].loc[0:len(pd_df_list[i][1])]

# group into one pandas dataframe
for i in range(0, len(pd_df_list)):
    if i == 0:
        pd_df = pd_df_list[i]
    else:
        pd_df = pd.concat([pd_df, pd_df_list[i]])

pd_df.columns = ['handle', 'post']

pd_df['target'] = pd_df['handle']
pd_df['target'][pd_df['handle']=='KimKardashian'] = 0
pd_df['target'][pd_df['handle']=='KendallJenner'] = 1
pd_df['target'][pd_df['handle']=='KylieJenner'] = 2

pd_df[['post', 'target', 'handle']].to_csv(write_data_dir+'/instagram_celeb_attribution.csv')
pd_df[['post', 'target', 'handle']].sample(5)



 ['acl22-data/raw-data/raw-data-authorship-attribution/KylieJenner-raw-scraped-instagram.csv', 'acl22-data/raw-data/raw-data-authorship-attribution/KendallJenner-raw-scraped-instagram.csv', 'acl22-data/raw-data/raw-data-authorship-attribution/KimKardashian-raw-scraped-instagram.csv'] 




               0          1
46  KylieJenner    XXIII 🖤
20  KylieJenner  you call? 




                0                                           1
2  KendallJenner  thank you for all the birthday wishes ❤️‍🔥
1  KendallJenner                                         🤍🤍🤍 




                 0                                                  1
55  KimKardashian  My baby boy Psalm is the sweetest! He just sta...
2   KimKardashian                                     Heart and Soul 




Unnamed: 0,post,target,handle
35,Im so excited that @<user> is now available at...,2,KylieJenner
31,These two are besties True and Psalm,0,KimKardashian
34,@<user> in SOHO by @<user>,1,KendallJenner
5,Thank you so much to the @<user> for the honor...,0,KimKardashian
57,FAM,0,KimKardashian


In [5]:
"""
3. authorship attribution facebook
"""

#still need to do

file_list = glob.glob('acl22-data/raw-data/raw-data-authorship-attribution/*-facebook.csv')
len(file_list)
print('\n\n', file_list, '\n\n')
pd_df_list = []
for f in file_list:
    pd_df_list.append(pd.read_csv(f, header=None))

print('\n\n', pd_df_list[0].sample(2), '\n\n')
print('\n\n', pd_df_list[1].sample(2), '\n\n')
print('\n\n', pd_df_list[2].sample(2), '\n\n')

# main pipeline
for i in range(0, len(pd_df_list)):

    # if there are at least 62 lines:
    if len(pd_df_list[i][1])>=62:
        for j in range(0,61):
            string=str(pd_df_list[i][1].loc[j])

            # remove emojis 
            string = remove_emojis(string)

            string_list = string.split(' ')
            itr = 0
            for word in string_list:
                if len(word) > 0:

                    #replacing <user>
                    if word[0] == "@":
                        string_list[itr] = '@<user>'

                    #replacing <url>
                    elif word[0:5] == 'https':
                        string_list[itr] = '<url>'
            
                itr+=1

            string = " ".join(string_list)

            #replacing invalid chars
            for word in string_list:
                if len(word) > 0:
                    for ii in range(0, len(word)):
                        if word[ii] not in chars_valid:
                            string = string.replace(word[ii],'')
                            
            # tokenize
            words_tok = word_tokenize(string)
            sent = " ".join(words_tok)
            sent = sent.replace('@ < user >', '@<user>')
            sent = sent.replace('< url >', '<url>')

            # replacing multiple spaces
            sent = sent.replace('  ', ' ')
            sent = sent.replace('   ', ' ')
            sent = sent.replace('    ', ' ')
            sent = sent.replace('     ', ' ')
            sent = sent.replace('      ', ' ')

            # replace breaks
            sent = sent.replace("break", "<break>")
            sent = sent.replace("BREAK", "<break>")

            
            sent = sent.replace("\n", "<break>")
            pd_df_list[i][1].loc[j] = sent
        pd_df_list[i] = pd_df_list[i].loc[0:61] 

    # else there are less than 62 lines
    else:
        for j in range(0,len(pd_df_list[i][1])):
            string=str(pd_df_list[i][1].loc[j])

            # remove emojis 
            string = remove_emojis(string)

            string_list = string.split(' ')
            itr = 0
            for word in string_list:
                if len(word) > 0:

                    #replacing <user>
                    if word[0] == "@":
                        string_list[itr] = '@<user>'

                    #replacing <url>
                    elif word[0:5] == 'https':
                        string_list[itr] = '<url>'
            
                itr+=1

            string = " ".join(string_list)

            #replacing invalid chars
            for word in string_list:
                if len(word) > 0:
                    for ii in range(0, len(word)):
                        if word[ii] not in chars_valid:
                            string = string.replace(word[ii],'')
                            
            # tokenize
            words_tok = word_tokenize(string)
            sent = " ".join(words_tok)
            sent = sent.replace('@ < user >', '@<user>')
            sent = sent.replace('< url >', '<url>')

            # replacing multiple spaces
            sent = sent.replace('  ', ' ')
            sent = sent.replace('   ', ' ')
            sent = sent.replace('    ', ' ')
            sent = sent.replace('     ', ' ')
            sent = sent.replace('      ', ' ')

            # replace breaks
            sent = sent.replace("break", "<break>")
            sent = sent.replace("\n", "<break>")
            sent = sent.replace("BREAK", "<break>")
            
            pd_df_list[i][1].loc[j] = sent

        pd_df_list[i] = pd_df_list[i].loc[0:len(pd_df_list[i][1])]

# group into one pandas dataframe
for i in range(0, len(pd_df_list)):
    if i == 0:
        pd_df = pd_df_list[i]
    else:
        pd_df = pd.concat([pd_df, pd_df_list[i]])

pd_df.columns = ['handle', 'post']

pd_df['target'] = pd_df['handle']
pd_df['target'][pd_df['handle']=='Kim Kardashian'] = 0
pd_df['target'][pd_df['handle']=='Kendall Jenner'] = 1
pd_df['target'][pd_df['handle']=='KylieJenner'] = 2

pd_df[['post', 'target', 'handle']].to_csv(write_data_dir+'/facebook_celeb_attribution.csv')
pd_df[['post', 'target', 'handle']].sample(5)



 ['acl22-data/raw-data/raw-data-authorship-attribution/KimKardashian-raw-scraped-facebook.csv', 'acl22-data/raw-data/raw-data-authorship-attribution/KendallJenner-raw-scraped-facebook.csv', 'acl22-data/raw-data/raw-data-authorship-attribution/KylieJenner-raw-scraped-facebook.csv'] 




                  0                                                  1
2   Kim Kardashian                    That’s what friends are for!!!!
14  Kim Kardashian  Our Star Search audition! Clearly we didn’t ge... 




                  0                                                  1
10  Kendall Jenner                  i thrive at this time of the year
16  Kendall Jenner  A lil Moon Oral Care to brighten your day 🖤 #m... 




               0                                                  1
11  KylieJenner  KYLIE X NIGHTMARE ON ELM STREET collection lau...
22  KylieJenner  KYLIE BABY vegan and clean baby care ☁️ 9.28 K... 




Unnamed: 0,post,target,handle
4,thank you for all the birthday wishes,1,Kendall Jenner
21,as a lover of fashion and having been incredib...,1,Kendall Jenner
19,They can steal your recie but the sauce wont t...,0,Kim Kardashian
3,in full mommy mode this halloween i hoe everyo...,2,KylieJenner
13,Thank you Adam Luck Kelly Doyle and Larry Morr...,0,Kim Kardashian


In [6]:
"""
4. authorship attribution merging IG and Twitter
"""

insta_final = pd.read_csv('acl22-data/intermediate-data/cleaned-data-authorship-attribution/'+experiment_name+'/instagram_celeb_attribution.csv')
insta_final.drop(['Unnamed: 0'], axis=1, inplace=True)

twitter_final = pd.read_csv('acl22-data/intermediate-data/cleaned-data-authorship-attribution/'+experiment_name+'/twitter_mixing_celeb_attribution.csv')
twitter_final.drop(['Unnamed: 0'], axis=1, inplace=True)

mixed_list = []
for i in range(0,len(insta_final)):
    mixed_list.append(pd.DataFrame(insta_final.loc[i]).T)
for i in range(0,len(twitter_final)):
    mixed_list.append(pd.DataFrame(twitter_final.loc[i]).T)
    

mixed = pd.concat(mixed_list)
mixed.sort_values('target', axis=0, ascending=True, inplace=True, kind='quicksort', na_position='last', ignore_index=False, key=None)
mixed = mixed.reset_index(drop=True)
mixed.to_csv(write_data_dir+'/mixed_celeb_attribution.csv')
mixed.sample(5)


Unnamed: 0,post,target,handle
205,NEW @<user> CAMPAIGN BY @<user> @<user>,1,KendallJenner
95,Words cant describe my love for you OMG I love...,0,KimKardashian
106,Halloween 2021 CowBot Costume by Manfred Thier...,0,KimKardashian
337,@<user>,2,KylieJenner
189,Pyro,1,KendallJenner


In [7]:
# authorship profiling internal vars

experiment_name = 'v0' #folder name for saving new outputs
chars_valid = ["a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p""q","r","s","t","u","v","w","x","y","z",
               "A","B","C","D","E","F","G","H","I","J","K","L","M","N","O","P","Q","R","S","T","U","V","W","X","Y","Z",
               "1","2","3","4","5","6","7","8","9","0","@","#","<",">"]

write_data_dir = 'acl22-data/intermediate-data/cleaned-data-authorship-profiling/'+experiment_name
if not os.path.isdir(write_data_dir):
    os.mkdir(write_data_dir)

In [8]:

"""
5. authorship profiling twitter
"""

file_list = glob.glob('acl22-data/raw-data/raw-data-authorship-profiling/*-twitter.csv')
print('\n\n', 'There are', len(file_list), 'twitter accounts for profiling. \n\n')
pd_df_list = []
for f in file_list:
    pd_df_list.append(pd.read_csv(f, header=None))

print('\n\n', pd_df_list[0].sample(2), '\n\n')


# main pipeline twitter data
for i in range(0, len(pd_df_list)):

    # if there are at least 62 lines:
    if len(pd_df_list[i][1])>=62*2:
        for j in range(0,61*2):
            string=str(pd_df_list[i][1].loc[j])

            # remove emojis 
            string = remove_emojis(string)

            string_list = string.split(' ')
            itr = 0
            for word in string_list:
                if len(word) > 0:

                    #replacing <user>
                    if word[0] == "@":
                        string_list[itr] = '@<user>'

                    #replacing <url>
                    elif word[0:5] == 'https':
                        string_list[itr] = '<url>'
            
                itr+=1

            string = " ".join(string_list)

            #replacing invalid chars
            for word in string_list:
                if len(word) > 0:
                    for ii in range(0, len(word)):
                        if word[ii] not in chars_valid:
                            string = string.replace(word[ii],'')
                            
            # tokenize
            words_tok = word_tokenize(string)
            sent = " ".join(words_tok)
            sent = sent.replace('@ < user >', '@<user>')
            sent = sent.replace('< url >', '<url>')

            # replacing multiple spaces
            sent = sent.replace('  ', ' ')
            sent = sent.replace('   ', ' ')
            sent = sent.replace('    ', ' ')
            sent = sent.replace('     ', ' ')
            sent = sent.replace('      ', ' ')

            # replace breaks
            sent = sent.replace("break", "<break>")
            sent = sent.replace("BREAK", "<break>")

            
            sent = sent.replace("\n", "<break>")
            pd_df_list[i][1].loc[j] = sent
        pd_df_list[i] = pd_df_list[i].loc[0:61*2] 

    # else there are less than 62 lines
    else:
        for j in range(0,len(pd_df_list[i][1])):
            string=str(pd_df_list[i][1].loc[j])

            # remove emojis 
            string = remove_emojis(string)

            string_list = string.split(' ')
            itr = 0
            for word in string_list:
                if len(word) > 0:

                    #replacing <user>
                    if word[0] == "@":
                        string_list[itr] = '@<user>'

                    #replacing <url>
                    elif word[0:5] == 'https':
                        string_list[itr] = '<url>'
            
                itr+=1

            string = " ".join(string_list)

            #replacing invalid chars
            for word in string_list:
                if len(word) > 0:
                    for ii in range(0, len(word)):
                        if word[ii] not in chars_valid:
                            string = string.replace(word[ii],'')
                            
            # tokenize
            words_tok = word_tokenize(string)
            sent = " ".join(words_tok)
            sent = sent.replace('@ < user >', '@<user>')
            sent = sent.replace('< url >', '<url>')

            # replacing multiple spaces
            sent = sent.replace('  ', ' ')
            sent = sent.replace('   ', ' ')
            sent = sent.replace('    ', ' ')
            sent = sent.replace('     ', ' ')
            sent = sent.replace('      ', ' ')

            # replace breaks
            sent = sent.replace("break", "<break>")
            sent = sent.replace("\n", "<break>")
            sent = sent.replace("BREAK", "<break>")
            
            pd_df_list[i][1].loc[j] = sent

        pd_df_list[i] = pd_df_list[i].loc[0:len(pd_df_list[i][1])]

# group and add <sep>
for i in range(0, len(pd_df_list)):
    if i == 0:
        pd_df = pd_df_list[i].groupby([0])[1].apply('<sep>. '.join).reset_index()
    else:
        test = pd_df_list[i].groupby([0])[1].apply('<sep>. '.join).reset_index()
        pd_df = pd.concat([pd_df, test])

pd_df.columns = ['Twitter_handle', 'tweet']

sex_df = pd.read_csv('acl22-data/raw-data/raw-data-authorship-profiling/gender_labels.csv')
sex_df.columns = ['Gender', 'Twitter_handle', 'Facebook_name', 'Insta_handle', 'Insta Id']
total_df = pd.merge(pd_df.assign(Twitter_handle=pd_df['Twitter_handle'].str.lower()), sex_df.assign(Twitter_handle=sex_df['Twitter_handle'].str.lower()), how='left', on='Twitter_handle')

total_df = total_df[['tweet', 'Gender']]
total_df.columns = ['post', 'target']
total_df.to_csv(write_data_dir+'/twitter_celeb_profiling.csv')
total_df.tail(5)


# pipeline for mixing twitter data (half the size)
for i in range(0, len(pd_df_list)):

    # if there are at least 62 lines:
    if len(pd_df_list[i][1])>=62:
        for j in range(0,61):
            string=str(pd_df_list[i][1].loc[j])

            # remove emojis 
            string = remove_emojis(string)

            string_list = string.split(' ')
            itr = 0
            for word in string_list:
                if len(word) > 0:

                    #replacing <user>
                    if word[0] == "@":
                        string_list[itr] = '@<user>'

                    #replacing <url>
                    elif word[0:5] == 'https':
                        string_list[itr] = '<url>'
            
                itr+=1

            string = " ".join(string_list)

            #replacing invalid chars
            for word in string_list:
                if len(word) > 0:
                    for ii in range(0, len(word)):
                        if word[ii] not in chars_valid:
                            string = string.replace(word[ii],'')
                            
            # tokenize
            words_tok = word_tokenize(string)
            sent = " ".join(words_tok)
            sent = sent.replace('@ < user >', '@<user>')
            sent = sent.replace('< url >', '<url>')

            # replacing multiple spaces
            sent = sent.replace('  ', ' ')
            sent = sent.replace('   ', ' ')
            sent = sent.replace('    ', ' ')
            sent = sent.replace('     ', ' ')
            sent = sent.replace('      ', ' ')

            # replace breaks
            sent = sent.replace("break", "<break>")
            sent = sent.replace("BREAK", "<break>")

            
            sent = sent.replace("\n", "<break>")
            pd_df_list[i][1].loc[j] = sent
        pd_df_list[i] = pd_df_list[i].loc[0:61] 

    # else there are less than 62 lines
    else:
        for j in range(0,len(pd_df_list[i][1])):
            string=str(pd_df_list[i][1].loc[j])

            # remove emojis 
            string = remove_emojis(string)

            string_list = string.split(' ')
            itr = 0
            for word in string_list:
                if len(word) > 0:

                    #replacing <user>
                    if word[0] == "@":
                        string_list[itr] = '@<user>'

                    #replacing <url>
                    elif word[0:5] == 'https':
                        string_list[itr] = '<url>'
            
                itr+=1

            string = " ".join(string_list)

            #replacing invalid chars
            for word in string_list:
                if len(word) > 0:
                    for ii in range(0, len(word)):
                        if word[ii] not in chars_valid:
                            string = string.replace(word[ii],'')
                            
            # tokenize
            words_tok = word_tokenize(string)
            sent = " ".join(words_tok)
            sent = sent.replace('@ < user >', '@<user>')
            sent = sent.replace('< url >', '<url>')

            # replacing multiple spaces
            sent = sent.replace('  ', ' ')
            sent = sent.replace('   ', ' ')
            sent = sent.replace('    ', ' ')
            sent = sent.replace('     ', ' ')
            sent = sent.replace('      ', ' ')

            # replace breaks
            sent = sent.replace("break", "<break>")
            sent = sent.replace("\n", "<break>")
            sent = sent.replace("BREAK", "<break>")
            
            pd_df_list[i][1].loc[j] = sent

        pd_df_list[i] = pd_df_list[i].loc[0:len(pd_df_list[i][1])]

# group and add <sep>
for i in range(0, len(pd_df_list)):
    if i == 0:
        pd_df = pd_df_list[i].groupby([0])[1].apply('<sep>. '.join).reset_index()
    else:
        test = pd_df_list[i].groupby([0])[1].apply('<sep>. '.join).reset_index()
        pd_df = pd.concat([pd_df, test])

pd_df.columns = ['Twitter_handle', 'tweet']

sex_df = pd.read_csv('acl22-data/raw-data/raw-data-authorship-profiling/gender_labels.csv')
sex_df.columns = ['Gender', 'Twitter_handle', 'Facebook_name', 'Insta_handle', 'Insta Id']
total_df = pd.merge(pd_df.assign(Twitter_handle=pd_df['Twitter_handle'].str.lower()), sex_df.assign(Twitter_handle=sex_df['Twitter_handle'].str.lower()), how='left', on='Twitter_handle')

total_df = total_df[['tweet', 'Gender']]
total_df.columns = ['post', 'target']
total_df.to_csv(write_data_dir+'/twitter_mixing_celeb_profiling.csv')
total_df.tail(5)



 There are 186 twitter accounts for profiling. 




               0                                                  1
45  ConanOBrien  I'm worried I offended my Uber driver because ...
0   ConanOBrien  Apologies for sneezing during this interview, ... 




Unnamed: 0,post,target
181,Boom It haened Im on @<user> AND IM LIVE FOR A...,M
182,Direk @<user> natouch ako sa ost mo Naakasaya ...,F
183,These are BEAUTIFULLY GORGEOUS man <url><sep>....,M
184,If I kee going to the farm there wont be any s...,M
185,@<user> Right<sep>. Its cool that Im crying # ...,F


In [9]:

"""
6. authorship profiling instagram
"""

file_list = glob.glob('acl22-data/raw-data/raw-data-authorship-profiling/*-instagram.csv')
len(file_list)
print('\n\n', 'There are', len(file_list), 'twitter accounts for profiling. \n\n')
pd_df_list = []
for f in file_list:
    pd_df_list.append(pd.read_csv(f, header=None))

print('\n\n', pd_df_list[0].sample(2), '\n\n')
print('\n\n', pd_df_list[1].sample(2), '\n\n')
print('\n\n', pd_df_list[2].sample(2), '\n\n')

# main pipeline
for i in range(0, len(pd_df_list)):

    # if there are at least 62 lines:
    if len(pd_df_list[i][1])>=62:
        for j in range(0,61):
            string=str(pd_df_list[i][1].loc[j])

            # remove emojis 
            string = remove_emojis(string)

            string_list = string.split(' ')
            itr = 0
            for word in string_list:
                if len(word) > 0:

                    #replacing <user>
                    if word[0] == "@":
                        string_list[itr] = '@<user>'

                    #replacing <url>
                    elif word[0:5] == 'https':
                        string_list[itr] = '<url>'
            
                itr+=1

            string = " ".join(string_list)

            #replacing invalid chars
            for word in string_list:
                if len(word) > 0:
                    for ii in range(0, len(word)):
                        if word[ii] not in chars_valid:
                            string = string.replace(word[ii],'')
                            
            # tokenize
            words_tok = word_tokenize(string)
            sent = " ".join(words_tok)
            sent = sent.replace('@ < user >', '@<user>')
            sent = sent.replace('< url >', '<url>')

            # replacing multiple spaces
            sent = sent.replace('  ', ' ')
            sent = sent.replace('   ', ' ')
            sent = sent.replace('    ', ' ')
            sent = sent.replace('     ', ' ')
            sent = sent.replace('      ', ' ')

            # replace breaks
            sent = sent.replace("break", "<break>")
            sent = sent.replace("BREAK", "<break>")

            
            sent = sent.replace("\n", "<break>")
            pd_df_list[i][1].loc[j] = sent
        pd_df_list[i] = pd_df_list[i].loc[0:61] 

    # else there are less than 62 lines
    else:
        for j in range(0,len(pd_df_list[i][1])):
            string=str(pd_df_list[i][1].loc[j])

            # remove emojis 
            string = remove_emojis(string)

            string_list = string.split(' ')
            itr = 0
            for word in string_list:
                if len(word) > 0:

                    #replacing <user>
                    if word[0] == "@":
                        string_list[itr] = '@<user>'

                    #replacing <url>
                    elif word[0:5] == 'https':
                        string_list[itr] = '<url>'
            
                itr+=1

            string = " ".join(string_list)

            #replacing invalid chars
            for word in string_list:
                if len(word) > 0:
                    for ii in range(0, len(word)):
                        if word[ii] not in chars_valid:
                            string = string.replace(word[ii],'')
                            
            # tokenize
            words_tok = word_tokenize(string)
            sent = " ".join(words_tok)
            sent = sent.replace('@ < user >', '@<user>')
            sent = sent.replace('< url >', '<url>')

            # replacing multiple spaces
            sent = sent.replace('  ', ' ')
            sent = sent.replace('   ', ' ')
            sent = sent.replace('    ', ' ')
            sent = sent.replace('     ', ' ')
            sent = sent.replace('      ', ' ')

            # replace breaks
            sent = sent.replace("break", "<break>")
            sent = sent.replace("\n", "<break>")
            sent = sent.replace("BREAK", "<break>")
            
            pd_df_list[i][1].loc[j] = sent

        pd_df_list[i] = pd_df_list[i].loc[0:len(pd_df_list[i][1])]

# group and add <sep>
for i in range(0, len(pd_df_list)):
    if i == 0:
        pd_df = pd_df_list[i].groupby([0])[1].apply('<sep>. '.join).reset_index()
    else:
        test = pd_df_list[i].groupby([0])[1].apply('<sep>. '.join).reset_index()
        pd_df = pd.concat([pd_df, test])

pd_df.columns = ['Insta_handle', 'IG caption']
pd_df = pd_df[pd_df['Insta_handle'] != '0.0']


sex_df = pd.read_csv('acl22-data/raw-data/raw-data-authorship-profiling/gender_labels.csv')
sex_df.columns = ['Gender', 'Twitter_handle', 'Facebook_name', 'Insta_handle', 'Insta Id']
total_df = pd.merge(pd_df.assign(Insta_handle=pd_df['Insta_handle'].str.lower()), sex_df.assign(Insta_handle=sex_df['Insta_handle'].str.lower()), how='left', on='Insta_handle')

total_df = total_df[['IG caption', 'Gender']]
total_df.columns = ['post', 'target']
total_df.to_csv(write_data_dir+'/instagram_celeb_profiling.csv')
total_df.tail(5)






 There are 186 twitter accounts for profiling. 




               0                                                  1
43  anupampkher  “शिक्षक और सड़क दोनों एक जैसे होते हैं..BREAKख...
41  anupampkher  Met my assistant #Dattu after almost 6 months,... 




             0                                                  1
10  liampayne                                   Changing seasons
4   liampayne  Loving the new @hugo_official work out gear. H... 




                   0                                                  1
30  d_degeaofficial                                                  💬
41  d_degeaofficial  One of the most special days of the year ❤ @ma... 




Unnamed: 0,post,target
181,Hay birthday to the brightest most beautiful s...,M
182,Good God I love my fans so much its crazy I th...,F
183,<break>Head to toe @<user> @<user> <break><bre...,F
184,Hay Saturday<sep>. Hay birthday to this amazin...,F
185,If you dont like what Im saying kee watching g...,F


In [10]:
"""
7. authorship profiling facebook
"""

#still need to do

file_list = glob.glob('acl22-data/raw-data/raw-data-authorship-profiling/*-facebook.csv')
len(file_list)
print('\n\n', 'There are', len(file_list), 'twitter accounts for profiling. \n\n')
pd_df_list = []
for f in file_list:
    pd_df_list.append(pd.read_csv(f, header=None))

print('\n\n', pd_df_list[0].sample(2), '\n\n')
print('\n\n', pd_df_list[1].sample(2), '\n\n')
print('\n\n', pd_df_list[2].sample(2), '\n\n')

# main pipeline
for i in range(0, len(pd_df_list)):

    # if there are at least 62 lines:
    if len(pd_df_list[i][1])>=62:
        for j in range(0,61):
            string=str(pd_df_list[i][1].loc[j])

            # remove emojis 
            string = remove_emojis(string)

            string_list = string.split(' ')
            itr = 0
            for word in string_list:
                if len(word) > 0:

                    #replacing <user>
                    if word[0] == "@":
                        string_list[itr] = '@<user>'

                    #replacing <url>
                    elif word[0:5] == 'https':
                        string_list[itr] = '<url>'
            
                itr+=1

            string = " ".join(string_list)

            #replacing invalid chars
            for word in string_list:
                if len(word) > 0:
                    for ii in range(0, len(word)):
                        if word[ii] not in chars_valid:
                            string = string.replace(word[ii],'')
                            
            # tokenize
            words_tok = word_tokenize(string)
            sent = " ".join(words_tok)
            sent = sent.replace('@ < user >', '@<user>')
            sent = sent.replace('< url >', '<url>')

            # replacing multiple spaces
            sent = sent.replace('  ', ' ')
            sent = sent.replace('   ', ' ')
            sent = sent.replace('    ', ' ')
            sent = sent.replace('     ', ' ')
            sent = sent.replace('      ', ' ')

            # replace breaks
            sent = sent.replace("break", "<break>")
            sent = sent.replace("BREAK", "<break>")

            
            sent = sent.replace("\n", "<break>")
            pd_df_list[i][1].loc[j] = sent
        pd_df_list[i] = pd_df_list[i].loc[0:61] 

    # else there are less than 62 lines
    else:
        for j in range(0,len(pd_df_list[i][1])):
            string=str(pd_df_list[i][1].loc[j])

            # remove emojis 
            string = remove_emojis(string)

            string_list = string.split(' ')
            itr = 0
            for word in string_list:
                if len(word) > 0:

                    #replacing <user>
                    if word[0] == "@":
                        string_list[itr] = '@<user>'

                    #replacing <url>
                    elif word[0:5] == 'https':
                        string_list[itr] = '<url>'
            
                itr+=1

            string = " ".join(string_list)

            #replacing invalid chars
            for word in string_list:
                if len(word) > 0:
                    for ii in range(0, len(word)):
                        if word[ii] not in chars_valid:
                            string = string.replace(word[ii],'')
                            
            # tokenize
            words_tok = word_tokenize(string)
            sent = " ".join(words_tok)
            sent = sent.replace('@ < user >', '@<user>')
            sent = sent.replace('< url >', '<url>')

            # replacing multiple spaces
            sent = sent.replace('  ', ' ')
            sent = sent.replace('   ', ' ')
            sent = sent.replace('    ', ' ')
            sent = sent.replace('     ', ' ')
            sent = sent.replace('      ', ' ')

            # replace breaks
            sent = sent.replace("break", "<break>")
            sent = sent.replace("\n", "<break>")
            sent = sent.replace("BREAK", "<break>")
            
            pd_df_list[i][1].loc[j] = sent

        pd_df_list[i] = pd_df_list[i].loc[0:len(pd_df_list[i][1])]

# group and add <sep>
for i in range(0, len(pd_df_list)):
    if i == 0:
        pd_df = pd_df_list[i].groupby([0])[1].apply('<sep>. '.join).reset_index()
    else:
        test = pd_df_list[i].groupby([0])[1].apply('<sep>. '.join).reset_index()
        pd_df = pd.concat([pd_df, test])

pd_df.columns = ['handle', 'post']
pd_df = pd_df[['post', 'handle']]
pd_df.to_csv(write_data_dir+'/facebook_celeb_profiling.csv')
pd_df.tail(5)





 There are 50 twitter accounts for profiling. 




                     0                                                  1
14  Leonardo DiCaprio  Protecting 30 percent of the planet in the mos...
7   Leonardo DiCaprio  "We will not be silent. We will never be silen... 




           0                                                  1
8  joejonas  Phoenix was special. Thank you all for a great...
9  joejonas  Down to the last 2 shows of the #RememberThisT... 




                  0                                                  1
1  Cara Delevingne  TUNE IN NOW for the STANDING IN SOLIDARITY pan...
8  Cara Delevingne  Just two hours until I go live with @MyEcoReso... 




Unnamed: 0,post,handle
0,This afternoon I gratefully received my second...,Ivanka Trump
0,Here are some classic British recies erfect fo...,Gordon Ramsay
0,Good morning hay Monday namaskar The BJP is a ...,Rajdeep Sardesai
0,Travelling with aa after long time<sep>. At an...,Saina Nehwal
0,Oh na na Myke Towers x me x Tainy out now <url...,Camilo Cabello


In [11]:
"""
8. authorship profiling merging IG and Twitter
"""

insta_final = pd.read_csv('acl22-data/intermediate-data/cleaned-data-authorship-profiling/'+experiment_name+'/instagram_celeb_profiling.csv')
insta_final.drop(['Unnamed: 0'], axis=1, inplace=True)

twitter_final = pd.read_csv('acl22-data/intermediate-data/cleaned-data-authorship-profiling/'+experiment_name+'/twitter_mixing_celeb_profiling.csv')
twitter_final.drop(['Unnamed: 0'], axis=1, inplace=True)

mixed_list = []
for i in range(0,len(insta_final)):
    mixed_list.append(pd.DataFrame(insta_final.loc[i]).T)
for i in range(0,len(twitter_final)):
    mixed_list.append(pd.DataFrame(twitter_final.loc[i]).T)

mixed = pd.concat(mixed_list)
mixed.sort_values('target', axis=0, ascending=True, inplace=True, kind='quicksort', na_position='last', ignore_index=False, key=None)
mixed = mixed.reset_index(drop=True)
mixed.to_csv(write_data_dir+'/mixed_celeb_profiling.csv')
mixed.sample(5)

Unnamed: 0,post,target
307,Mi es tu casa<sep>. Puerto Rico acaba de ganar...,M
158,If we ut the # Gosel at the centre and bear wi...,M
90,So excited that our book suorting @<user> is n...,F
65,<url><sep>. Back on set # RamSetu <url><sep>. ...,F
351,Circles January 17<sep>. New York Mag Link in ...,M
