In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from google.colab import drive
import os
import argparse
import logging
import os
import sys
import glob
import codecs
import re
import itertools

# Load data
# Mount Google Drive and read the raw tweet CSV from it. The data directory is
# derived from the mount point, so it no longer depends on the notebook's
# current working directory (the previous '../content/...' form only resolved
# when the cwd was a direct child of '/').
MOUNT_POINT = '/content/gdrive'
drive.mount(MOUNT_POINT, force_remount=True)
data_dir = os.path.join(MOUNT_POINT, 'MyDrive', '544', 'data')
data_path = os.path.join(data_dir, "training.1600000.processed.noemoticon.csv")
# Column names are supplied via `names`, so the first line of the file is read
# as a data row rather than as a header.
data = pd.read_csv(
    data_path,
    encoding="ISO-8859-1",
    names=["target", "ids", "date", "flag", "user", "text"],
)

Mounted at /content/gdrive


In [2]:
# Functions

# Source:
# https://github.com/theocjr/social-media-forensics/blob/master/microblog_authorship_attribution/dataset_pre_processing/tagging_irrelevant_data.py

def tag_url(text):
    """Replace URL-like substrings of ``text`` with the marker ``URL``.

    A match starts with either a 3-9 letter scheme followed by ':' (and an
    optional '//'), a leading 'www.', or a 'user@' prefix, followed by a host
    made of letters, digits, dots and hyphens. An optional path, query string
    and '#fragment' directly after the host are consumed as part of the match.

    Parameters
    ----------
    text : str
        Text to scan.

    Returns
    -------
    str
        ``text`` with every match replaced by ``URL``.
    """
    # The pattern must be a raw string. In a normal string literal Python
    # collapses the '\\' near the end of the pattern into a single backslash,
    # which turns the final character class into "backslash or the letter w"
    # instead of "backslash or any word character" and leaves URL fragments
    # such as '#top' behind. The remaining backslash sequences are also
    # invalid string escapes that newer Python versions warn about.
    url_pattern = r'((([A-Za-z]{3,9}:(?:\/\/)?)(?:[\-;:&=\+\$,\w]+@)?[A-Za-z0-9\.\-]+|(?:www\.|[\-;:&=\+\$,\w]+@)[A-Za-z0-9\.\-]+)((?:\/[\+~%\/\.\w\-_]*)?\??(?:[\-\+=&;%@\.\w_]*)#?(?:[\.\!\/\\\w]*))?)'
    return re.sub(url_pattern, u'URL', text)

def tag_userref(text):
    """Replace every '@' followed by non-whitespace characters with ``REF``.

    Parameters
    ----------
    text : str
        Text to scan.

    Returns
    -------
    str
        ``text`` with each '@...' token (up to the next whitespace) replaced
        by ``REF``.
    """
    # Raw string: '\s' is not a valid escape in a normal string literal and
    # triggers a warning on newer Python versions. The pattern is unchanged.
    return re.sub(r'@[^\s]+', u'REF', text)

def tag_hashtag(text):
    """Return ``text`` with each '#' plus the ASCII letters after it replaced by ``TAG``.

    Only the run of letters directly following '#' is part of the match, so
    digits or other characters after that run are left in place.
    """
    hashtag_pattern = re.compile('#[a-zA-Z]+')
    tagged = hashtag_pattern.sub(u'TAG', text)
    return tagged

def tag_date(text):
    """Return ``text`` with date-like digit groups replaced by ``DAT``.

    A match is one or two digits, a '-' or '/', one or two digits, optionally
    followed by another '-' or '/' and exactly four digits.
    """
    date_pattern = re.compile('[0-9]?[0-9][-/][0-9]?[0-9]([-/][0-9][0-9][0-9][0-9])?')
    tagged = date_pattern.sub(u'DAT', text)
    return tagged

def tag_time(text):
    """Return ``text`` with clock-like digit groups replaced by ``TIM``.

    A match is one or two digits, ':', one or two digits, optionally followed
    by another ':' and one or two digits.
    """
    time_pattern = re.compile('[0-9]?[0-9]:[0-9]?[0-9](:[0-9]?[0-9])?')
    tagged = time_pattern.sub(u'TIM', text)
    return tagged

def tag_number(text):
    """Return ``text`` with every run of ASCII digits replaced by ``NUM``."""
    digits_pattern = re.compile('[0-9]+')
    tagged = digits_pattern.sub(u'NUM', text)
    return tagged

In [3]:
# Remove retweets
def is_retweet(x):
    """Return True when the tweet text ``x`` carries a retweet marker.

    A tweet counts as a retweet when it begins with 'Retweeting @' or
    'Retweet from', or when it contains '[Retweet]' anywhere.
    """
    leading_markers = ('Retweeting @', "Retweet from")
    if x.startswith(leading_markers):
        return True
    return "[Retweet]" in x

# Flag retweets, report how many were found, then keep only the remaining rows.
data_is_retweet = data["text"].map(is_retweet)
print(f"Retweets: {data_is_retweet.sum()} / {len(data)}")
data = data[~data_is_retweet]
print(f"Number: {len(data)}")

Retweets: 40 / 1600000
Number: 1599960


In [4]:
# Remove tweets with few words
min_words = 4


def _count_long_words(text):
    """Count the space-separated tokens of ``text`` longer than two characters."""
    return sum(1 for token in text.split(' ') if len(token) > 2)


# assign() builds a new frame instead of inserting a column into the filtered
# slice produced by the retweet removal, which avoids pandas'
# SettingWithCopyWarning. The trailing copy() makes the filtered result an
# independent frame so later column writes on `data` are warning-free as well.
data = data.assign(n_words=data["text"].apply(_count_long_words))
data = data.loc[data["n_words"] >= min_words].copy()
print(f"Number: {len(data)}")

Number: 1453257


In [5]:
# Add tags

def add_tags(s):
    """Replace URLs, user references, hashtags, dates, times and numbers in ``s``.

    The taggers are applied in a fixed order, each one working on the output of
    the previous one.

    Parameters
    ----------
    s : str
        Raw tweet text.

    Returns
    -------
    str
        The text with the matched spans replaced by the markers URL, REF, TAG,
        DAT, TIM and NUM.
    """
    s = tag_url(s)
    s = tag_userref(s)
    s = tag_hashtag(s)
    s = tag_date(s)
    # tag_time must run before tag_number: tag_number rewrites every digit run
    # to NUM, after which the digit-based time pattern can no longer match and
    # a time such as "10:30" would come out as "NUM:NUM" instead of "TIM".
    s = tag_time(s)
    s = tag_number(s)
    return s

# assign() returns a new frame, so the tagged text is never written into a frame
# that pandas may still track as a slice of an earlier one (which would raise
# SettingWithCopyWarning). The column order is unchanged.
data = data.assign(text=data["text"].apply(add_tags))

In [6]:
# Spot-check the tagged tweets of a single user.
data[data["user"] == 'CarrieStephens']

Unnamed: 0,target,ids,date,flag,user,text,n_words
625291,0,2230195518,Thu Jun 18 16:46:20 PDT 2009,NO_QUERY,CarrieStephens,REF aURL girl ear infections are so painful,7
652022,0,2238080907,Fri Jun 19 06:47:40 PDT 2009,NO_QUERY,CarrieStephens,REF That is a bummer I guess i should be m...,16


In [7]:
# Number of rows currently in the frame.
data.shape[0]

1453257

In [8]:
# Inspect all tweets posted by the 'Dogbook' account.
data[data["user"] == 'Dogbook']

Unnamed: 0,target,ids,date,flag,user,text,n_words
95,0,1467836448,Mon Apr 06 22:26:27 PDT 2009,NO_QUERY,Dogbook,Strider is a sick little puppy URL,5
106,0,1467837762,Mon Apr 06 22:26:48 PDT 2009,NO_QUERY,Dogbook,Emily will be glad when Mommy is done training...,14
1082,0,1468076498,Mon Apr 06 23:35:06 PDT 2009,NO_QUERY,Dogbook,Jade is looking for a new home... URL,6
5072,0,1468984010,Tue Apr 07 04:42:33 PDT 2009,NO_QUERY,Dogbook,Ripley is missing Sarah URL,4
5518,0,1469107358,Tue Apr 07 05:13:52 PDT 2009,NO_QUERY,Dogbook,Oscar is getting ready to be neutered URL,5
...,...,...,...,...,...,...,...
1297646,4,2004622725,Tue Jun 02 08:29:20 PDT 2009,NO_QUERY,Dogbook,Athena mu is home alone and loving it!!! URL,7
1298931,4,2005512870,Tue Jun 02 09:48:13 PDT 2009,NO_QUERY,Dogbook,Max is eating his tea URL,5
1331127,4,2015952164,Wed Jun 03 05:52:24 PDT 2009,NO_QUERY,Dogbook,Buddy is off and running with Maggie URL,7
1333608,4,2016477081,Wed Jun 03 06:50:45 PDT 2009,NO_QUERY,Dogbook,Barney is having a lazy morning with Tania! URL,7


In [9]:
# Removing users who have very specific styles
EXCLUDED_USERS = ["KevinEdwardsJr", "lost_dog", "webwoke", "what_bugs_u", "wowlew"]
# Keep the rows whose user is not in the list. A boolean mask avoids building an
# intermediate frame just to collect index labels, and rebinding `data` instead
# of dropping with inplace=True keeps the step a plain expression.
data = data.loc[~data["user"].isin(EXCLUDED_USERS)]

In [10]:
# Sanity check: 'lost_dog' is one of the removed users, so this should be empty.
data[data["user"] == 'lost_dog']

Unnamed: 0,target,ids,date,flag,user,text,n_words


In [11]:
# Tweet counts of the 50 most active users, most active first.
print(data["user"].value_counts().head(50))

VioletsCRUK        272
SallytheShizzle    261
mcraddictal        247
tsarnick           245
SongoftheOss       214
shanajaca          210
DarkPiano          208
keza34             207
nuttychris         202
thisgoeshere       200
StDAY              197
ramdomthoughts     194
Jayme1988          191
felicityfuller     190
Dogbook            188
Spidersamm         187
Karen230683        184
Dutchrudder        177
JessMcFlyxxx       177
enamoredsoul       174
Quimo              173
torilovesbradie    172
MTVnHollyWEST23    171
twebbstack         169
Broooooke_         167
linnetwoods        164
karinb_za          162
TraceyHewins       158
JBnVFCLover786     157
insearchofnkotb    156
Djalfy             156
cookiemonster82    155
maynaseric         151
mrs_mcsupergirl    148
MiDesfileNegro     147
bigenya            146
patriciaco         146
hollyalyxfinch     143
michxxblc          143
jaybranch          142
DonniesGirl69      142
lesley007          142
vacant_heart       142
whitsundays

In [12]:
# Number of rows left after removing the excluded users.
data.shape[0]

1451758

In [13]:
# Allow up to 200 characters per cell when pandas renders text columns.
pd.set_option("display.max_colwidth", 200)
# A fixed random_state makes the sampled rows the same on every run, so the
# cell output is reproducible after a kernel restart.
data["text"].sample(20, random_state=0)

591074                                                      Dammit! I mis-delete my past update and I forgot what it is about! 
238169                       REF LOL! but there so good, i wish i went on thursday  to see them, just so i can see them again! 
1074679                                                                  REF Wut?! People are so... weird. You are okay to me. 
1417225                                           REF LOL. I can't believe they found that many people who can dance like that 
16313      REF Erm can we please not mention Cadbury's choc...... I moved from England to California and really miss Cadbury's 
213284                                                                         phone stolen and no more road trip next weekend 
810259                                                                                              REF Well - I do try!  haha!
262646                                                                                    REF URL - Sorr

In [14]:
pip install googletrans==4.0.0-rc1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting googletrans==4.0.0-rc1
  Downloading googletrans-4.0.0rc1.tar.gz (20 kB)
Collecting httpx==0.13.3
  Downloading httpx-0.13.3-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 4.3 MB/s 
[?25hCollecting sniffio
  Downloading sniffio-1.3.0-py3-none-any.whl (10 kB)
Collecting httpcore==0.9.*
  Downloading httpcore-0.9.1-py3-none-any.whl (42 kB)
[K     |████████████████████████████████| 42 kB 1.5 MB/s 
Collecting rfc3986<2,>=1.3
  Downloading rfc3986-1.5.0-py2.py3-none-any.whl (31 kB)
Collecting hstspreload
  Downloading hstspreload-2022.11.1-py3-none-any.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 48.2 MB/s 
Collecting h2==3.*
  Downloading h2-3.2.0-py2.py3-none-any.whl (65 kB)
[K     |████████████████████████████████| 65 kB 4.0 MB/s 
[?25hCollecting h11<0.10,>=0.8
  Downloading h11-0.9.0-py2.py3-none-any.whl (53 kB)
[K     |█████████

In [15]:
# Disabled experiment: the snippet below is wrapped in a string literal, so none
# of it executes. Because the string is the cell's last expression, its repr is
# echoed as the cell output. The snippet detects the language of a sample of
# tweets with googletrans and selects the ones not detected as English.
"""from tqdm import tqdm

from googletrans import Translator
translator = Translator()
detection = translator.detect("Ohhh que journÃ©e Grise")
detection.lang

data_sample = data.sample(1000)
tweets = list(data_sample.text.values)
langs = [translator.detect(t).lang for t in tqdm(tweets)]
data_noeng = data_sample.loc[(data_sample.n_words > 10) & (pd.Series(langs) != 'en')]"""

'from tqdm import tqdm\n\nfrom googletrans import Translator\ntranslator = Translator()\ndetection = translator.detect("Ohhh que journÃ©e Grise")\ndetection.lang\n\ndata_sample = data.sample(1000)\ntweets = list(data_sample.text.values)\nlangs = [translator.detect(t).lang for t in tqdm(tweets)]\ndata_noeng = data_sample.loc[(data_sample.n_words > 10) & (pd.Series(langs) != \'en\')]'

In [16]:
# Write the preprocessed frame into the data directory, without the index column.
output_path = os.path.join(data_dir, "preprocessed_data.csv")
data.to_csv(path_or_buf=output_path, index=False)

In [17]:
# Re-check after saving: 'lost_dog' was removed earlier, so this should be empty.
data[data["user"] == 'lost_dog']

Unnamed: 0,target,ids,date,flag,user,text,n_words


In [18]:
# Tweet counts of the 50 most active users in the saved frame, most active first.
print(data["user"].value_counts().head(50))

VioletsCRUK        272
SallytheShizzle    261
mcraddictal        247
tsarnick           245
SongoftheOss       214
shanajaca          210
DarkPiano          208
keza34             207
nuttychris         202
thisgoeshere       200
StDAY              197
ramdomthoughts     194
Jayme1988          191
felicityfuller     190
Dogbook            188
Spidersamm         187
Karen230683        184
Dutchrudder        177
JessMcFlyxxx       177
enamoredsoul       174
Quimo              173
torilovesbradie    172
MTVnHollyWEST23    171
twebbstack         169
Broooooke_         167
linnetwoods        164
karinb_za          162
TraceyHewins       158
JBnVFCLover786     157
insearchofnkotb    156
Djalfy             156
cookiemonster82    155
maynaseric         151
mrs_mcsupergirl    148
MiDesfileNegro     147
bigenya            146
patriciaco         146
hollyalyxfinch     143
michxxblc          143
jaybranch          142
DonniesGirl69      142
lesley007          142
vacant_heart       142
whitsundays