In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import os



In [2]:
# Load the scraped tweet export for one account so its columns can be inspected.
df = pd.read_csv('data/elonmusk.csv')
# Bare last expression: the notebook renders the frame (long frames are shown
# truncated to their first and last rows).
df

Unnamed: 0,UserScreenName,UserName,Timestamp,Text,Embedded_text,Emojis,Comments,Likes,Retweets,Image link,Tweet URL
0,Elon Musk,@elonmusk,2010-06-04T18:31:57.000Z,"Elon Musk\n@elonmusk\n·\nJun 4, 2010","Please ignore prior tweets, as that was someon...",,796,593,5.2K,[],https://twitter.com/elonmusk/status/15434727182
1,Elon Musk,@elonmusk,2011-12-01T10:29:04.000Z,"Elon Musk\n@elonmusk\n·\nDec 1, 2011",I made the volume on the Model S http://ow.ly/...,,30,16,78,[],https://twitter.com/elonmusk/status/1421884581...
2,Elon Musk,@elonmusk,2011-12-01T09:55:11.000Z,"Elon Musk\n@elonmusk\n·\nDec 1, 2011",Went to Iceland on Sat to ride bumper cars on ...,,30,22,191,['https://pbs.twimg.com/media/Afkf5DMCAAEJGFr?...,https://twitter.com/elonmusk/status/1421799282...
3,Elon Musk,@elonmusk,2011-12-03T08:22:07.000Z,"Elon Musk\n@elonmusk\n·\nDec 3, 2011",That was a total non sequitur btw\n25\n13\n50,,25,13,50,[],https://twitter.com/elonmusk/status/1428812840...
4,Elon Musk,@elonmusk,2011-12-03T08:20:28.000Z,"Elon Musk\n@elonmusk\n·\nDec 3, 2011","Great Voltaire quote, arguably better than Twa...",,23,29,44,[],https://twitter.com/elonmusk/status/1428808713...
...,...,...,...,...,...,...,...,...,...,...,...
3188,Elon Musk,@elonmusk,2021-10-08T09:04:45.000Z,Elon Musk\n@elonmusk\n·\nOct 8,Please consider joining Tesla AI software or h...,,7.5K,5.8K,71.9K,['https://pbs.twimg.com/media/FBKmLrvWYA8q4_J?...,https://twitter.com/elonmusk/status/1446401165...
3189,Elon Musk,@elonmusk,2021-10-09T08:16:56.000Z,Elon Musk\n@elonmusk\n·\nOct 9,Giga Berlin-Brandenburg Spaß-Party heute!\n5.3...,,5.3K,4.7K,85.3K,[],https://twitter.com/elonmusk/status/1446751520...
3190,Elon Musk,@elonmusk,2021-10-12T01:17:01.000Z,Elon Musk\n@elonmusk\n·\nOct 11,Starship Launch Site\n0:07\n3.3M views\n9.8K\n...,,9.8K,9.8K,105.9K,[],https://twitter.com/elonmusk/status/1447733010...
3191,Elon Musk,@elonmusk,2021-10-12T01:17:01.000Z,Elon Musk\n@elonmusk\n·\nOct 11,Starship Launch Site\n0:05\n3.3M views\n9.8K\n...,,9.8K,9.8K,105.9K,[],https://twitter.com/elonmusk/status/1447733010...


In [154]:
def preprocess(text):
    """Strip scraped Twitter UI residue from a tweet's embedded text.

    The scraped ``Embedded_text`` field mixes the tweet body with page
    furniture (engagement counts, video view counts and durations, quoted
    tweets, poll results, link-preview cards, "Show this thread" labels).
    Each kind of residue is removed with a regular expression; the order of
    the substitutions matters because later patterns rely on earlier ones
    having already removed their part of the text.

    Args:
        text: Raw embedded text of one tweet. Non-string values (for example
            the NaN pandas uses for a missing cell) are treated as empty.

    Returns:
        The cleaned text with surrounding whitespace stripped; ``""`` when the
        input is not a string.
    """
    # Missing cells arrive from pandas as NaN (a float), which re.sub rejects.
    if not isinstance(text, str):
        return ""

    # A displayed counter such as "796", "5.2K" or "3.3M".
    count = r"(\d+(\.\d+)?(K|M)?)"

    # Applied strictly in this order.
    cleanup_patterns = (
        # Quoted tweet header: "<name>\n@<handle>\n · <date...>".
        r"\n(.*)\n@\w*\n · .*",
        # Placeholder shown for a deleted quoted tweet (dot is literal).
        r"This Tweet is unavailable\.",
        # Explicit URLs.
        r"https?://\S+|www\.\S+",
        # Two or three counters on consecutive lines after the body
        # (in the sample data these mirror the Comments/Likes/Retweets columns).
        rf"\n{count}\n{count}(\n{count})?",
        # Three counters on consecutive lines with no preceding newline,
        # e.g. when the text consists of nothing but counters.
        rf"{count}\n{count}\n{count}",
        # "<count> views" and the rest of that line.
        rf"{count} views.*",
        # Video duration such as "0:07", "12:34" or "1:02:03". The whole
        # timestamp is consumed so no stray leading digit is left behind.
        r"\d+(?::\d+)+",
        r"Show this thread",
        # A poll option line followed by its percentage line.
        r"\n.*\n(\d+(\.\d+)?)%.*",
        r"\d{1,3}(,\d{3})*(\.\d+)? votes\n·\nFinal results",
        # Link-preview card: "<domain>.com|org|gov" plus the two lines after
        # it. The dot is escaped so ordinary words ending in "com"/"org"/"gov"
        # are not mistaken for a domain.
        r"\w+\.(com|org|gov)\n.*\n.*",
    )
    for pattern in cleanup_patterns:
        text = re.sub(pattern, "", text)

    # Drop every remaining slash (note: this also affects slashes inside
    # ordinary text such as "and/or").
    text = text.replace("/", "")
    return text.strip()

In [155]:
def write_text_file(df, dest_path: str, file_name: str):
    """Write the ``text`` column of ``df`` to a one-tweet-per-line text file.

    Every non-empty tweet has its whitespace characters (including newlines)
    replaced by single spaces and is written as ``<BOS> tweet <EOS>`` followed
    by a newline. Empty or non-string entries are skipped.

    Args:
        df: DataFrame with a ``text`` column of cleaned tweet strings.
        dest_path: Directory to write into; created (with any missing
            parents) if it does not exist.
        file_name: Name of the file created inside ``dest_path``. An existing
            file with that name is overwritten.
    """
    # makedirs also creates missing parents and is safe if the directory exists.
    os.makedirs(dest_path, exist_ok=True)
    file_path = f"{dest_path}/{file_name}"

    bos_token = '<BOS>'
    eos_token = '<EOS>'

    lines = []
    for tweet in df['text'].tolist():
        # Skip empty strings and missing values (NaN is a float, not a str).
        if not isinstance(tweet, str) or not tweet:
            continue
        # Collapse each whitespace character so a tweet occupies one line.
        tweet = re.sub(r"\s", " ", tweet)
        lines.append(bos_token + ' ' + tweet + ' ' + eos_token + '\n')

    # Explicit UTF-8: tweets contain non-ASCII text and emoji, and the
    # platform default encoding may not be able to represent them.
    # The context manager guarantees the handle is flushed and closed.
    with open(file_path, 'w', encoding='utf-8') as f:
        f.write(''.join(lines))

def build_dataset(username: str, src='data', dest='data'):
    """Build train/valid/test text files from one account's scraped tweets.

    Reads ``{src}/{username}.csv``, drops rows whose ``Embedded_text`` is
    missing or contains the markers ``"RT"`` or ``"Replying to"``, cleans the
    remaining text with ``preprocess`` and writes ``train.txt``,
    ``valid.txt`` and ``test.txt`` under ``{dest}/{username}``.

    The split is 90% / 10% (train+valid vs. test) followed by 7/9 vs. 2/9 of
    the first part, i.e. roughly 70% train, 20% valid, 10% test. Both splits
    use ``random_state=1`` so repeated runs give the same partition.

    Args:
        username: Account name; used for the CSV file name and output folder.
        src: Directory containing ``{username}.csv``.
        dest: Directory under which the ``{username}`` folder is written.
    """
    df = pd.read_csv(f'{src}/{username}.csv')

    embedded = df["Embedded_text"]
    # na=False keeps the masks strictly boolean: with missing values a plain
    # str.contains yields NaN entries and the `~` negation fails.
    # NOTE(review): "RT" is matched case-sensitively anywhere in the text, so
    # any tweet containing that uppercase substring is dropped as well.
    is_retweet = embedded.str.contains("RT", regex=False, na=False)
    is_reply = embedded.str.contains("Replying to", regex=False, na=False)
    keep = embedded.notna() & ~is_retweet & ~is_reply

    # .copy() gives an independent frame, so adding the 'text' column below
    # does not write into a slice of the original (SettingWithCopyWarning).
    df = df[keep].copy()
    df['text'] = df['Embedded_text'].apply(preprocess)

    train_test_ratio = 0.9
    train_valid_ratio = 7 / 9
    df_full_train, df_test = train_test_split(df, train_size=train_test_ratio, random_state=1)
    df_train, df_valid = train_test_split(df_full_train, train_size=train_valid_ratio, random_state=1)

    dest_path = f'{dest}/{username}'
    write_text_file(df_train, dest_path, 'train.txt')
    write_text_file(df_valid, dest_path, 'valid.txt')
    write_text_file(df_test, dest_path, 'test.txt')

In [157]:
# Write the processed, split data to data/<username> for every account,
# with each tweet wrapped in the special start and end tokens.
for account in ('elonmusk', 'BarackObama', 'karpathy', 'cnn', 'wsj'):
    build_dataset(account)