In [1]:
import pandas as pd
import random
import re

In [2]:
def preprocess(text):
    # Remove '<user>'
    text = re.sub(r'<user>', '', text)
    # Remove '<url>'
    text = re.sub(r'<url>', '', text)
    # remove numbers
    text = re.sub(r'\d+', '', text)
    # remove \n
    text = re.sub(r'\n', '', text)
    # remove beginning and ending spaces
    text = text.strip()
    return text

## Preprocessing for training data

In [45]:
pos_path = 'data/twitter-datasets/train_pos_full.txt'
neg_path = 'data/twitter-datasets/train_neg_full.txt'

with open(pos_path, 'r') as f:
    pos_tweets = f.readlines()
with open(neg_path, 'r') as f:
    neg_tweets = f.readlines()

# preprocess data
pos_tweets = [preprocess(tweet) for tweet in pos_tweets]
neg_tweets = [preprocess(tweet) for tweet in neg_tweets]


pos_labels = [1 for _ in range(len(pos_tweets))]
neg_labels = [0 for _ in range(len(neg_tweets))]
labels = pos_labels + neg_labels
tweets = pos_tweets + neg_tweets
lengths = [len(tweet) for tweet in tweets]
max_length =  min(max(lengths), 200)
lengths = [length/max_length for length in lengths]

# create dataframe
df = pd.DataFrame({'tweet': tweets, 'label': labels, 'length': lengths})
# shuffle dataframe
df = df.sample(frac=1).reset_index(drop=True)
df['fold'] = random.choices(range(1,6), k=len(df))

In [46]:
df.head()

Unnamed: 0,tweet,label,length,fold
0,"i know but it's hard right now , i'm stressed ...",0,0.265,3
1,why does it do that ?,0,0.105,3
2,why did i have to wake up ?,0,0.135,3
3,true that ! puro vega ila krn ) ) ),1,0.18,2
4,you my boo thang,1,0.08,1


In [47]:
df['fold'].value_counts()

fold
4    500435
5    500382
3    499828
1    499769
2    499586
Name: count, dtype: int64

In [48]:
df['label'].value_counts()

label
0    1250000
1    1250000
Name: count, dtype: int64

In [49]:
df.to_csv('data/train.tsv', sep='\t', index=False, header=True)

In [50]:
# create a sample of the data for testing
df_sample = df.sample(frac=0.01).reset_index(drop=True)
df_sample.to_csv('data/train_sample.tsv', sep='\t', index=False, header=True)

## Preprocessing for test data

In [3]:
test_path = 'data/twitter-datasets/test_data.txt'
with open(test_path, 'r') as f:
    test_tweets = f.readlines()
test_tweets = [preprocess(tweet) for tweet in test_tweets]
test_lengths = [len(tweet) for tweet in test_tweets]
test_max_length =  min(max(test_lengths), 200)
test_lengths = [length/test_max_length for length in test_lengths]
test_df = pd.DataFrame({'tweet': test_tweets, 'length': test_lengths})
display(test_df.head())


test_df.to_csv('data/test.tsv', sep='\t', index=False, header=True)

Unnamed: 0,tweet,length
0,",sea doo pro sea scooter ( sports with the por...",0.768707
1,", shucks well i work all week so now i can't c...",0.789116
2,",i cant stay away from bug thats my baby",0.272109
3,", no ma'am ! ! ! lol im perfectly fine and not...",0.47619
4,",whenever i fall asleep watching the tv , i al...",0.503401
