# Import modules

In [1]:
import os
import re

import pandas as pd
from nltk.tokenize import TweetTokenizer
from urllib.parse import urlparse

# Data import

In [2]:
# Directory holding the raw CSV splits (relative path, one level up).
DATA_PATH = os.path.join('..', 'data', 'original')

# Read the three splits in one pass; the generator yields the frames in the
# same order as the names on the left-hand side.
train, validation, test = (
    pd.read_csv(os.path.join(DATA_PATH, csv_name))
    for csv_name in ('train.csv', 'validation.csv', 'test.csv')
)


def remove_rt(tweet):
    """Replace a leading retweet marker with the ``<Retweet>`` tag.

    Only an ``RT`` at the very start of the string that is followed by at
    least one whitespace character is affected. The whitespace is consumed
    together with the marker, so the tag ends up directly adjacent to the
    text that followed it.
    """
    return re.sub(r'^RT[\s]+', '<Retweet>', tweet)


def remove_hyperlinks(tweet):
    """Return ``tweet`` with every http/https URL swapped for ``<URL>``.

    A URL is taken to run from the scheme up to the next whitespace
    character, so trailing punctuation glued to the link is replaced too.
    """
    url_pattern = re.compile(r'https?://[^\s\n\r]+')
    return url_pattern.sub('<URL>', tweet)


def remove_mention(tweet):
    """Return ``tweet`` with every ``@`` + word-characters run replaced by ``<MENTION>``.

    The match is purely textual: any ``@`` immediately followed by one or
    more ``\\w`` characters is replaced, wherever it occurs in the string.
    """
    return re.sub(pattern=r'@\w+', repl='<MENTION>', string=tweet)


def tokenize(tweet):
    """Split ``tweet`` into tokens with NLTK's ``TweetTokenizer``.

    The tokenizer is created with ``strip_handles=True``. Returns whatever
    ``TweetTokenizer.tokenize`` produces for the input string.
    """
    return TweetTokenizer(strip_handles=True).tokenize(tweet)


def replace_hyperlinks(tweet):
    """Replace every http/https URL in ``tweet`` with its own network location.

    Each URL (scheme up to the next whitespace) is parsed independently with
    ``urlparse`` and substituted by its ``netloc`` component, e.g.
    ``https://t.co/abc`` becomes ``t.co``. Text without URLs is returned
    unchanged.

    Args:
        tweet: The raw tweet text.

    Returns:
        The tweet with each URL reduced to its netloc.
    """
    def _netloc_of(match):
        # Parse the URL that was actually matched, so a tweet containing
        # several links keeps a distinct host for each of them.
        return urlparse(match.group(0)).netloc

    # A callable replacement is inserted literally; a plain string would be
    # treated as a template in which backslashes are escape sequences.
    return re.sub(r'https?://[^\s\n\r]+', _netloc_of, tweet)


def preprocess_simple(tweet):
    """Normalise a tweet using placeholder tags and return it as one string.

    Runs ``remove_rt``, ``remove_hyperlinks`` and ``remove_mention`` in that
    order, tokenizes the result with ``tokenize`` and joins the tokens back
    together with single spaces.
    """
    for step in (remove_rt, remove_hyperlinks, remove_mention):
        tweet = step(tweet)
    return ' '.join(tokenize(tweet))

# Build a preprocessed counterpart of each split. ``assign`` returns a new
# frame with the ``text`` column replaced, so the raw frames stay untouched.
new_train = train.assign(text=train['text'].apply(preprocess_simple))
new_validation = validation.assign(text=validation['text'].apply(preprocess_simple))
new_test = test.assign(text=test['text'].apply(preprocess_simple))
new_train.head()

Unnamed: 0,screen_name,text,account.type,class_type
0,bot#9,YEA now that note GOOD,bot,others
1,human#17,Listen to This Charming Man by The Smiths <URL>,human,human
2,bot#23,wish i can i would be seeing other hoes on the...,bot,others
3,bot#1,The decade in the significantly easier schedul...,bot,others
4,bot#11,""" Theim class =\ "" alignnone size-full wp-imag...",bot,rnn


In [3]:
# Output directory for the splits produced with preprocess_simple.
PREPROCESSED_DATA_PATH = os.path.join("..", "data", "preprocessed_url_simple")
# makedirs(exist_ok=True) creates missing parent directories as well and
# avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs(PREPROCESSED_DATA_PATH, exist_ok=True)
new_train.to_csv(os.path.join(PREPROCESSED_DATA_PATH, 'train.csv'), index=False)
new_validation.to_csv(os.path.join(PREPROCESSED_DATA_PATH, 'validation.csv'), index=False)
# test.csv must hold the preprocessed test split (new_test), not the
# training frame.
new_test.to_csv(os.path.join(PREPROCESSED_DATA_PATH, "test.csv"), index=False)

In [4]:
def preprocess_complex(tweet):
    """Normalise a tweet while keeping URL host information.

    Rewrites URLs with ``replace_hyperlinks``, replaces mentions with
    ``remove_mention``, tokenizes the result with ``tokenize`` and joins the
    tokens back together with single spaces. ``remove_rt`` is not applied
    here.
    """
    without_mentions = remove_mention(replace_hyperlinks(tweet))
    tokens = tokenize(without_mentions)
    return ' '.join(tokens)

In [5]:
# Overwrite the text column of each preprocessed frame with the "complex"
# variant, always starting again from the raw text of the matching split.
for _raw, _processed in (
    (train, new_train),
    (validation, new_validation),
    (test, new_test),
):
    _processed['text'] = _raw['text'].apply(preprocess_complex)
new_train.head()

Unnamed: 0,screen_name,text,account.type,class_type
0,bot#9,YEA now that note GOOD,bot,others
1,human#17,Listen to This Charming Man by The Smiths t.co,human,human
2,bot#23,wish i can i would be seeing other hoes on the...,bot,others
3,bot#1,The decade in the significantly easier schedul...,bot,others
4,bot#11,""" Theim class =\ "" alignnone size-full wp-imag...",bot,rnn


In [6]:
# Output directory for the splits produced with preprocess_complex.
PREPROCESSED_BASEPATH_PATH = os.path.join("..", "data", "preprocessed_url_complex")
# makedirs(exist_ok=True) creates missing parent directories as well and
# avoids the check-then-create race of os.path.exists + os.mkdir.
os.makedirs(PREPROCESSED_BASEPATH_PATH, exist_ok=True)
# Save the preprocessed frames (new_*). The raw train/validation/test frames
# are never modified, so writing them would only duplicate the original data.
new_train.to_csv(os.path.join(PREPROCESSED_BASEPATH_PATH, 'train.csv'), index=False)
new_validation.to_csv(os.path.join(PREPROCESSED_BASEPATH_PATH, 'validation.csv'), index=False)
new_test.to_csv(os.path.join(PREPROCESSED_BASEPATH_PATH, "test.csv"), index=False)