# In [4]:
import os
import re

import emoji
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

def cleaner(text):
    """Normalize a raw post into lowercase text without noise tokens.

    Processing order: lowercase, replace newlines, remove @mentions, remove
    URLs, drop emoji characters, turn ``#`` / ``_`` into plain word
    boundaries, strip leftover ``http`` fragments, remove punctuation and
    digits, and finally collapse whitespace.

    Args:
        text: The raw value to clean. Non-string values are converted with
            ``str``. ``None`` and float NaN are treated as empty text.

    Returns:
        The cleaned string; it may be empty.
    """
    # A missing value would otherwise be stringified and survive cleaning as
    # the literal word "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()  # Set all the words in lower case
    text = re.sub(r"\n", " ", text)  # Remove newline characters
    # "_" is part of the handle: without it "@some_user" left "_user" behind,
    # which the underscore replacement below turned into a stray word.
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)  # Remove usernames
    text = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", text)  # Remove the URLs
    text = " ".join(text.split())  # Remove extra white spaces
    # Drop every character that is a key of emoji.EMOJI_DATA.
    text = "".join(c for c in text if c not in emoji.EMOJI_DATA)  # Remove emojis
    # Keep hashtag words but not the marker; split snake_case into words.
    text = text.replace("#", "").replace("_", " ")
    # Remove "http" fragments that the URL pattern above did not match.
    text = text.replace("http", "")
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\d+", "", text)  # Remove numbers
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra whitespaces

    return text

# Seed shared by every split below so that re-running the script reproduces
# exactly the same train/test files.
SEED = 2122542
OUTPUT_DIR = "datasets"


def _split_and_save(frame, stem):
    """Split *frame* 80/20 with the shared seed and write both parts as CSV.

    Args:
        frame: The DataFrame to split.
        stem: File-name prefix; the parts are written to
            ``<OUTPUT_DIR>/<stem>_train.csv`` and
            ``<OUTPUT_DIR>/<stem>_test.csv`` without the index column.

    Returns:
        The ``(train, test)`` pair produced by ``train_test_split``.
    """
    train, test = train_test_split(frame, test_size=0.2, random_state=SEED)
    train.to_csv(f"{OUTPUT_DIR}/{stem}_train.csv", index=False)
    test.to_csv(f"{OUTPUT_DIR}/{stem}_test.csv", index=False)
    return train, test


# Load the raw data and clean the text column in place.
df = pd.read_csv("original.csv")
df['text'] = df['text'].map(cleaner)

# to_csv does not create missing parent directories, so make sure it exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Cut the rows, in file order, into three contiguous near-equal parts.
# Splitting the row positions (rather than passing the DataFrame itself to
# np.array_split) yields the same partition while avoiding the deprecated
# DataFrame.swapaxes code path.
dataset1, dataset2, dataset3 = (
    df.iloc[positions] for positions in np.array_split(np.arange(len(df)), 3)
)

dataset1_train, dataset1_test = _split_and_save(dataset1, "dataset1")
dataset2_train, dataset2_test = _split_and_save(dataset2, "dataset2")
dataset3_train, dataset3_test = _split_and_save(dataset3, "dataset3")

# The full-dataset split uses the same seed as the partial ones; it was
# previously unseeded and therefore changed on every run.
df_train, df_test = _split_and_save(df, "original")