In [9]:
import re
from pathlib import Path

import emoji
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

def cleaner(text):
    """Normalize a raw text value into lowercase words separated by single spaces.

    Steps, in order: lowercase, drop @mentions and URLs, drop characters
    listed in ``emoji.EMOJI_DATA``, drop ``#``, turn ``_`` into a space,
    drop leftover ``http`` fragments, drop every remaining non-word /
    non-space character, drop digits, and collapse whitespace.

    Args:
        text: The value to clean. Non-string values are converted with
            ``str()``; ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string, or ``""`` when the input is missing or nothing
        survives the cleaning.
    """
    # A missing value would otherwise be stringified into the literal
    # word "none" / "nan" and end up in the data as if it were real text.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = text.replace("\n", " ")

    # Mentions: \w also covers "_", so "@user_name" is removed as a whole
    # instead of leaving "_name" behind.
    text = re.sub(r"@\w+", "", text)
    # Any leftover "@..." token, plus URLs. "https?" makes the "s" optional
    # (not the "p"), and "www" must be followed by a dot so that ordinary
    # words that merely start with "www" are kept.
    text = re.sub(r"(?:@|https?://|www\.)\S+", "", text)

    # Emojis are deleted character by character, without inserting a space.
    text = "".join(c for c in text if c not in emoji.EMOJI_DATA)

    # "_" counts as a word character for \w, so it has to become a space
    # before the punctuation pass or it would glue words together.
    text = text.replace("#", "").replace("_", " ")
    text = text.replace("http", "")  # remnants of links without "://"
    text = re.sub(r"[^\w\s]", "", text)  # punctuation and other symbols
    text = re.sub(r"\d+", "", text)  # numbers

    # Collapse every whitespace run into one space and trim both ends.
    return " ".join(text.split())

# Clean the raw corpus, cut it into three consecutive parts, split every part
# (and the full corpus) 80/20 into train/test CSV files, then print the label
# distribution of each resulting split.

SEED = 2122542  # one seed for every split so that reruns give identical files
OUTPUT_DIR = Path("datasets")


def split_and_save(frame, name):
    """Split *frame* 80/20 and write both parts to OUTPUT_DIR.

    Writes ``<name>_train.csv`` and ``<name>_test.csv`` (without the index
    column) and returns the ``(train, test)`` DataFrames.
    """
    train, test = train_test_split(frame, test_size=0.2, random_state=SEED)
    train.to_csv(OUTPUT_DIR / f"{name}_train.csv", index=False)
    test.to_csv(OUTPUT_DIR / f"{name}_test.csv", index=False)
    return train, test


def report_label_counts(title, train, test):
    """Print the per-label row counts of a train/test pair under *title*."""
    print(title)
    print("Train dataset")
    print(train['labels'].value_counts(sort=False))
    print("Test dataset")
    print(test['labels'].value_counts(sort=False))


df = pd.read_csv("original.csv")
df['text'] = df['text'].map(cleaner)

# to_csv does not create missing directories.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Split the row positions rather than the DataFrame itself: calling
# np.array_split directly on a DataFrame relies on the deprecated
# DataFrame.swapaxes and emits a FutureWarning. The parts contain the same
# rows in the same order as before.
row_groups = np.array_split(np.arange(len(df)), 3)
dataset1, dataset2, dataset3 = (df.iloc[rows] for rows in row_groups)

dataset1_train, dataset1_test = split_and_save(dataset1, "dataset1")
dataset2_train, dataset2_test = split_and_save(dataset2, "dataset2")
dataset3_train, dataset3_test = split_and_save(dataset3, "dataset3")

# The full-corpus split is seeded like the others so it is reproducible too.
df_train, df_test = split_and_save(df, "original")

report_label_counts("DATASET 1", dataset1_train, dataset1_test)

report_label_counts("DATASET 2", dataset2_train, dataset2_test)

report_label_counts("DATASET 3", dataset3_train, dataset3_test)

report_label_counts("Original", df_train, df_test)

  return bound(*args, **kwds)


DATASET 1
Train dataset
labels
2    14369
0    13200
1     8579
Name: count, dtype: int64
Test dataset
labels
2    3694
1    2132
0    3212
Name: count, dtype: int64
DATASET 2
Train dataset
labels
1     8650
2    14216
0    13282
Name: count, dtype: int64
Test dataset
labels
0    3314
2    3516
1    2207
Name: count, dtype: int64
DATASET 3
Train dataset
labels
2    14249
0    13221
1     8678
Name: count, dtype: int64
Test dataset
labels
2    3607
0    3227
1    2203
Name: count, dtype: int64
Original
Train dataset
labels
0    39615
2    42890
1    25939
Name: count, dtype: int64
Test dataset
labels
1     6510
0     9841
2    10761
Name: count, dtype: int64
