In [1]:
import os
import pandas as pd
import sklearn.model_selection

# Column names shared by every DataFrame built in this notebook.
TEXT_COLUMN, LABEL_COLUMN = 'text', 'label'

# Seed passed as `random_state` to the shuffles and to the train/validation split.
SEED = 42

# Fraction of the training rows held out as the validation split
# (passed as `test_size` to train_test_split).
VALIDATION_SIZE = 0.2

In [31]:
def _read_single_column(file_name, column_name, **read_csv_kwargs):
    """Read one RTSP-MR file into a Series named ``column_name``.

    Parameters
    ----------
    file_name : str
        File name inside ``./as_is/RTSP-MR``.
    column_name : str
        Name given to the first (index 0) parsed column, which is the one returned.
    **read_csv_kwargs
        Extra keyword arguments forwarded to ``pd.read_csv`` (e.g. ``encoding``).

    Returns
    -------
    pd.Series
        The first parsed column of the file, renamed to ``column_name``.
    """
    path = os.path.join('.', 'as_is', 'RTSP-MR', file_name)
    # A multi-character separator is only supported by the python parser.
    # Requesting it explicitly gives the same parse as the implicit fallback,
    # without the ParserWarning.
    frame = pd.read_csv(path, sep = '\t\t\t', header = None, engine = 'python', **read_csv_kwargs)
    return frame.rename(columns = {0: column_name})[column_name]


def _load_split(text_file, label_file):
    """Build a shuffled DataFrame with TEXT_COLUMN and LABEL_COLUMN for one split.

    Texts are decoded as ISO-8859-1 by ``read_csv``; the resulting values are
    already Python ``str`` objects, so no further re-encoding step is applied.
    Rows are shuffled reproducibly with ``SEED`` and the index is reset.

    NOTE(review): the two Series are aligned on their positional index. If the
    text and label files ever yield a different number of rows (``read_csv``
    skips blank lines by default), the surplus rows get NaN in the other
    column rather than raising -- confirm both files have equal row counts.
    """
    split = pd.DataFrame({
        TEXT_COLUMN : _read_single_column(text_file, TEXT_COLUMN, encoding = 'ISO-8859-1'),
        LABEL_COLUMN : _read_single_column(label_file, LABEL_COLUMN),
    })
    return split.sample(frac = 1, random_state = SEED).reset_index(drop = True)


train = _load_split('text_train.txt', 'label_train.txt')
test = _load_split('text_test.txt', 'label_test.txt')


In [32]:
# Hold out VALIDATION_SIZE of the training rows as a validation split, keeping
# the label proportions equal in both parts (stratified on LABEL_COLUMN).
# NOTE: `train` is both the input and one of the outputs, so re-running this
# cell without re-running the loading cell splits the already reduced frame again.
train, validation = sklearn.model_selection.train_test_split(
    train,
    test_size = VALIDATION_SIZE,
    random_state = SEED,
    stratify = train[LABEL_COLUMN],
)

In [33]:
# Remove rows with a missing text or label from every split.
# The names are rebound instead of using `inplace = True`: `train` and
# `validation` are row selections produced by train_test_split, and mutating
# such selections in place can emit pandas' SettingWithCopyWarning.
train = train.dropna()
validation = validation.dropna()
test = test.dropna()

In [36]:
# Persist the three splits as CSV files (without the index column) under
# <STANDARDIZED_DATA_PATH>/RTSP, creating the directory when it is missing.
STANDARDIZED_DATA_PATH = './with_validation_splits/'
rtsp_output_dir = os.path.join(STANDARDIZED_DATA_PATH, 'RTSP')
os.makedirs(rtsp_output_dir, exist_ok = True)
for split_frame, csv_name in (
    (train, 'train.csv'),
    (validation, 'validation.csv'),
    (test, 'test.csv'),
):
    split_frame.to_csv(os.path.join(rtsp_output_dir, csv_name), index = False)

In [3]:
# Reload the saved splits from <STANDARDIZED_DATA_PATH>/RTSP for inspection.
STANDARDIZED_DATA_PATH = './with_validation_splits/'
rtsp_input_dir = os.path.join(STANDARDIZED_DATA_PATH, 'RTSP')
tr, val, tes = (
    pd.read_csv(os.path.join(rtsp_input_dir, csv_name))
    for csv_name in ('train.csv', 'validation.csv', 'test.csv')
)

In [4]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer


def _row_token_counts(doc_term_matrix):
    """Return a 1-D array holding the sum of each row of ``doc_term_matrix``.

    ``.sum(axis = 1)`` on a scipy sparse matrix returns an (n, 1) ``np.matrix``;
    it is flattened here so the result can be assigned to a DataFrame column
    without relying on pandas squeezing 2-D input implicitly.
    """
    return np.asarray(doc_term_matrix.sum(axis = 1)).ravel()


# Text length = number of tokens CountVectorizer extracts from each row.
# The vectorizer is deliberately re-fitted on every split: with a vocabulary
# learned from the split itself, every extracted token is counted, whereas
# `transform` with the training vocabulary would drop unseen tokens and
# under-count validation/test lengths.
# NOTE: after this cell `vectorizer` holds the vocabulary of the *test* split,
# and X, X_val and X_test do not share a feature space.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(tr['text'])
X_val = vectorizer.fit_transform(val['text'])
X_test = vectorizer.fit_transform(tes['text'])

tr['length'] = _row_token_counts(X)
val['length'] = _row_token_counts(X_val)
tes['length'] = _row_token_counts(X_test)

# Summary statistics of the token counts for train, validation and test.
tr['length'].describe(), val['length'].describe(), tes['length'].describe()

(count    5686.000000
 mean       18.112381
 std         8.460838
 min         1.000000
 25%        12.000000
 50%        18.000000
 75%        24.000000
 max        50.000000
 Name: length, dtype: float64,
 count    1422.000000
 mean       18.139241
 std         8.506587
 min         1.000000
 25%        12.000000
 50%        18.000000
 75%        24.000000
 max        47.000000
 Name: length, dtype: float64,
 count    3554.000000
 mean       18.127743
 std         8.483623
 min         1.000000
 25%        12.000000
 50%        18.000000
 75%        24.000000
 max        46.000000
 Name: length, dtype: float64)

In [5]:
# Relative frequency of each label in the train, validation and test splits.
tuple(
    split['label'].value_counts(normalize = True)
    for split in (tr, val, tes)
)

(1    0.5
 0    0.5
 Name: label, dtype: float64,
 1    0.5
 0    0.5
 Name: label, dtype: float64,
 1    0.5
 0    0.5
 Name: label, dtype: float64)