# 1_data_processor.ipynb

Labels the data collected from each subreddit and combines the results into a single dataset.

In [1]:
import pandas as pd
import os
import csv
from pprint import pprint

In [2]:
# Folder the per-subreddit CSV files are read from.
input_folder = 'data_collection'

# Folder the processed CSV files are written to; created if it does not exist yet.
output_folder = 'data_combination'
os.makedirs(output_folder, exist_ok=True)

# Subreddits handled by this notebook. Each name is also used as a CSV base name
# and as the name of an indicator column.
subreddit_list = [
    'cancer',
    'COVID19positive',
    'diabetes',
    'eczema',
    'eyetriage',
    'GERD',
    'STD',
]

# Add subreddit labels

In [10]:
def label_subreddit_data(name, labels, source_dir, target_dir):
    """Load one subreddit's CSV, attach label columns, and save the result.

    Parameters
    ----------
    name : str
        Subreddit name; also the base name of the CSV file in both folders.
    labels : list of str
        All subreddit names. One integer indicator column is added per entry,
        in this order: 1 for the column equal to `name`, 0 for every other one.
    source_dir : str
        Folder that `<name>.csv` is read from.
    target_dir : str
        Folder that the labelled `<name>.csv` is written to.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with a 'Subreddit' column followed by the indicator
        columns appended.
    """
    frame = pd.read_csv(source_dir + '/' + name + '.csv')

    # Scalar assignment broadcasts the value to every row, so no row-wise
    # apply() is needed (and an empty frame is handled without special-casing).
    frame['Subreddit'] = name
    for label in labels:
        frame[label] = int(label == name)

    frame.to_csv(target_dir + '/' + name + '.csv', sep=',', encoding='utf-8', header=True)
    return frame


# One labelled frame per subreddit; the variable names are used by later cells.
data_cancer = label_subreddit_data('cancer', subreddit_list, input_folder, output_folder)
data_COVID19positive = label_subreddit_data('COVID19positive', subreddit_list, input_folder, output_folder)
data_diabetes = label_subreddit_data('diabetes', subreddit_list, input_folder, output_folder)
data_eczema = label_subreddit_data('eczema', subreddit_list, input_folder, output_folder)
data_eyetriage = label_subreddit_data('eyetriage', subreddit_list, input_folder, output_folder)
data_GERD = label_subreddit_data('GERD', subreddit_list, input_folder, output_folder)
data_STD = label_subreddit_data('STD', subreddit_list, input_folder, output_folder)

# Adjust dataset size to make them more balanced if needed
If the sample size of some subreddits is too large, the CNN model produces NaN when evaluating the loss during training.
We therefore try to balance the sample sizes so that each subreddit has roughly 2k to 6k samples.

In [11]:

# Shrink the larger datasets. truncate(before=k) drops every row whose index
# label is smaller than k and keeps the rest; the other frames are left as-is.
data_cancer = data_cancer.truncate(before=5000)
data_diabetes = data_diabetes.truncate(before=4000)
data_eczema = data_eczema.truncate(before=1000)
data_STD = data_STD.truncate(before=7000)

# Report the resulting (rows, columns) of every dataset, in a fixed order.
for _name, _frame in (
    ('cancer', data_cancer),
    ('COVID19positive', data_COVID19positive),
    ('diabetes', data_diabetes),
    ('eczema', data_eczema),
    ('eyetriage', data_eyetriage),
    ('GERD', data_GERD),
    ('STD', data_STD),
):
    print(_name)
    print(_frame.shape)

cancer
(6085, 16)
COVID19positive
(4921, 16)
diabetes
(6695, 16)
eczema
(6983, 16)
eyetriage
(2082, 16)
GERD
(10657, 16)
STD
(6348, 16)


# Combine the datasets

In [12]:
# Destination paths for the merged dataset.
combined_output_csv = f'{output_folder}/Combined.csv'
# NOTE(review): this path is defined but not written to in this cell - confirm it is used elsewhere.
combined_output_excel = f'{output_folder}/Combined.xlsx'

# Stack the per-subreddit frames vertically, in this fixed order.
# NOTE(review): each frame keeps its own row labels, so labels repeat in the
# combined frame and are written as the first CSV column - confirm this is intended.
frames_in_order = [
    data_cancer,
    data_COVID19positive,
    data_diabetes,
    data_eczema,
    data_eyetriage,
    data_GERD,
    data_STD,
]
data_combined = pd.concat(frames_in_order)

print(data_combined.shape)

data_combined.to_csv(combined_output_csv, sep=',', encoding='utf-8', header=True)

(43771, 16)
