This notebook is used to:
* take random samples of the target domains
* check the distributions of the random samples
* take a random sample with balanced labels (shuffled and unshuffled)
* save the outputs to a csv

# Imports

In [None]:
import pandas as pd

# Constants

In [None]:
# Base names (without extension) of the interim training CSVs read further down.
FILE_1, FILE_2 = 'games_train', 'sew_train'

# Read in Games Training Set

In [None]:
# Load the Games training split. The file is read as header-less with every
# value kept as a string, so the two column names are assigned by hand.
game_train = pd.read_csv(
    f'../data/interim/{FILE_1}.csv',
    header=None,
    dtype='string',
)
game_train.columns = ['text', 'label']
game_train.head(2)

# Take a random sample

## balanced

In [None]:
# Random Even Shuffled
# Balanced samples: draw the same number of rows from every label (5, 50, 500
# or 5000 per label), then shuffle so the rows are not ordered by label.
#
# Each label group is sampled explicitly instead of through groupby.apply, so
# the 'label' column is always kept and no (label, row) MultiIndex is left
# behind. The shuffle is seeded as well; an unseeded .sample(frac=1) would give
# a different row order on every run. reset_index gives each sample a plain
# 0..n-1 index.
g_re_00010, g_re_00100, g_re_01000, g_re_10000 = (
    pd.concat([rows.sample(n=per_label, random_state=42)
               for _, rows in game_train.groupby('label')])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    for per_label in (5, 50, 500, 5000)
)

In [None]:
# Display the smallest balanced Games sample (5 rows per label) for a visual check.
g_re_00010

## unbalanced

In [None]:
# Random
# Plain random draws of 10 / 100 / 1000 / 10000 rows from the Games training
# set (no per-label balancing), each re-indexed from 0.
g_ra_00010, g_ra_00100, g_ra_01000, g_ra_10000 = (
    game_train.sample(n=size, random_state=42).reset_index(drop=True)
    for size in (10, 100, 1000, 10000)
)

In [None]:
# Display the 10-row unbalanced Games sample for a visual check.
g_ra_00010

# Check Distributions

### Games unbalanced

In [None]:
# Unbalanced Games samples and their display names, listed in matching order
# so that position k in one list describes position k in the other.
games_ra_names = ['g_ra_00010', 'g_ra_00100', 'g_ra_01000', 'g_ra_10000']
games_ra = [
    g_ra_00010,
    g_ra_00100,
    g_ra_01000,
    g_ra_10000,
]

In [None]:
# Report the label balance of every unbalanced Games sample.
# The whole 'label' column is compared at once instead of looking rows up one
# by one, so the count is fast and does not depend on how the frame is indexed.
for name, sample in zip(games_ra_names, games_ra):
    corp_len = len(sample)
    # Labels are compared as text: '1' is the positive class and every other
    # row is reported as negative.
    pos = int((sample['label'] == '1').sum())
    neg = corp_len - pos
    print(f"For Dataset {name}\nCount Positive: {pos}\nCount Negative: {neg}\nRatio: {round(pos/corp_len*100,2)}% Positive\n")

### Games Balanced

In [None]:
# Balanced Games samples and their display names, listed in matching order
# so that position k in one list describes position k in the other.
games_re_names = ['g_re_00010', 'g_re_00100', 'g_re_01000', 'g_re_10000']
games_re = [
    g_re_00010,
    g_re_00100,
    g_re_01000,
    g_re_10000,
]

In [None]:
# Report the label balance of every balanced Games sample.
# The whole 'label' column is compared at once. Looking labels up with an
# integer key (label[j]) only works when the frame has a plain 0..n-1 index;
# a vectorized comparison gives the right count whatever the index looks like.
for name, sample in zip(games_re_names, games_re):
    corp_len = len(sample)
    # Labels are compared as text: '1' is the positive class and every other
    # row is reported as negative.
    pos = int((sample['label'] == '1').sum())
    neg = corp_len - pos
    print(f"For Dataset {name}\nCount Positive: {pos}\nCount Negative: {neg}\nRatio: {round(pos/corp_len*100,2)}% Positive\n")

# Save to csv

In [None]:
# Write each balanced Games sample to its own CSV, without index or header row.
for out_path, sample in {
    '../data/random/games_res_00010.csv': g_re_00010,
    '../data/random/games_res_00100.csv': g_re_00100,
    '../data/random/games_res_01000.csv': g_re_01000,
    '../data/random/games_res_10000.csv': g_re_10000,
}.items():
    sample.to_csv(out_path, index=False, header=False)

In [None]:
# Write each unbalanced Games sample to its own CSV, without index or header row.
for out_path, sample in {
    '../data/random/games_00010.csv': g_ra_00010,
    '../data/random/games_00100.csv': g_ra_00100,
    '../data/random/games_01000.csv': g_ra_01000,
    '../data/random/games_10000.csv': g_ra_10000,
}.items():
    sample.to_csv(out_path, index=False, header=False)

# Read Sew Data Set

In [None]:
# Load the Sew training split with every value kept as a string, then give its
# two columns fixed names.
# NOTE(review): unlike the Games load, no header=None is passed here, so the
# first line of the CSV is consumed as a header. Confirm the Sew file really
# has a header row; otherwise its first record is silently lost.
sew_train = pd.read_csv(
    f'../data/interim/{FILE_2}.csv',
    dtype='string',
)
sew_train.columns = ['text', 'label']
sew_train.head(2)

# Take a Sample

## balanced

In [None]:
# Sew Random Even Shuffled
# Balanced samples: draw the same number of rows from every label (5, 50, 500
# or 5000 per label), then shuffle so the rows are not ordered by label.
#
# Each label group is sampled explicitly instead of through groupby.apply, so
# the 'label' column is always kept and no (label, row) MultiIndex is left
# behind. The shuffle is seeded as well; an unseeded .sample(frac=1) would give
# a different row order on every run. reset_index gives each sample a plain
# 0..n-1 index.
s_re_00010, s_re_00100, s_re_01000, s_re_10000 = (
    pd.concat([rows.sample(n=per_label, random_state=42)
               for _, rows in sew_train.groupby('label')])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    for per_label in (5, 50, 500, 5000)
)

In [None]:
# Display the smallest balanced Sew sample (5 rows per label) for a visual check.
s_re_00010

## unbalanced

In [None]:
# Sew Random Unbalanced
# Plain random draws of 10 / 100 / 1000 / 10000 rows from the Sew training set
# (no per-label balancing), each re-indexed from 0.
s_ra_00010, s_ra_00100, s_ra_01000, s_ra_10000 = (
    sew_train.sample(n=size, random_state=42).reset_index(drop=True)
    for size in (10, 100, 1000, 10000)
)

# Check Distributions

### Sew unbalanced

In [None]:
# Unbalanced Sew samples and their display names, listed in matching order
# so that position k in one list describes position k in the other.
sew_ra_names = ['s_ra_00010', 's_ra_00100', 's_ra_01000', 's_ra_10000']
sew_ra = [
    s_ra_00010,
    s_ra_00100,
    s_ra_01000,
    s_ra_10000,
]

In [None]:
# Report the label balance of every unbalanced Sew sample.
# The whole 'label' column is compared at once instead of looking rows up one
# by one, so the count is fast and does not depend on how the frame is indexed.
for name, sample in zip(sew_ra_names, sew_ra):
    corp_len = len(sample)
    # Labels are compared as text: '1' is the positive class and every other
    # row is reported as negative.
    pos = int((sample['label'] == '1').sum())
    neg = corp_len - pos
    print(f"For Dataset {name}\nCount Positive: {pos}\nCount Negative: {neg}\nRatio: {round(pos/corp_len*100,2)}% Positive\n")

### Sew balanced

In [None]:
# Balanced Sew samples and their display names, listed in matching order
# so that position k in one list describes position k in the other.
sew_re_names = ['s_re_00010', 's_re_00100', 's_re_01000', 's_re_10000']
sew_re = [
    s_re_00010,
    s_re_00100,
    s_re_01000,
    s_re_10000,
]

In [None]:
# Report the label balance of every balanced Sew sample.
# The whole 'label' column is compared at once. Looking labels up with an
# integer key (label[j]) only works when the frame has a plain 0..n-1 index;
# a vectorized comparison gives the right count whatever the index looks like.
for name, sample in zip(sew_re_names, sew_re):
    corp_len = len(sample)
    # Labels are compared as text: '1' is the positive class and every other
    # row is reported as negative.
    pos = int((sample['label'] == '1').sum())
    neg = corp_len - pos
    print(f"For Dataset {name}\nCount Positive: {pos}\nCount Negative: {neg}\nRatio: {round(pos/corp_len*100,2)}% Positive\n")

# Save to CSV

In [None]:
# Write each unbalanced Sew sample to its own CSV, without index or header row.
for out_path, sample in {
    '../data/random/sew_00010.csv': s_ra_00010,
    '../data/random/sew_00100.csv': s_ra_00100,
    '../data/random/sew_01000.csv': s_ra_01000,
    '../data/random/sew_10000.csv': s_ra_10000,
}.items():
    sample.to_csv(out_path, index=False, header=False)

In [None]:
# Write each balanced Sew sample to its own CSV, without index or header row.
for out_path, sample in {
    '../data/random/sew_res_00010.csv': s_re_00010,
    '../data/random/sew_res_00100.csv': s_re_00100,
    '../data/random/sew_res_01000.csv': s_re_01000,
    '../data/random/sew_res_10000.csv': s_re_10000,
}.items():
    sample.to_csv(out_path, index=False, header=False)