# Imports

In [1]:
import pandas as pd

# Constants

In [2]:
FILE_1 = 'games_train'  
FILE_2 = 'sew_train'

# Read in Games Training Set

In [3]:
game_train = pd.read_csv('../data/interim/'+FILE_1+'.csv', dtype='string', header=None)
game_train.columns = ['text', 'label']
game_train.head(2)
#game_train['label'].astype(str)

Unnamed: 0,text,label
0,Five Stars Great Game,1
1,Five Stars Great conditions!!,1


# Take a random sample

## balanced

In [4]:
# Random Even Shuffled
g_re_00010 = game_train.groupby('label').apply(lambda x: x.sample(n=5, random_state=42)).sample(frac=1)
g_re_00100 = game_train.groupby('label').apply(lambda x: x.sample(n=50, random_state=42)).sample(frac=1)
g_re_01000 = game_train.groupby('label').apply(lambda x: x.sample(n=500, random_state=42)).sample(frac=1)
g_re_10000 = game_train.groupby('label').apply(lambda x: x.sample(n=5000, random_state=42)).sample(frac=1)

In [5]:
g_re_00010

Unnamed: 0_level_0,Unnamed: 1_level_0,text,label
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,276170,Wanted to like it..... I wanted so badly to li...,0
0,78051,Marvel to the Rescue? I'm not big on comic boo...,0
1,159528,You know what your getting. Good fun and some ...,1
1,336851,"Graphically superior, but lacks support from W...",1
0,112412,Don't buy it. I cannot believe how worse this ...,0
0,316664,is impossible to create a good looking player ...,0
0,199704,The last Super Nintendo game This is a very un...,0
1,161222,"Sad to see so many down grades on PS3, but it ...",1
1,178221,disappointed in the fan response I am quite di...,1
1,165715,Five Stars Nice and VERY long so the husband c...,1


## unbalanced

In [6]:
# Random
g_ra_00010 = game_train.sample(n=10, random_state=42).reset_index(drop=True)
g_ra_00100 = game_train.sample(n=100, random_state=42).reset_index(drop=True)
g_ra_01000 = game_train.sample(n=1000, random_state=42).reset_index(drop=True)
g_ra_10000 = game_train.sample(n=10000, random_state=42).reset_index(drop=True)

In [7]:
g_ra_00010

Unnamed: 0,text,label
0,"Great game Pretty fun, however i'm more a GoW ...",1
1,"Five Stars A great disc, you can keep using it...",1
2,Five Stars Game is in great condition thanks.,1
3,Five Stars perfect product,1
4,Rocking out This game is classic. The disk wo...,1
5,17-in-1 Bundle Pack For Nintendo DS Lite I bou...,1
6,Moving in a new direction The Good: Fantastic ...,1
7,A perfect gaming companion for your PC Brief ...,1
8,"Five Stars really good game, not my favorite z...",1
9,"Fun! For kids, fun and creative!",1


# Check Distributions

### Games unbalanced

In [8]:
games_ra = [g_ra_00010, g_ra_00100, g_ra_01000, g_ra_10000]
games_ra_names = ['g_ra_00010', 'g_ra_00100', 'g_ra_01000', 'g_ra_10000']

In [9]:
for i in range(len(games_ra)):
    corp_len = len(games_ra[i])
    name = games_ra_names[i]
    pos = 0
    neg = 0
    for j in range(corp_len):
        if games_ra[i]['label'][j] == '1':
            pos +=1
        else:
            neg +=1
    print(f"For Dataset {name}\nCount Positive: {pos}\nCount Negative: {neg}\nRatio: {round(pos/corp_len*100,2)}% Positive\n")

For Dataset g_ra_00010
Count Positive: 10
Count Negative: 0
Ratio: 100.0% Positive

For Dataset g_ra_00100
Count Positive: 90
Count Negative: 10
Ratio: 90.0% Positive

For Dataset g_ra_01000
Count Positive: 888
Count Negative: 112
Ratio: 88.8% Positive

For Dataset g_ra_10000
Count Positive: 8743
Count Negative: 1257
Ratio: 87.43% Positive



### Games Balanced

In [10]:
games_re = [g_re_00010, g_re_00100, g_re_01000, g_re_10000]
games_re_names = ['g_re_00010', 'g_re_00100', 'g_re_01000', 'g_re_10000']

In [11]:
for i in range(len(games_re)):
    corp_len = len(games_re[i])
    name = games_re_names[i]
    pos = 0
    neg = 0
    for j in range(corp_len):
        if games_re[i]['label'][j] == '1':
            pos +=1
        else:
            neg +=1
    print(f"For Dataset {name}\nCount Positive: {pos}\nCount Negative: {neg}\nRatio: {round(pos/corp_len*100,2)}% Positive\n")

For Dataset g_re_00010
Count Positive: 5
Count Negative: 5
Ratio: 50.0% Positive

For Dataset g_re_00100
Count Positive: 50
Count Negative: 50
Ratio: 50.0% Positive

For Dataset g_re_01000
Count Positive: 500
Count Negative: 500
Ratio: 50.0% Positive

For Dataset g_re_10000
Count Positive: 5000
Count Negative: 5000
Ratio: 50.0% Positive



# Save to csv

In [12]:
g_re_00010.to_csv('../data/random/games_res_00010.csv', index=False, header=False)
g_re_00100.to_csv('../data/random/games_res_00100.csv', index=False, header=False)
g_re_01000.to_csv('../data/random/games_res_01000.csv', index=False, header=False)
g_re_10000.to_csv('../data/random/games_res_10000.csv', index=False, header=False)

In [13]:
g_ra_00010.to_csv('../data/random/games_00010.csv', index=False, header=False)
g_ra_00100.to_csv('../data/random/games_00100.csv', index=False, header=False)
g_ra_01000.to_csv('../data/random/games_01000.csv', index=False, header=False)
g_ra_10000.to_csv('../data/random/games_10000.csv', index=False, header=False)

# Read Sew Data Set

In [14]:
sew_train = pd.read_csv('../data/interim/'+FILE_2+'.csv', dtype='string')
sew_train.columns = ['text', 'label']
sew_train.head(2)

Unnamed: 0,text,label
0,Recommend! I love this paper! I was kinda skep...,1
1,Five Stars This is a great pack filled with ca...,1


# Take a Sample

## balanced

In [15]:
# Sew Random Even Shuffled
s_re_00010 = sew_train.groupby('label').apply(lambda x: x.sample(n=5, random_state=42)).sample(frac=1)
s_re_00100 = sew_train.groupby('label').apply(lambda x: x.sample(n=50, random_state=42)).sample(frac=1)
s_re_01000 = sew_train.groupby('label').apply(lambda x: x.sample(n=500, random_state=42)).sample(frac=1)
s_re_10000 = sew_train.groupby('label').apply(lambda x: x.sample(n=5000, random_state=42)).sample(frac=1)

In [16]:
s_re_00010

Unnamed: 0_level_0,Unnamed: 1_level_0,text,label
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,20451,Don't Waste Your Money Poor quality. The edges...,0
1,357506,Five Stars good,1
1,114726,Five Stars Love my singer sewing,1
0,290954,You can fond cheaper in retail stores. It work...,0
1,253375,"Love these! So adorable, and now a family trad...",1
1,113824,"Five Stars These are so cool, I have made scar...",1
0,18131,One Star So cheaply made.,0
0,51198,One Star didn't fit together right,0
0,69724,They flip flop Not a favorite These can be a l...,0
1,277611,Very nice set Beautiful pastels set. Rich cre...,1


## unbalanced

In [17]:
# Sew Random Unbalanced
s_ra_00010 = sew_train.sample(n=10, random_state=42).reset_index(drop=True)
s_ra_00100 = sew_train.sample(n=100, random_state=42).reset_index(drop=True)
s_ra_01000 = sew_train.sample(n=1000, random_state=42).reset_index(drop=True)
s_ra_10000 = sew_train.sample(n=10000, random_state=42).reset_index(drop=True)

# Check Distributions

### Sew unbalanced

In [18]:
sew_ra = [s_ra_00010, s_ra_00100, s_ra_01000, s_ra_10000]
sew_ra_names = ['s_ra_00010', 's_ra_00100', 's_ra_01000', 's_ra_10000']

In [19]:
for i in range(len(sew_ra)):
    corp_len = len(sew_ra[i])
    name = sew_ra_names[i]
    pos = 0
    neg = 0
    for j in range(corp_len):
        if sew_ra[i]['label'][j] == '1':
            pos +=1
        else:
            neg +=1
    print(f"For Dataset {name}\nCount Positive: {pos}\nCount Negative: {neg}\nRatio: {round(pos/corp_len*100,2)}% Positive\n")

For Dataset s_ra_00010
Count Positive: 9
Count Negative: 1
Ratio: 90.0% Positive

For Dataset s_ra_00100
Count Positive: 94
Count Negative: 6
Ratio: 94.0% Positive

For Dataset s_ra_01000
Count Positive: 958
Count Negative: 42
Ratio: 95.8% Positive

For Dataset s_ra_10000
Count Positive: 9414
Count Negative: 586
Ratio: 94.14% Positive



### Sew balanced

In [20]:
sew_re = [s_re_00010, s_re_00100, s_re_01000, s_re_10000]
sew_re_names = ['s_re_00010', 's_re_00100', 's_re_01000', 's_re_10000']

In [21]:
for i in range(len(sew_re)):
    corp_len = len(sew_re[i])
    name = sew_re_names[i]
    pos = 0
    neg = 0
    for j in range(corp_len):
        if sew_re[i]['label'][j] == '1':
            pos +=1
        else:
            neg +=1
    print(f"For Dataset {name}\nCount Positive: {pos}\nCount Negative: {neg}\nRatio: {round(pos/corp_len*100,2)}% Positive\n")

For Dataset s_re_00010
Count Positive: 5
Count Negative: 5
Ratio: 50.0% Positive

For Dataset s_re_00100
Count Positive: 50
Count Negative: 50
Ratio: 50.0% Positive

For Dataset s_re_01000
Count Positive: 500
Count Negative: 500
Ratio: 50.0% Positive

For Dataset s_re_10000
Count Positive: 5000
Count Negative: 5000
Ratio: 50.0% Positive



# Save to CSV

In [22]:
s_ra_00010.to_csv('../data/random/sew_00010.csv', index=False, header=False)
s_ra_00100.to_csv('../data/random/sew_00100.csv', index=False, header=False)
s_ra_01000.to_csv('../data/random/sew_01000.csv', index=False, header=False)
s_ra_10000.to_csv('../data/random/sew_10000.csv', index=False, header=False)

In [23]:
s_re_00010.to_csv('../data/random/sew_res_00010.csv', index=False, header=False)
s_re_00100.to_csv('../data/random/sew_res_00100.csv', index=False, header=False)
s_re_01000.to_csv('../data/random/sew_res_01000.csv', index=False, header=False)
s_re_10000.to_csv('../data/random/sew_res_10000.csv', index=False, header=False)