In [79]:
import pandas as pd
import numpy as np
import re

In [80]:
np.random.seed(42)

In [81]:
df = pd.read_csv('eng_reviews.csv')
df.drop(['Unnamed: 0', 'language'], axis=1, inplace=True)

In [82]:
df.head()

Unnamed: 0,review
0,"A fun reinvention of the Pokémon formula, hope..."
1,"A fun, challenging (at times), Pokemon game wi..."
2,"Best pokemon game of all time! New pokemon, ne..."
3,"One outstanding experience, that is certain. I..."
4,Incredible game that moves the Pokemon series ...


In [83]:
df = df['review'].str.lower()

In [84]:
df = df.to_frame()

In [85]:
df.head()

Unnamed: 0,review
0,"a fun reinvention of the pokémon formula, hope..."
1,"a fun, challenging (at times), pokemon game wi..."
2,"best pokemon game of all time! new pokemon, ne..."
3,"one outstanding experience, that is certain. i..."
4,incredible game that moves the pokemon series ...


In [97]:
remove_symbols('hola! ()')

'hola    '

In [86]:
def remove_symbols(string):
    return re.sub(r'[^\w]', ' ', string)
    #return re.sub("[^A-Za-z]+","",string)

In [87]:
df = df['review'].apply(remove_symbols)
df = df.to_frame()

In [98]:
df.head()

Unnamed: 0,review
0,a fun reinvention of the pokémon formula hope...
1,a fun challenging at times pokemon game wi...
2,best pokemon game of all time new pokemon ne...
3,one outstanding experience that is certain i...
4,incredible game that moves the pokemon series ...


In [89]:
df.to_csv('cleaned_data.csv')

In [90]:
reviews = df.unstack().values

In [91]:
len(reviews)

836

In [92]:
reviews

array(['a fun reinvention of the pokémon formula  hopefully the next version keeps this change while giving it a better look',
       'a fun  challenging  at times   pokemon game with an addictive gameplay loop and an interesting system of fighting and catching pokemon  it s a huge step in the right direction  the graphics  performance  and multiplayer options need work  but those are the only major flaws  the story  pacing  battles  catching mechanics  exploration  and general gameplay are such an improvement over sword shield and sun moon there are major series changes that make big differences in quality of life  such as no limit to the number of moves a pokemon can learn  they just have 4 memorized pre battles  and not having to trade to evolve certain pokemon anymore  using rare items on them instead   exploration  crafting  gathering  and finding rare  and shiny  pokemon have never been this fun  gone is much of the tedium and boredom found in previous games if you care only abou

In [93]:
reviews[4]

'incredible game that moves the pokemon series forward in a fresh new direction  the gameplay loop is fun  and while the graphics aren t great  they are more than fine  gameplay should always take precedent over graphics  and legends  arceus is a joy to play '

In [49]:
corpus = dict()
for review in reviews:
    for char in review.split(' '):
        if char not in corpus:
            
            corpus[char] = 1
        else:
            corpus[char] += 1

In [50]:
corpus

{'a': 2682,
 'fun': 255,
 'reinvention': 1,
 'of': 1869,
 'the': 5569,
 'pokémon': 670,
 'formula,': 17,
 'hopefully': 20,
 'next': 73,
 'version': 15,
 'keeps': 10,
 'this': 1346,
 'change': 46,
 'while': 123,
 'giving': 36,
 'it': 1196,
 'better': 164,
 'look': 118,
 'fun,': 40,
 'challenging': 11,
 '(at': 3,
 'times),': 1,
 'pokemon': 1053,
 'game': 1744,
 'with': 709,
 'an': 285,
 'addictive': 17,
 'gameplay': 234,
 'loop': 50,
 'and': 2771,
 'interesting': 44,
 'system': 69,
 'fighting': 22,
 'catching': 116,
 'pokemon.': 53,
 "it's": 413,
 'huge': 51,
 'step': 102,
 'in': 1293,
 'right': 98,
 'direction!': 2,
 'graphics,': 58,
 'performance,': 5,
 'multiplayer': 8,
 'options': 12,
 'need': 82,
 'work,': 3,
 'but': 949,
 'those': 53,
 'are': 862,
 'only': 194,
 'major': 15,
 'flaws.': 2,
 'story,': 38,
 'pacing,': 2,
 'battles,': 36,
 'mechanics,': 26,
 'exploration,': 4,
 'general': 10,
 'such': 59,
 'improvement': 20,
 'over': 101,
 'sword/shield': 6,
 'sun/moon.there': 1,
 'ser

In [100]:
word_id = dict()
cnt = 0
for review in reviews:
    for char in review.split(' '):
        if char not in word_id:
            word_id[char] = cnt
            cnt += 1

In [101]:
word_id

{'a': 0,
 'fun': 1,
 'reinvention': 2,
 'of': 3,
 'the': 4,
 'pokémon': 5,
 'formula': 6,
 '': 7,
 'hopefully': 8,
 'next': 9,
 'version': 10,
 'keeps': 11,
 'this': 12,
 'change': 13,
 'while': 14,
 'giving': 15,
 'it': 16,
 'better': 17,
 'look': 18,
 'challenging': 19,
 'at': 20,
 'times': 21,
 'pokemon': 22,
 'game': 23,
 'with': 24,
 'an': 25,
 'addictive': 26,
 'gameplay': 27,
 'loop': 28,
 'and': 29,
 'interesting': 30,
 'system': 31,
 'fighting': 32,
 'catching': 33,
 's': 34,
 'huge': 35,
 'step': 36,
 'in': 37,
 'right': 38,
 'direction': 39,
 'graphics': 40,
 'performance': 41,
 'multiplayer': 42,
 'options': 43,
 'need': 44,
 'work': 45,
 'but': 46,
 'those': 47,
 'are': 48,
 'only': 49,
 'major': 50,
 'flaws': 51,
 'story': 52,
 'pacing': 53,
 'battles': 54,
 'mechanics': 55,
 'exploration': 56,
 'general': 57,
 'such': 58,
 'improvement': 59,
 'over': 60,
 'sword': 61,
 'shield': 62,
 'sun': 63,
 'moon': 64,
 'there': 65,
 'series': 66,
 'changes': 67,
 'that': 68,
 'make

In [102]:
cnt

6380

In [67]:
def one_hot_encode(word_id, corpus_size):
    vector = [0] * corpus_size
    vector[word_id] = 1
    return vector

In [68]:
def concat(*iterables):
    for iterable in iterables:
        yield from iterable

In [98]:
def generate_training_data(corpus, word_id, window_size):
    X, y = list(), list()
    num_words = len(corpus)
    
    for i in range(num_words):
        idx = concat(
            range(max(0, i-window_size), i), range(i, min(num_words, i + window_size + 1))
        )
        for j in idx:
            if i == j:
                continue
            X.append(one_hot_encode(word_id[corpus[i]], num_words))
            y.append(one_hot_encode(word_id[corpus[j]], num_words))
    return np.asarray(X), np.asarray(y)

In [99]:
X, y = generate_training_data(reviews[0].split(' '), word_id, 2)

In [100]:
X

array([[1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0]])

In [101]:
y

array([[0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])