In [1]:
# import packages
import re
import spacy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter, defaultdict, deque
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.metrics import roc_curve, roc_auc_score, confusion_matrix, classification_report
from sklearn.metrics import precision_recall_curve
%matplotlib inline

# Load spaCy's small English pipeline ('en_core_web_sm').
# NOTE(review): the small model is not believed to ship static word vectors,
# despite what the earlier comment implied — confirm against the spaCy model card.
nlp = spacy.load('en_core_web_sm')
import string
# Punctuation characters taken from the standard-library `string` module
punctuations = string.punctuation
from  spacy.lang.en.stop_words import STOP_WORDS
# Creating a Spacy Parser
from spacy.lang.en import English
from stemming.porter2 import stem
from html import unescape

In [2]:
# Read the gzip-compressed, line-delimited JSON review dump into a DataFrame
vg = pd.read_json(
    '../Amazon_Data/Video_Games_5.json.gz',
    compression='gzip',
    lines=True,
)

In [3]:
# Preview the first rows of the freshly loaded frame
vg.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5,True,"10 17, 2015",A1HP7NVNPFMA4N,700026657,Ambrosia075,"This game is a bit hard to get the hang of, bu...",but when you do it's great.,1445040000,,,
1,4,False,"07 27, 2015",A1JGAP0185YJI6,700026657,travis,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it",1437955200,,,
2,3,True,"02 23, 2015",A1YJWEXHQBWK2B,700026657,Vincent G. Mezera,ok game.,Three Stars,1424649600,,,
3,2,True,"02 20, 2015",A2204E1TH211HT,700026657,Grandma KR,"found the game a bit too complicated, not what...",Two Stars,1424390400,,,
4,5,True,"12 25, 2014",A2RF5B5H74JLPE,700026657,jon,"great game, I love it and have played it since...",love this game,1419465600,,,


In [4]:
# Keep only the star rating and the review body; every other column is dropped
keep_cols = ['overall', 'reviewText']
vg = vg.loc[:, keep_cols]

## Notes
- 'asin' = product ID
- 'reviewText' = the review text
- 'overall' = the star rating

In [5]:
# Inspect dtypes and non-null counts (reviewText has missing entries)
vg.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497577 entries, 0 to 497576
Data columns (total 2 columns):
overall       497577 non-null int64
reviewText    497419 non-null object
dtypes: int64(1), object(1)
memory usage: 7.6+ MB


In [6]:
# Drop rows with any missing value, then downcast the star rating to int16.
# The cast goes through DataFrame.astype instead of
# `vg.loc[:, 'overall'] = vg.overall.astype('int16')`: on pandas >= 2.0 a
# full-column `.loc` assignment writes into the existing int64 column in place,
# so the dtype would silently stay int64.
vg = vg.dropna(how='any').astype({'overall': 'int16'})

In [47]:
# Re-check dtypes and non-null counts after cleaning.
# NOTE(review): the recorded output (execution count 47) lists a 'pt_sentiment'
# column that is only created further down, so this cell was last run out of
# order — re-run with Restart & Run All to refresh the output.
vg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 497419 entries, 0 to 497576
Data columns (total 3 columns):
overall         497419 non-null int16
reviewText      497419 non-null object
pt_sentiment    497419 non-null int16
dtypes: int16(2), object(1)
memory usage: 9.5+ MB


In [8]:
# (rows, columns) of the frame once incomplete rows are gone
vg.shape

(497419, 2)

In [9]:
# Distribution of star ratings (heavily skewed towards 5 stars)
vg.overall.value_counts()

5    299623
4     93644
3     49140
1     30879
2     24133
Name: overall, dtype: int64

In [10]:
# Three-class label stored as a category: stars 1-2 -> 1, star 3 -> 2, stars 4-5 -> 3
three_class_map = {1: 1, 2: 1, 3: 2, 4: 3, 5: 3}
vg['sentiment'] = vg['overall'].map(three_class_map).astype('category')

In [11]:
# Binary label stored as int16: stars 1-2 -> 0, stars 3-5 -> 1
binary_map = {1: 0, 2: 0, 3: 1, 4: 1, 5: 1}
vg['pt_sentiment'] = vg['overall'].map(binary_map).astype('int16')

In [12]:
# Confirm the label columns and their dtypes.
# NOTE(review): the recorded output shows no 'sentiment' column although an
# earlier cell creates one — the output reflects a different kernel state.
vg.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 497419 entries, 0 to 497576
Data columns (total 3 columns):
overall         497419 non-null int16
reviewText      497419 non-null object
pt_sentiment    497419 non-null int16
dtypes: int16(2), object(1)
memory usage: 9.5+ MB


In [14]:
# Class balance of the binary label (label 1 dominates)
vg.pt_sentiment.value_counts()

1    442407
0     55012
Name: pt_sentiment, dtype: int64

In [15]:
# Stratified 75/25 hold-out split on the binary label with a fixed seed;
# the test texts are the ones scored by the classifier cells further down.
train_text, test_text, train_labels, test_labels = train_test_split(
    vg.reviewText,
    vg.pt_sentiment,
    stratify=vg.pt_sentiment,
    test_size=0.25,
    random_state=42,
)

In [17]:
# Flair imports for the pre-trained sentiment model and its input wrapper
from flair.data import Sentence
from flair.models import TextClassifier

# Load the pre-trained 'en-sentiment' classifier. On first use the weights are
# downloaded and cached (about 1.5 GB according to the recorded log).
# The result containers (`scores`, `values`) and the loop counter are no longer
# initialised here: they are created again right before they are first used,
# so the copies in this cell were dead state.
classifier = TextClassifier.load('en-sentiment')

2020-02-24 08:36:46,985 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4/classy-imdb-en-rnn-cuda%3A0/imdb-v0.4.pt not found in cache, downloading to C:\Users\CHUCK~1.TUC\AppData\Local\Temp\tmpk_ntfis4


100%|██████████████████████████████████████████████████████████████| 1501979561/1501979561 [22:35<00:00, 1107869.54B/s]


2020-02-24 08:59:23,341 copying C:\Users\CHUCK~1.TUC\AppData\Local\Temp\tmpk_ntfis4 to cache at C:\Users\chuck.tucker\.flair\models\imdb-v0.4.pt
2020-02-24 08:59:29,437 removing temp file C:\Users\CHUCK~1.TUC\AppData\Local\Temp\tmpk_ntfis4
2020-02-24 08:59:29,569 loading file C:\Users\chuck.tucker\.flair\models\imdb-v0.4.pt


In [43]:
# Number of reviews in the hold-out split
len(test_text)

124355

In [44]:
# Split the hold-out reviews into N_CHUNKS near-equal positional slices so each
# chunk can be wrapped in Sentence objects and scored separately.
# The earlier fixed 100,000-row boundaries (up to 400,000) did not fit the
# 124,355-row test set: the first chunk held about 80% of the rows and the
# last three chunks were empty.
N_CHUNKS = 5
chunk_size = max(1, -(-len(test_text) // N_CHUNKS))  # ceiling division, never 0
test_text1, test_text2, test_text3, test_text4, test_text5 = [
    test_text.iloc[start:start + chunk_size]  # .iloc: explicitly positional
    for start in range(0, N_CHUNKS * chunk_size, chunk_size)
]

# Placeholders for the Sentence lists built from each chunk
sentences1, sentences2, sentences3, sentences4, sentences5 = [], [], [], [], []

# Sanity check: together the chunks must cover every test review exactly once
chunk_total = sum(
    len(chunk)
    for chunk in (test_text1, test_text2, test_text3, test_text4, test_text5)
)
assert chunk_total == len(test_text), "chunks do not cover test_text"
print(chunk_total)

124355


In [None]:
# Wrap every review of the first chunk in a flair Sentence, preserving order
sentences1 = list(map(Sentence, test_text1))

In [45]:
# Fresh containers for the classifier output, keyed by sentence position:
# `scores` defaults to 0.0 and `values` to '' for keys that were never set.
from collections import defaultdict

scores, values = defaultdict(float), defaultdict(str)

In [22]:
# Score every sentence of the first chunk in mini-batches of 32.
# NOTE(review): the recorded run of this cell failed with
# "DefaultCPUAllocator: not enough memory", so the outputs of the cells that
# follow were never produced. Presumably a smaller chunk is needed —
# confirm on the target machine.
classifier.predict(sentences1, mini_batch_size=32)

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 134217728 bytes. Buy new RAM!


In [None]:
# Collect the first label of every scored sentence, keyed by its position in
# `sentences1` so the rows stay aligned with `test_text1`.
for i, sentence in enumerate(sentences1):
    if sentence.labels:
        top_label = sentence.labels[0]
        scores[i] = top_label.score
        values[i] = top_label.value
    else:
        # No label was attached to this sentence (NOTE(review): presumably
        # empty text skipped by predict — confirm). Keep a placeholder row
        # instead of raising IndexError part-way through the loop.
        scores[i] = float('nan')
        values[i] = ''

In [None]:
# One row per scored sentence: the label score and the label value
test_predictions = pd.DataFrame(dict(probability=scores, prediction=values))

In [None]:
# TODO: append the results for the other sentence chunks when they have finished