Imports

In [None]:
# pip install nltk

In [None]:
import gzip
import pandas as pd
import json
from collections import defaultdict
import string
from nltk.tokenize import wordpunct_tokenize
from sklearn import linear_model
import matplotlib.pyplot as plt

# 1.

Functions to parse data

In [None]:
def parse(path):
  """Yield one decoded JSON object per non-blank line of a gzip file.

  Args:
    path: Location of a gzip-compressed file holding one JSON document
      per line.

  Yields:
    The object returned by ``json.loads`` for each line, in file order.
  """
  # The context manager closes the handle once the generator is exhausted
  # or closed, instead of leaving the file open.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # A blank (e.g. trailing) line is not valid JSON; skip it.
      if not l.strip():
        continue
      yield json.loads(l)

def getDF(path):
  """Load every record produced by ``parse(path)`` into a DataFrame.

  Each record is keyed by its 0-based position in the file; those keys
  become the row index of the returned frame.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

Read in data

In [None]:
# Parse the whole gzip file into one DataFrame; the path is relative to the
# notebook's working directory.
data = getDF('sample_data/Video_Games_5.json.gz')

Value counts of overall ratings before processing

In [None]:
data["overall"].value_counts()

5.0    299759
4.0     93654
3.0     49146
1.0     30883
2.0     24135
Name: overall, dtype: int64

Preview of the first rows before processing

In [None]:
data.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,style,image
0,5.0,True,"10 17, 2015",A1HP7NVNPFMA4N,700026657,Ambrosia075,"This game is a bit hard to get the hang of, bu...",but when you do it's great.,1445040000,,,
1,4.0,False,"07 27, 2015",A1JGAP0185YJI6,700026657,travis,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it",1437955200,,,
2,3.0,True,"02 23, 2015",A1YJWEXHQBWK2B,700026657,Vincent G. Mezera,ok game.,Three Stars,1424649600,,,
3,2.0,True,"02 20, 2015",A2204E1TH211HT,700026657,Grandma KR,"found the game a bit too complicated, not what...",Two Stars,1424390400,,,
4,5.0,True,"12 25, 2014",A2RF5B5H74JLPE,700026657,jon,"great game, I love it and have played it since...",love this game,1419465600,,,


Drop the identifier columns (reviewerID, asin, reviewerName) together with unixReviewTime, vote, style, and image - they will not be used

In [None]:
# Remove the columns that are not used as features.
data = data.drop(
    columns=[
        'reviewerID', 'asin', 'reviewerName',
        'vote', 'style', 'image',
        'unixReviewTime',
    ]
)

Keep only the first 50,000 reviews (note: no NaN rows are dropped in this step)

In [None]:
# Work on the first 50,000 reviews only. The explicit copy makes `data` an
# independent frame, so the column assignments in later cells do not write
# into a slice of the full dataset (pandas' SettingWithCopyWarning).
data = data.iloc[:50000].copy()

Data after processing

In [None]:
data

Unnamed: 0,overall,verified,reviewTime,reviewText,summary
0,5.0,True,"10 17, 2015","This game is a bit hard to get the hang of, bu...",but when you do it's great.
1,4.0,False,"07 27, 2015",I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it"
2,3.0,True,"02 23, 2015",ok game.,Three Stars
3,2.0,True,"02 20, 2015","found the game a bit too complicated, not what...",Two Stars
4,5.0,True,"12 25, 2014","great game, I love it and have played it since...",love this game
...,...,...,...,...,...
49995,5.0,True,"02 6, 2013","Like all of it's predacessors, the last instal...",So Awesome it hurts
49996,5.0,True,"01 31, 2011",This game is excellent fun! It is a good old 8...,Classic Gradius at its best!
49997,5.0,True,"12 3, 2010","This is, hands down, one of the best shooters ...",Don't mess with Vic Viper....
49998,5.0,True,"09 22, 2010",many gradius games don't offer you unlockable ...,GRADIUS V


Number of samples after processing

In [None]:
data.shape[0]

50000

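The full dataset has 497,577 reviews (more than the 50,000 required), and the subset kept here has exactly 50,000 samples, so we can use this dataset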

Convert reviewTime from object to datetime

In [None]:
# Parse strings such as "10 17, 2015" (month day, year) into datetime values.
data["reviewTime"] = pd.to_datetime(data["reviewTime"], format="%m %d, %Y")

# 2.

In [None]:
# Going to use TF-IDF; the vectorizer has stop words built in, so we don't need to implement that with nltk

In [None]:
# We don't really need reviewerID, asin, or reviewerName, since these are just identifiers.

In [None]:
data

Unnamed: 0,overall,verified,reviewTime,reviewText,summary
0,5.0,True,2015-10-17,"This game is a bit hard to get the hang of, bu...",but when you do it's great.
1,4.0,False,2015-07-27,I played it a while but it was alright. The st...,"But in spite of that it was fun, I liked it"
2,3.0,True,2015-02-23,ok game.,Three Stars
3,2.0,True,2015-02-20,"found the game a bit too complicated, not what...",Two Stars
4,5.0,True,2014-12-25,"great game, I love it and have played it since...",love this game
...,...,...,...,...,...
49995,5.0,True,2013-02-06,"Like all of it's predacessors, the last instal...",So Awesome it hurts
49996,5.0,True,2011-01-31,This game is excellent fun! It is a good old 8...,Classic Gradius at its best!
49997,5.0,True,2010-12-03,"This is, hands down, one of the best shooters ...",Don't mess with Vic Viper....
49998,5.0,True,2010-09-22,many gradius games don't offer you unlockable ...,GRADIUS V


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
import re
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import numpy as np

In [None]:
def clean(text):
    """Normalise a piece of review text before vectorisation.

    The value is converted to ``str`` (so non-string cells do not raise),
    every character that is neither a word character nor whitespace is
    removed, the result is lower-cased, and line breaks become a single
    space.

    Args:
        text: The raw cell value; usually a string, possibly ``None`` or NaN.

    Returns:
        The cleaned string; the empty string for ``None`` or float NaN.
    """
    # A missing value would otherwise be stringified into the token "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = re.sub(r'[^\w\s]', '', str(text)).lower()
    # Replace line breaks with a space; deleting them outright would glue
    # the last word of one line to the first word of the next.
    text = re.sub(r'[\r\n]+', ' ', text)
    return text

In [None]:
# Normalise both text columns with clean().
for text_col in ('reviewText', 'summary'):
    data[text_col] = data[text_col].map(clean)
# Encode the verified flag as 1 (truthy) or 0 (falsy).
data['verified'] = data['verified'].map(lambda flag: int(bool(flag)))

In [None]:
data

Unnamed: 0,overall,verified,reviewTime,reviewText,summary
0,5.0,1,2015-10-17,this game is a bit hard to get the hang of but...,but when you do its great
1,4.0,0,2015-07-27,i played it a while but it was alright the ste...,but in spite of that it was fun i liked it
2,3.0,1,2015-02-23,ok game,three stars
3,2.0,1,2015-02-20,found the game a bit too complicated not what ...,two stars
4,5.0,1,2014-12-25,great game i love it and have played it since ...,love this game
...,...,...,...,...,...
49995,5.0,1,2013-02-06,like all of its predacessors the last installm...,so awesome it hurts
49996,5.0,1,2011-01-31,this game is excellent fun it is a good old 80...,classic gradius at its best
49997,5.0,1,2010-12-03,this is hands down one of the best shooters on...,dont mess with vic viper
49998,5.0,1,2010-09-22,many gradius games dont offer you unlockable t...,gradius v


In [None]:
# Separate the data into features (the two text columns) and target (rating),
# then hold out 30% of the rows for testing with a fixed seed.
y = data['overall']
X = data[['reviewText', 'summary']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

In [None]:
# Create the (unfitted) classifier and text vectorizer.
# NOTE: the name `linear_model` rebinds the `sklearn.linear_model` module
# imported at the top of the notebook.
linear_model = LogisticRegression(max_iter=10000, solver='lbfgs')
tf_idf = TfidfVectorizer(stop_words='english')

In [None]:
# Build the pipeline: TF-IDF on each text column, then the classifier.
# Any column not named below is forwarded as-is (remainder='passthrough').
column_transformer = ColumnTransformer(
    transformers=[
        ('tfidf1', tf_idf, 'reviewText'),
        ('tfidf2', tf_idf, 'summary'),
    ],
    remainder='passthrough',
)
pipe = Pipeline(steps=[
    ('tfidf', column_transformer),
    ('classify', linear_model),
])

In [None]:
# Fit the column transformer and the classifier on the training split.
pipe.fit(X_train,y_train)

Pipeline(steps=[('tfidf',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('tfidf1',
                                                  TfidfVectorizer(stop_words='english'),
                                                  'reviewText'),
                                                 ('tfidf2',
                                                  TfidfVectorizer(stop_words='english'),
                                                  'summary')])),
                ('classify', LogisticRegression(max_iter=10000))])

In [None]:
# Predict a rating for every row of the held-out test split.
y_pred = pipe.predict(X_test)

In [None]:
# Decent F1 score: about 0.65 isn't that bad.
# f1_score expects (y_true, y_pred), so the true labels are passed first.
metrics.f1_score(y_test, y_pred, average='micro')

0.6509333333333334

In [None]:
# Actual model: reload the raw data so temporal values can be taken into account.
fdata = getDF('sample_data/Video_Games_5.json.gz')
# Copy the 50,000-row slice so the column assignments in later cells act on
# an independent frame rather than on a slice of the full dataset.
fdata = fdata.iloc[:50000].copy()

In [None]:
# Parse the "month day, year" strings in reviewTime into datetime values.
fdata["reviewTime"] = pd.to_datetime(fdata["reviewTime"], format="%m %d, %Y")

In [None]:
# Remove the columns that are not used as features.
fdata = fdata.drop(
    columns=[
        'reviewerID', 'asin', 'reviewerName',
        'vote', 'style', 'image',
        'unixReviewTime',
    ]
)

In [None]:
# Clean the data the same way as for the previous model.
for text_col in ('reviewText', 'summary'):
    fdata[text_col] = fdata[text_col].map(clean)
# Encode the verified flag as 1 (truthy) or 0 (falsy).
fdata['verified'] = fdata['verified'].map(lambda flag: int(bool(flag)))

In [None]:
# Day of the week of each review as an integer (pandas: Monday=0 ... Sunday=6).
fdata['weekday'] = fdata['reviewTime'].dt.dayofweek

In [None]:
# Second pipeline: the text features plus the verified flag and weekday.
# Separate the data into features and target, holding out 30% for testing.
y = fdata['overall']
X = fdata[['reviewText', 'summary', 'verified', 'weekday']]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)
# Initialise the classifier and the vectorizer.
linear_model = LogisticRegression(max_iter=10000, solver='lbfgs')
tf_idf = TfidfVectorizer(stop_words='english')
# TF-IDF is applied to the two text columns; 'verified' and 'weekday' reach
# the classifier unchanged through remainder='passthrough'.
column_transformer = ColumnTransformer(
    transformers=[
        ('tfidf1', tf_idf, 'reviewText'),
        ('tfidf2', tf_idf, 'summary'),
    ],
    remainder='passthrough',
)
pipe = Pipeline(steps=[
    ('tfidf', column_transformer),
    ('classify', linear_model),
])

In [None]:
# Fit the column transformer and the classifier on the training split.
pipe.fit(X_train,y_train)

Pipeline(steps=[('tfidf',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('tfidf1',
                                                  TfidfVectorizer(stop_words='english'),
                                                  'reviewText'),
                                                 ('tfidf2',
                                                  TfidfVectorizer(stop_words='english'),
                                                  'summary')])),
                ('classify', LogisticRegression(max_iter=10000))])

In [None]:
# Predict a rating for every row of the held-out test split.
y_pred = pipe.predict(X_test)
# Decent F1 score: about 0.65 isn't that bad.
# f1_score expects (y_true, y_pred), so the true labels are passed first.
metrics.f1_score(y_test, y_pred, average='micro')

0.6519333333333334