In [2]:
import numpy as np
import random
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score

In [20]:
class Category:
    """Namespace of plain-string constants naming the product categories.

    Each constant's value is identical to its attribute name. The notebook
    attaches one of these to every review when loading the per-category
    data files.
    """
    ELECTRONICS = "ELECTRONICS"
    BOOKS = "BOOKS"
    CLOTHING = "CLOTHING"
    GROCERY = "GROCERY"
    PATIO = "PATIO"
    
class Sentiment:
    """Namespace of plain-string constants for the sentiment derived from a review score."""
    POSITIVE = "POSITIVE"
    NEGATIVE = "NEGATIVE"
    NEUTRAL = "NEUTRAL"
    
class Review:
    """One product review plus the sentiment label derived from its score.

    Attributes:
        category: label supplied by the caller.
        text: the review body.
        score: the numeric rating supplied by the caller.
        sentiment: Sentiment constant computed from ``score`` at construction.
    """

    def __init__(self, category, text, score):
        self.category, self.text, self.score = category, text, score
        # Computed eagerly so the label is available as a plain attribute.
        self.sentiment = self.get_sentiment()

    def get_sentiment(self):
        """Translate ``self.score`` into a Sentiment constant.

        Returns:
            Sentiment.NEGATIVE for scores of 2 or lower, Sentiment.NEUTRAL
            for a score equal to 3, and Sentiment.POSITIVE for anything else.
        """
        rating = self.score
        if rating <= 2:
            return Sentiment.NEGATIVE
        if rating == 3:
            return Sentiment.NEUTRAL
        return Sentiment.POSITIVE

class ReviewContainer:
    """Holds a list of review objects and exposes model-ready views of them.

    Each review is expected to provide ``text``, ``sentiment`` and
    ``category`` attributes, since those are what the accessors read.
    """

    def __init__(self, reviews):
        self.reviews = reviews

    def get_text(self):
        """Return the ``text`` of every review, in container order."""
        return [x.text for x in self.reviews]

    def get_x(self, vectorizer):
        """Return ``vectorizer.transform`` applied to the review texts.

        Only ``transform`` is called here, so the vectorizer is expected to
        have been fitted by the caller beforehand.
        """
        return vectorizer.transform(self.get_text())

    def get_y(self):
        """Return the ``sentiment`` of every review, in container order."""
        return [x.sentiment for x in self.reviews]

    def get_category(self):
        """Return the ``category`` of every review, in container order."""
        return [x.category for x in self.reviews]

    def evenly_distribute(self):
        """Rebalance to equal numbers of NEGATIVE and POSITIVE reviews, in place.

        Reviews with any other sentiment (e.g. NEUTRAL) are dropped. The
        larger of the two classes is truncated to the size of the smaller
        one, whichever that is, and the kept reviews are shuffled with the
        module-level ``random`` generator. If either class is absent the
        container ends up empty.
        """
        negative = [r for r in self.reviews if r.sentiment == Sentiment.NEGATIVE]
        positive = [r for r in self.reviews if r.sentiment == Sentiment.POSITIVE]
        # Truncate BOTH sides to the smaller class; trimming only the
        # positives leaves the data skewed when negatives are the majority.
        keep = min(len(negative), len(positive))
        self.reviews = negative[:keep] + positive[:keep]
        random.shuffle(self.reviews)


import json
import random

# One data file per product category; each file holds one JSON object per line.
file_names = [
    './data/category/Electronics_small.json',
    './data/category/Books_small.json',
    './data/category/Clothing_small.json',
    './data/category/Grocery_small.json',
    './data/category/Patio_small.json',
]
# Category label for the file at the same position in file_names.
file_categories = [
    Category.ELECTRONICS,
    Category.BOOKS,
    Category.CLOTHING,
    Category.GROCERY,
    Category.PATIO,
]
assert len(file_names) == len(file_categories), "every data file needs a category label"

reviews = []
# Iterate over (file, label) pairs directly instead of indexing two parallel lists.
for file_name, category in zip(file_names, file_categories):
    # JSON text is UTF-8; being explicit avoids decoding with the platform default.
    with open(file_name, encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # A blank line (e.g. a trailing newline) would make json.loads raise.
                continue
            review_json = json.loads(line)
            review = Review(category, review_json['reviewText'], review_json['overall'])
            reviews.append(review)


from sklearn.model_selection import train_test_split

# Hold out 30% of the reviews for evaluation; random_state pins the split.
train, test = train_test_split(reviews, test_size=0.30, random_state=42)

# Optional class balancing (disabled): call evenly_distribute() on either container.
train_container = ReviewContainer(train)
test_container = ReviewContainer(test)

# The vocabulary and IDF weights are learned from the training text only.
# Alternative (disabled): CountVectorizer(binary=True) fitted on the same corpus.
corpus = train_container.get_text()
vectorizer = TfidfVectorizer().fit(corpus)

# Features come from the fitted vectorizer; the targets are the product
# categories (not the sentiments), so this trains a category classifier.
train_x = train_container.get_x(vectorizer)
test_x = test_container.get_x(vectorizer)
train_y = train_container.get_category()
test_y = test_container.get_category()

In [54]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier over the TF-IDF features, using 20 neighbours.
neigh = KNeighborsClassifier(n_neighbors=20)
neigh.fit(train_x, train_y)

# Predict once and derive both metrics from the same predictions:
# neigh.score() would run a second full prediction pass over the test set.
y_pred = neigh.predict(test_x)

# Accuracy = fraction of test reviews whose predicted category matches the label.
print(np.mean(y_pred == np.asarray(test_y)))
# average=None reports one F1 score per category instead of a single aggregate.
print(f1_score(test_y, y_pred, average=None))

0.8826666666666667
[0.93311582 0.85761047 0.875      0.8705036  0.87581699]


In [56]:
import os
import pickle

# Create the output directory if needed so a fresh checkout does not fail
# with FileNotFoundError on the first open() below.
os.makedirs('./models', exist_ok=True)

# Persist the fitted classifier and the vectorizer it was trained with;
# both are needed together to classify new text later.
with open('./models/category_classifier.pkl', 'wb') as f:
    pickle.dump(neigh, f)

with open('./models/category_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [44]:
import pickle 

# Restore the pickled classifier and vectorizer from ./models.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only
# load files that you produced yourself.
with open('./models/category_classifier.pkl', 'rb') as classifier_file:
    gnbb = pickle.load(classifier_file)

with open('./models/category_vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

In [49]:
# Read one free-text review from the user.
user = input("your input: ")

# transform() takes a collection of documents, so wrap the single string in a list.
test_set = [user]
new_test = vectorizer.transform(test_set)

# Convert the transformed row to a dense array before prediction; the cell's
# displayed value is the array of predicted category labels.
dense_features = new_test.toarray()
gnbb.predict(dense_features)

array(['CLOTHING'], dtype='<U11')