Define mining classes and functions

In [1]:
import numpy
import pandas as pd
import datetime
import time
import random

import sys
import csv

maxInt = sys.maxsize
decrement = True

while decrement:
    decrement = False
    try:
        csv.field_size_limit(maxInt)
    except OverflowError:
        maxInt = int(maxInt/10)
        decrement = True

In [2]:
features = ['user_average_stars', 'user_review_count', 'user_useful', 'business_price_range', 'business_review_count', 'business_stars']
print_mod = 1000
# over 150000 rows in train_reviews
train_data_size = 15000

In [3]:
# retrieve the current date
current_date = datetime.datetime.now()

In [4]:
class User():
    def __init__(self, average_stars, review_count, useful):
        self.average_stars = average_stars
        self.review_count = review_count
        self.useful = useful

class Business():
    def __init__(self, price_range, review_count, stars):
        self.price_range = price_range
        self.review_count = review_count
        self.stars = stars

In [5]:
def get_users():
    with open('users.csv', newline='') as csv_file:
        csv_reader = csv.reader(csv_file)
        next(csv_reader)
        users = {}
        for row in csv_reader:
            average_stars = row[0]
            review_count = row[18]
            useful = row[19]
            user_id = row[20]
            if not user_id in users:
                users[user_id] = User(average_stars, review_count, useful)
        print('collected users')
        return users

In [6]:
def get_businesses():
    with open('business.csv', newline='') as csv_file:
        csv_reader = csv.reader(csv_file)
        next(csv_reader)
        businesses = {}
        for row in csv_reader:
            if row[34]:
                price_range = row[34]
            else:
                price_range = random.randint(1,4)
            business_id = row[41]
            review_count = row[58]
            stars = row[59]
            if not business_id in businesses:
                businesses[business_id] = Business(price_range, review_count, stars)
        print('collected businesses')
        return businesses

In [7]:
def get_train_data(users, businesses):
    # open train_reviews.csv
    train_reviews_df = pd.read_csv("train_reviews.csv", header=0)

    # delete columns: review_id, text, cool, date, funny, useful
    del train_reviews_df['review_id'], train_reviews_df['text'], train_reviews_df['cool'], train_reviews_df['date'], train_reviews_df['funny'], train_reviews_df['useful']

    # truncate rows
    train_reviews_df = train_reviews_df.truncate(after=train_data_size)

    # create empty x_train with named rows
    x_train = pd.DataFrame(columns=features, dtype=float)

    # convert user_id and business_id using maps
    train_index = 0
    start = time.time()
    for index, row in train_reviews_df.iterrows():
        if not train_index % print_mod and train_index:
            print('total rows:', train_index, 'last iteration:', '{0:.2f}'.format(time.time() - start))
            start = time.time()

        x_train.loc[index] = [users[row['user_id']].average_stars, users[row['user_id']].review_count, users[row['user_id']].useful, businesses[row['business_id']].price_range, businesses[row['business_id']].review_count, businesses[row['business_id']].stars]
        train_index += 1

    # split train_reviews_df into features and training
    y_train = train_reviews_df.stars

    print('collected train data')
    
    return x_train, y_train

Create objects and collect data.

In [8]:
users = get_users()

collected users


In [9]:
businesses = get_businesses()

collected businesses


In [10]:
x_train, y_train = get_train_data(users, businesses)

total rows: 1000 last iteration: 1.91
total rows: 2000 last iteration: 2.40
total rows: 3000 last iteration: 3.34
total rows: 4000 last iteration: 2.61
total rows: 5000 last iteration: 2.07
total rows: 6000 last iteration: 2.12
total rows: 7000 last iteration: 2.26
total rows: 8000 last iteration: 2.33
total rows: 9000 last iteration: 2.47
total rows: 10000 last iteration: 2.58
total rows: 11000 last iteration: 2.75
total rows: 12000 last iteration: 3.64
total rows: 13000 last iteration: 4.40
total rows: 14000 last iteration: 4.90
total rows: 15000 last iteration: 4.72
collected train data


Create the net funtion.

In [11]:
from sklearn.neural_network import MLPClassifier

In [12]:
def get_net(x_train, y_train):
    # instantiate the model
    net = MLPClassifier(hidden_layer_sizes=(6,2), learning_rate_init=1e-3)

    # fit the model with data
    net.fit(x_train, y_train)

    print('created net')

    return net

Collect validation data.

In [14]:
validate_queries_size = 1000

In [15]:
def run_validate_queries(model):
    # open validate_queries
    validate_queries_df = pd.read_csv("validate_queries.csv", header=0)

    # truncate rows
    validate_queries_df = validate_queries_df.truncate(after=validate_queries_size)

    # delete columns: unnamed
    del validate_queries_df['Unnamed: 0']

    # create empty x_test with named rows
    x_test = pd.DataFrame(columns=features, dtype=float)

    train_index = 0
    start = time.time()
    for index, row in validate_queries_df.iterrows():
        if not train_index % print_mod and train_index:
            print('total rows:', train_index, 'last iteration:', '{0:.2f}'.format(time.time() - start))
            start = time.time()

        x_test.loc[index] = [users[row['user_id']].average_stars, users[row['user_id']].review_count, users[row['user_id']].useful, businesses[row['business_id']].price_range, businesses[row['business_id']].review_count, businesses[row['business_id']].stars]
        train_index += 1

    # extract stars from validate_queries
    y_test = validate_queries_df.stars
    
    return x_test, y_test
    

In [16]:
x_test, y_test = run_validate_queries(net)

total rows: 1000 last iteration: 1.97


Create and score a net.

In [17]:
net = get_net(x_train, y_train)
print('mean accuracy score: {:.2%}'.format(net.score(x_test, y_test)))

created net
mean accuracy score: 27.87%


Run net on validation data.

In [18]:
# predict on testing set
y_pred = net.predict(x_test)

square_error = 0
for index, row in y_test.iteritems():
    square_error += numpy.square(row - y_pred[index])

rmse = numpy.sqrt(square_error / len(y_pred))
print('ran on validate queries, rmse:', rmse)

ran on validate queries, rmse: 1.2666473875533018
