# Imports

In [109]:
import requests
from bs4 import BeautifulSoup as bs
import re
import json
import numpy as np
import pandas as pd
import zipfile
import sys
import requests
import nltk
from nltk.sentiment import SentimentAnalyzer
# Generic NLTK sentiment analyzer, instantiated while the imports run.
# NOTE(review): `sa` is not referenced in any other cell visible here — confirm it is still needed.
sa = SentimentAnalyzer()
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
# VADER sentiment scorer, instantiated while the imports run (needs the NLTK
# `vader_lexicon` resource to be available locally).
# NOTE(review): `sid` is not referenced in any other cell visible here.
sid = SentimentIntensityAnalyzer()
from selenium.webdriver import Firefox
import random
import time
import warnings
import pymongo
import pickle
from functools import reduce
from selenium.common.exceptions import TimeoutException
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.tuning import CrossValidator
from pyspark.sql.types import (
    IntegerType, StringType, IntegerType, FloatType, 
    StructField, StructType, DoubleType)
from gc_functions import (get_keys, launch_spotipy, scrape_explore_page, build_most_popular, 
                          build_highest_rated, combine_and_remove_duplicates, print_sentiments, 
                          extract_user_comments, get_all_comments, get_tab_idxs, new_user_predict, 
                          check_db_size, sleep, make_url_list, get_data, get_comments, load_commentlist, 
                          store_commentlist, scrape_comments, download_mongodb, assign_id_numbers)

# Initialize MongoDB and Spotipy

In [2]:
# get_keys() supplies three values, unpacked here as the MongoDB connection string
# and the client id / secret handed to launch_spotipy (presumably Spotify API
# credentials — confirm in gc_functions).
mongostring, myclientid, myclientsecret = get_keys()
# Database client; passed to check_db_size / download_mongodb in later cells.
mc = pymongo.MongoClient(mongostring)
# Spotipy handle; not used again in the cells visible here.
sp = launch_spotipy(myclientid, myclientsecret)

# Build tables for most popular and highest rated tabs

In [3]:
# Load the "highest rated" tab table from its '~'-separated CSV cache.
# The commented lines rebuild the table with build_highest_rated() and refresh the cache:
#ratedtable = build_highest_rated()
#ratedtable.shape
#ratedtable.to_csv('ratedtable.csv', sep='~')

# Columns to keep; the unnamed index column that to_csv writes is not among them.
rated_columns = [
    'artist_name', 'artist_url', 'date', 'id', 'part', 'preset_id',
    'rating', 'recording', 'song_name', 'status', 'tab_access_type',
    'tab_url', 'tonality_name', 'tp_version', 'type', 'type_name',
    'verified', 'version', 'version_description', 'votes',
]
ratedtable = pd.read_csv('ratedtable.csv', sep='~', usecols=rated_columns)

In [4]:
# Load the "most popular" tab table from its '~'-separated CSV cache.
# The commented lines rebuild the table with build_most_popular() and refresh the cache:
#poptable = build_most_popular()
#poptable.shape
#poptable.to_csv('poptable.csv', sep='~')

# Columns to keep; the unnamed index column that to_csv writes is not among them.
pop_columns = [
    'artist_name', 'artist_url', 'date', 'id', 'part', 'preset_id',
    'rating', 'recording', 'song_name', 'status', 'tab_access_type',
    'tab_url', 'tonality_name', 'tp_version', 'type', 'type_name',
    'verified', 'version', 'version_description', 'votes',
]
poptable = pd.read_csv('poptable.csv', sep='~', usecols=pop_columns)

# Combine tables and extract urllist

In [5]:
# Merge the two tab tables; per its name the helper also removes duplicate rows
# (implementation lives in gc_functions and is not shown here).
combinedtable = combine_and_remove_duplicates(poptable, ratedtable)
# Display (rows, columns) of the merged table.
combinedtable.shape

(1429, 20)

In [6]:
# Derive the list of URLs to scrape from the combined table
# (presumably from its 'tab_url' column — confirm in gc_functions.make_url_list).
urllist = make_url_list(combinedtable)

# Autoscraping

In [7]:
#get_data(urllist, mc)

# Check Database Size

In [8]:
# Ask the helper for the current database size; what exactly is counted is
# defined in gc_functions.check_db_size (not shown here).
check_db_size(mc)

540

# Download Database

In [9]:
# Pull the database contents into memory through the helper; `mydb` supports len()
# and is handed to extract_user_comments in the next step.
mydb = download_mongodb(mc)
len(mydb)

541

# Comment Extraction, Ratings List

In [10]:
# Build the raw rating records from the downloaded data. The records are loaded
# into a DataFrame with columns ('user', 'tab', 'rating') in the next cell.
ratingslist = extract_user_comments(mydb)
len(ratingslist)

30724

# Ratings Dataframe

In [11]:
# One row per record, with the three fields named user / tab / rating.
ratingsdf = pd.DataFrame(ratingslist, columns = ['user', 'tab', 'rating'])
ratingsdf.shape

(30724, 3)

In [12]:
# Cache the ratings on disk using '~' as the separator; the DataFrame index is
# written as an extra, unnamed first column.
ratingsdf.to_csv('ratingsdf.csv', sep='~')

In [16]:
# Reload the cached ratings, keeping only the three data columns (this skips the
# unnamed index column that to_csv added).
ratingsdf = pd.read_csv('ratingsdf.csv', sep='~', usecols = ['user', 'tab', 'rating'])
ratingsdf.shape

(30724, 3)

# Assign Numeric ID Numbers

In [17]:
# Map users and tabs to numeric ids through the gc_functions helper
# (the recorded output shows ids such as 200001 and 313000).
idratingsdf = assign_id_numbers(ratingsdf)
idratingsdf.head()

Unnamed: 0,user,tab,rating
0,200001,313000,0.0
1,200002,313000,0.7096
2,200003,313000,0.7096
3,200004,313000,0.3296
4,200005,313000,0.3296


In [52]:
# Write the id-based ratings without a header row. The DataFrame index is still
# written as the first column, which is why the Spark schema used to read this
# file starts with an 'index' field.
idratingsdf.to_csv('idratingsdf.csv', sep='~', header = False)
idratingsdf.shape

(30724, 3)

In [53]:
# Inspect dtypes and null counts (recorded output: 'user' and 'tab' are object
# dtype, 'rating' is float64, no nulls).
idratingsdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30724 entries, 0 to 30723
Data columns (total 3 columns):
user      30724 non-null object
tab       30724 non-null object
rating    30724 non-null float64
dtypes: float64(1), object(2)
memory usage: 720.2+ KB


# ALS Model

In [68]:
# Schema for the header-less 'idratingsdf.csv': the leading field is the pandas
# index column written by to_csv, followed by user, tab and rating.
# Every field is declared with nullable=False.
ratings_schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in [
        ('index', IntegerType()),
        ('user', IntegerType()),
        ('tab', IntegerType()),
        ('rating', FloatType()),
    ]
])

In [76]:
# `spark` is not created in any cell visible in this notebook, so the read below
# only works on kernels that pre-define it. getOrCreate() reuses an existing
# session when there is one and starts a new one otherwise, which keeps the
# cell runnable after "Restart Kernel -> Run All".
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

# Read the '~'-separated, header-less ratings file with the explicit schema and
# mark the result for caching, since several later cells derive from it.
ratings_data = spark.read.csv('idratingsdf.csv', sep='~', schema=ratings_schema).persist()

In [88]:
# Keep every row that has a value in at least one column, i.e. discard only the
# rows in which all columns are null (rows with some nulls are retained).
has_any_value = reduce(
    lambda acc, col_present: acc | col_present,
    [ratings_data[name].isNotNull() for name in ratings_data.columns],
)
ratings_data2 = ratings_data.filter(has_any_value)
ratings_data2.count()

In [92]:
# Drop the CSV index column, pull the remaining rows to the driver and rebuild a
# DataFrame from them (its schema is then inferred from the collected Row objects).
# NOTE(review): the collect()/createDataFrame round trip looks avoidable, but
# removing it changes the column types and partitioning — confirm before changing.
ratings_df = spark.createDataFrame(ratings_data2.drop('index').collect())

In [94]:
# Roughly 80/20 random split into training and held-out rows, with a fixed seed.
train, test = ratings_df.randomSplit([0.8, 0.2], seed=55)

In [95]:
# Configure the ALS estimator. Despite its name, `als_model` is the unfitted
# estimator; the fitted model is produced by .fit() in the next cell.
als_model = ALS(
    # column roles in the ratings DataFrame
    userCol='user',
    itemCol='tab',
    ratingCol='rating',
    # hyper-parameters: number of latent factors and regularization strength
    rank=10,
    regParam=0.05,
    # rows whose prediction would be NaN are dropped from transform() output
    coldStartStrategy='drop',
)

In [96]:
# Fit ALS on the training split; `tab_rec` is the fitted model that later cells
# save to disk, use for predictions and read userFactors from.
tab_rec = als_model.fit(train)

In [102]:
# Persist the fitted model. A plain save() raises when the target path already
# exists, which makes this cell fail on every re-run; going through the writer
# with overwrite() replaces the previous copy instead.
tab_rec.write().overwrite().save("tab_rec_model")

In [104]:
# List the contents of the saved model directory (IPython shell escape; needs `ls`).
!ls tab_rec_model/

itemFactors  metadata  userFactors


In [110]:
# Reload the saved model from disk into a separate variable
# (presumably to check the save/load round trip).
tab_rec_test = ALSModel.load("tab_rec_model")

In [112]:
# Score the held-out split with the reloaded model; the result carries an added
# 'prediction' column next to user / tab / rating. Show a single row as a spot check.
test_preds_test = tab_rec_test.transform(test)
test_preds_test.show(1)

+------+------+-------------------+-----------+
|  user|   tab|             rating| prediction|
+------+------+-------------------+-----------+
|200010|313003|-0.5257999897003174|-0.08778465|
+------+------+-------------------+-----------+
only showing top 1 row



In [97]:
# Predictions of the in-memory fitted model on the training split.
# NOTE(review): this cell's execution count (97) is lower than that of the cells
# above it, so the notebook was not last run top to bottom in its current order.
train_preds = tab_rec.transform(train)
#train_preds.show()

In [98]:
# Predictions of the in-memory fitted model on the held-out split.
test_preds = tab_rec.transform(test)
#test_preds.show()

# Cold Start

In [None]:
#user_factors = tab_rec.userFactors.collect()
#user_factors_df = tab_rec.userFactors.toPandas()
#user_factors_arr = np.array(user_factors_df['features'].tolist())
#user_factors_arr.shape

In [None]:
#item_factors = tab_rec.itemFactors.collect()
#item_factors_df = tab_rec.itemFactors.toPandas()
#item_factors_arr = np.array(item_factors_df['features'].tolist())
#item_factors_arr.shape

# Insert Code to generate initial ratings via survey

In [None]:
#user_ratings = """train[train['user']==201598]"""   <==== These ratings will be generated via survey

In [None]:
#tab_idxs = get_tab_idxs(user_ratings_df, item_factors_df)

In [None]:
#item_factors_arr[tab_idxs]
#ratings = user_ratings_df['rating']
#ratings.values.reshape(-1,1)

In [None]:
#X, residuals, rank, s = np.linalg.lstsq(item_factors_arr[tab_idxs], ratings.values, rcond=None)

In [None]:
#newuser_factors = X
#new_user_df = new_user_predict(newuser_factors, item_factors_arr)
#new_user_df

# Pickling

In [101]:
import pickle

In [113]:
# Copy the fitted model's user factors from Spark into a pandas DataFrame
# (columns 'id' and 'features' in the recorded output).
user_factors_df = tab_rec.userFactors.toPandas()


In [114]:
# Serialize the user-factor table to disk with pandas' pickle writer.
user_factors_df.to_pickle('user_factors.pkl')

In [115]:
# Read the pickle back into a separate variable to check the round trip.
# Only unpickle files you trust: loading a pickle can execute arbitrary code.
user_factors_df_test = pd.read_pickle('user_factors.pkl')

In [116]:
# Preview only the first rows instead of rendering the whole frame; the table
# has one row per user, so a bare display bloats the saved notebook output.
user_factors_df_test.head(10)

Unnamed: 0,id,features
0,200010,"[-0.17132942378520966, -0.2985149621963501, 0...."
1,200020,"[0.3393862843513489, 0.18049797415733337, -0.0..."
2,200030,"[0.08033595234155655, 0.2997550666332245, 0.36..."
3,200040,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,200050,"[0.09635399281978607, 0.011810912750661373, 0...."
5,200060,"[-0.3788469731807709, 0.13050466775894165, 0.2..."
6,200070,"[0.06744886934757233, 0.008267770521342754, 0...."
7,200080,"[-0.15907622873783112, -0.03999718278646469, 0..."
8,200090,"[-0.6588613390922546, 0.3464178144931793, 0.27..."
9,200110,"[0.024425795301795006, 0.0029940735548734665, ..."
