# Predictor Matrix Generation {-}
## EE627 {-}
### Connor DePalma & Daniel Pelis - Ground Truth {-}

In [2]:
import numpy as np
import pandas as pd
import pyspark.sql.functions as sf
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import IntegerType, StructType, StructField, StringType
from sklearn.impute import SimpleImputer

In [3]:
# Training ratings use the default comma delimiter; both test files are pipe-delimited.
# Nothing has a header row, so Spark names the columns _c0, _c1, ... unless a schema is given.
training = spark.read.csv("trainItem.data", header=False)
testing_gt = spark.read.option("delimiter", "|").csv("test2_new.txt", header=False)

# Hierarchy rows: user, track, album, artist, then up to 16 genre IDs.
# Every field is read as a nullable string here and cast to integer in a later cell.
hierarchy_fields = ["UserID", "TrackID", "AlbumID", "ArtistID"] + [
    f"GenreID_{genre_no}" for genre_no in range(1, 17)
]
schema = StructType(
    [StructField(field_name, StringType(), True) for field_name in hierarchy_fields]
)
testing_hr = spark.read.option("delimiter", "|").csv(
    "testTrack_hierarchy.txt", header=False, schema=schema
)
training.show(5)

+------+------+---+
|   _c0|   _c1|_c2|
+------+------+---+
|199808|248969| 90|
|199808|  2663| 90|
|199808| 28341| 90|
|199808| 42563| 90|
|199808| 59092| 90|
+------+------+---+
only showing top 5 rows



In [3]:
# Swap Spark's positional column names for meaningful ones.
for old_name, new_name in (("_c0", "userID"), ("_c1", "itemID"), ("_c2", "rating")):
    training = training.withColumnRenamed(old_name, new_name)
training.show(5)

+------+------+------+
|userID|itemID|rating|
+------+------+------+
|199808|248969|    90|
|199808|  2663|    90|
|199808| 28341|    90|
|199808| 42563|    90|
|199808| 59092|    90|
+------+------+------+
only showing top 5 rows



In [4]:
# The CSV reader produced strings: IDs become integers, the rating becomes a float.
training = (
    training
    .withColumn("userID", sf.col("userID").cast(IntegerType()))
    .withColumn("itemID", sf.col("itemID").cast(IntegerType()))
    .withColumn("rating", sf.col("rating").cast('float'))
)
training.show(3)

+------+------+------+
|userID|itemID|rating|
+------+------+------+
|199808|248969|  90.0|
|199808|  2663|  90.0|
|199808| 28341|  90.0|
+------+------+------+
only showing top 3 rows



In [5]:
# Same renaming as for the training frame; here "rating" holds the 0/1 ground-truth label.
for old_name, new_name in (("_c0", "userID"), ("_c1", "itemID"), ("_c2", "rating")):
    testing_gt = testing_gt.withColumnRenamed(old_name, new_name)
testing_gt.show(5)

+------+------+------+
|userID|itemID|rating|
+------+------+------+
|200031| 30877|     1|
|200031|  8244|     1|
|200031|130183|     0|
|200031|198762|     0|
|200031| 34503|     1|
+------+------+------+
only showing top 5 rows



In [6]:
# Cast the ground-truth frame to the same dtypes as the training frame.
testing_gt = (
    testing_gt
    .withColumn("userID", sf.col("userID").cast(IntegerType()))
    .withColumn("itemID", sf.col("itemID").cast(IntegerType()))
    .withColumn("rating", sf.col("rating").cast('float'))
)
testing_gt.show(3)

+------+------+------+
|userID|itemID|rating|
+------+------+------+
|200031| 30877|   1.0|
|200031|  8244|   1.0|
|200031|130183|   0.0|
+------+------+------+
only showing top 3 rows



In [7]:
# Cast every hierarchy column (user, track, album, artist, 16 genres) from string to integer.
# Values that are not valid integers become null.
testing_hr = testing_hr.select(
    [
        sf.col(column_name).cast(IntegerType()).alias(column_name)
        for column_name in testing_hr.columns
    ]
)

In [8]:
# Distinct users that appear in the ground-truth file, in ascending order.
distinct_user_rows = testing_gt.select('userID').distinct().collect()
users = sorted(row.userID for row in distinct_user_rows)

In [9]:
# Peek at the ten smallest user IDs.
users[:10]

[200031,
 200032,
 200055,
 200065,
 200070,
 200074,
 200085,
 200099,
 200106,
 200118]

In [10]:
# Keep only hierarchy rows for users that have ground truth, then bring them into pandas.
is_ground_truth_user = sf.col("UserID").isin(users)
testing_hr_bet = testing_hr.where(is_ground_truth_user).toPandas()

In [11]:
# Collect the whole ground-truth frame into pandas (no filtering: `users` was derived from it).
testing_gt_bet = testing_gt.toPandas()

In [12]:
# Training ratings restricted to the ground-truth users, collected into pandas.
has_ground_truth = sf.col("userID").isin(users)
training_bet = training.where(has_ground_truth).toPandas()

In [13]:
# Inspect the filtered hierarchy frame (one row per user/track pair to score).
testing_hr_bet

Unnamed: 0,UserID,TrackID,AlbumID,ArtistID,GenreID_1,GenreID_2,GenreID_3,GenreID_4,GenreID_5,GenreID_6,GenreID_7,GenreID_8,GenreID_9,GenreID_10,GenreID_11,GenreID_12,GenreID_13,GenreID_14,GenreID_15,GenreID_16
0,200031,30877,192723.0,132319.0,131552.0,176858.0,218185.0,251593.0,266073.0,67098.0,,,,,,,,,,
1,200031,8244,223220.0,233697.0,131552.0,176858.0,218185.0,75691.0,67098.0,,,,,,,,,,,
2,200031,130183,,,139095.0,242383.0,,,,,,,,,,,,,,
3,200031,198762,220103.0,113265.0,131552.0,47898.0,201738.0,88853.0,,,,,,,,,,,,
4,200031,34503,43738.0,173170.0,131552.0,199606.0,51420.0,181006.0,67186.0,67331.0,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,212234,137371,174016.0,178205.0,199606.0,105160.0,289568.0,48717.0,,,,,,,,,,,,
5996,212234,42375,105869.0,24208.0,61215.0,17453.0,274088.0,199606.0,212410.0,88853.0,51420.0,233978.0,67331.0,,,,,,,
5997,212234,277867,213269.0,236546.0,131552.0,173467.0,19913.0,48505.0,133159.0,,,,,,,,,,,
5998,212234,83093,284629.0,236546.0,131552.0,173467.0,146792.0,48505.0,154024.0,133159.0,196528.0,,,,,,,,,


In [14]:
# Inspect the ground-truth labels.
testing_gt_bet

Unnamed: 0,userID,itemID,rating
0,200031,30877,1.0
1,200031,8244,1.0
2,200031,130183,0.0
3,200031,198762,0.0
4,200031,34503,1.0
...,...,...,...
5995,212234,137371,0.0
5996,212234,42375,0.0
5997,212234,277867,1.0
5998,212234,83093,1.0


In [15]:
# Inspect the training ratings kept for the ground-truth users.
training_bet

Unnamed: 0,userID,itemID,rating
0,200031,17678,90.0
1,200031,33269,90.0
2,200031,34486,80.0
3,200031,70401,90.0
4,200031,82317,90.0
...,...,...,...
566257,212234,270460,90.0
566258,212234,213269,90.0
566259,212234,3204,90.0
566260,212234,123568,90.0


In [16]:
# Build the predictor matrix: for each test row, swap every hierarchy ID
# (album, artist, genres) for the rating the same user gave that ID in training.
# IDs the user never rated become NaN.
#
# Each ID is looked up exactly once in a per-user dict. Calling DataFrame.replace()
# once per rated item re-scans the whole frame every time, and a rating that has
# already been written (e.g. 90.0) could be replaced again by a later item whose
# ID happens to equal that number.
predictor = []
for user in users:
    user_train = training_bet.loc[training_bet["userID"] == user]
    user_test = testing_hr_bet.loc[testing_hr_bet["UserID"] == user].drop(
        columns=["UserID", "TrackID"]
    )
    # If a user rated the same item more than once, the first rating wins.
    rating_by_item = (
        user_train.drop_duplicates(subset="itemID", keep="first")
        .set_index("itemID")["rating"]
        .to_dict()
    )
    predictor.append(user_test.apply(lambda id_column: id_column.map(rating_by_item)))
# Rows are stacked in `users` order and renumbered 0..n-1.
predictor = pd.concat(predictor, axis=0).reset_index(drop=True)

In [17]:
# Blank out anything above 100: presumably ratings top out at 100, so larger values
# are hierarchy IDs that were never swapped for a rating.
predictor = predictor.mask(predictor > 100)

In [18]:
# Inspect the rating matrix (album, artist and genre ratings per test row).
predictor

Unnamed: 0,AlbumID,ArtistID,GenreID_1,GenreID_2,GenreID_3,GenreID_4,GenreID_5,GenreID_6,GenreID_7,GenreID_8,GenreID_9,GenreID_10,GenreID_11,GenreID_12,GenreID_13,GenreID_14,GenreID_15,GenreID_16
0,90.0,50.0,90.0,80.0,,,,,,,,,,,,,,
1,90.0,,90.0,80.0,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,
3,,,90.0,,,,,,,,,,,,,,,
4,90.0,50.0,90.0,80.0,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,,,,,,,,,,,,,,,,,,
5996,,,,,,,,,,,,,,,,,,
5997,90.0,90.0,,,,,,,,,,,,,,,,
5998,90.0,90.0,,,,,,,,,,,,,,,,


In [19]:
# Row-wise summary of the genre ratings only: the first two predictor columns
# (album, artist) are skipped. NaN entries are ignored by every statistic, so rows
# with no rated genre yield count 0 and NaN elsewhere.
genre_ratings = predictor.iloc[:, 2:]
numGenre = genre_ratings.count(axis=1)
avgGenre = genre_ratings.mean(axis=1)
maxGenre = genre_ratings.max(axis=1)
minGenre = genre_ratings.min(axis=1)
varGenre = genre_ratings.var(axis=1)
medGenre = genre_ratings.median(axis=1)

  overwrite_input=overwrite_input)


In [20]:
# Gather the per-row genre statistics into one frame; column order is the dict order.
genreStat = pd.DataFrame({
    'num_genre': numGenre,
    'max': maxGenre,
    'min': minGenre,
    'mean': avgGenre,
    'variance': varGenre,
    'median': medGenre,
})

In [21]:
# Show the statistics of the row with the most rated genres.
genreStat.loc[genreStat['num_genre'].idxmax()]

num_genre     12.000000
max           70.000000
min            0.000000
mean          24.166667
variance     590.151515
median        30.000000
Name: 840, dtype: float64

In [22]:
# Spot-check a single row by position (841 is a hard-coded example index).
genreStat.iloc[841]

num_genre      2.0
max           30.0
min            0.0
mean          15.0
variance     450.0
median        15.0
Name: 841, dtype: float64

In [23]:
# Raw ratings behind the same hard-coded row, to compare against its statistics.
predictor.iloc[841,:]

AlbumID        NaN
ArtistID       0.0
GenreID_1      0.0
GenreID_2     30.0
GenreID_3      NaN
GenreID_4      NaN
GenreID_5      NaN
GenreID_6      NaN
GenreID_7      NaN
GenreID_8      NaN
GenreID_9      NaN
GenreID_10     NaN
GenreID_11     NaN
GenreID_12     NaN
GenreID_13     NaN
GenreID_14     NaN
GenreID_15     NaN
GenreID_16     NaN
Name: 841, dtype: float64

In [24]:
# Inspect the per-row genre statistics.
genreStat

Unnamed: 0,num_genre,max,min,mean,variance,median
0,2,90.0,80.0,85.0,50.0,85.0
1,2,90.0,80.0,85.0,50.0,85.0
2,0,,,,,
3,1,90.0,90.0,90.0,,90.0
4,2,90.0,80.0,85.0,50.0,85.0
...,...,...,...,...,...,...
5995,0,,,,,
5996,0,,,,,
5997,0,,,,,
5998,0,,,,,


In [25]:
# Assemble the final table: identifiers, ground-truth label, album/artist ratings, genre stats.
# NOTE(review): the columns are joined by row index, not by (user, track) key. This assumes
# testing_hr_bet, testing_gt_bet and predictor list the same pairs in the same order — confirm.
pred = pd.DataFrame({'userID': testing_hr_bet.UserID}).assign(
    trackID=testing_hr_bet.TrackID,
    recommendation=testing_gt_bet.loc[:, "rating"],
    album=predictor.AlbumID,
    artist=predictor.ArtistID,
)
pred = pd.concat([pred, genreStat], axis=1)

In [40]:
# Replace every remaining NaN in the table with 0.
# NOTE(review): this overwrites `pred`, so the SimpleImputer cells further down see no
# missing values when the notebook runs top to bottom — confirm which imputation is intended.

pred = pred.fillna(0)

In [41]:
# Write the table to the working directory, without the pandas index column.
pred.to_csv("./predictor_response_improved.csv",index=False)

In [29]:
# Mean imputer: each NaN is replaced by the mean of its column.
# (SimpleImputer and numpy are imported in the import cell at the top of the notebook.)
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

In [30]:
# Learn the per-column means from `pred`.
# NOTE(review): if the cell that runs `pred = pred.fillna(0)` has already executed, `pred`
# contains no NaN and this imputer will not change anything.
imp = imp.fit(pred)

In [37]:
# Show the imputed table with its original row index and column names instead of bare
# integer labels. With the default settings SimpleImputer drops columns whose fitted
# statistic is NaN (all values missing), so only the surviving names are used.
imputed_values = imp.transform(pred)
kept_columns = pred.columns[pd.notna(imp.statistics_)]
pd.DataFrame(imputed_values, index=pred.index, columns=kept_columns)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,200031.0,30877.0,1.0,90.000000,50.00000,2.0,90.000000,80.000000,85.000000,50.000000,85.000000
1,200031.0,8244.0,1.0,90.000000,80.63218,2.0,90.000000,80.000000,85.000000,50.000000,85.000000
2,200031.0,130183.0,0.0,82.176093,80.63218,0.0,72.836471,66.890516,69.845595,449.384434,69.858089
3,200031.0,198762.0,0.0,82.176093,80.63218,1.0,90.000000,90.000000,90.000000,449.384434,90.000000
4,200031.0,34503.0,1.0,90.000000,50.00000,2.0,90.000000,80.000000,85.000000,50.000000,85.000000
...,...,...,...,...,...,...,...,...,...,...,...
5995,212234.0,137371.0,0.0,82.176093,80.63218,0.0,72.836471,66.890516,69.845595,449.384434,69.858089
5996,212234.0,42375.0,0.0,82.176093,80.63218,0.0,72.836471,66.890516,69.845595,449.384434,69.858089
5997,212234.0,277867.0,1.0,90.000000,90.00000,0.0,72.836471,66.890516,69.845595,449.384434,69.858089
5998,212234.0,83093.0,1.0,90.000000,90.00000,0.0,72.836471,66.890516,69.845595,449.384434,69.858089
