Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
144 lines (123 sloc) 4.42 KB
DECLARE @INPUTDIR string = "/movies/ml-1m/";
DECLARE @RATINGSInp string = @INPUTDIR + "ratings.dat";
DECLARE @USERSInp string = @INPUTDIR + "users.dat";
// Ratings in the format MovieID::UserID::Rating::Timestamp
@RatingsRaw =
EXTRACT RatingLine string
FROM @RATINGSInp
USING Extractors.Tsv( rowDelimiter: "\n");
// Split Ratings into Array using String.Split
// Notice that a >1 char seperator has to be passed in an array
@RatingsSplit =
SELECT RatingLine,new SQL.ARRAY<string>(
RatingLine.Split(new string[]{"::"},StringSplitOptions.None)) AS Ratings
FROM @RatingsRaw;
// Type casting and column rename
@Ratings =
SELECT int.Parse(Ratings[0]) AS MovieID,
int.Parse(Ratings[1]) AS UserID,
int.Parse(Ratings[2]) AS Rating,
Ratings[3] AS Timestamp
FROM @RatingsSplit;
// Average Movie Rating
@AvgRating =
SELECT MovieID,
AVG(Rating) AS AvgRating
FROM @Ratings
GROUP BY MovieID;
// Users into format UserID::Gender::Age::Occupation::Zip
@UsersRaw =
EXTRACT UserLine string
FROM @USERSInp
USING Extractors.Tsv( rowDelimiter: "\n");
// Split into array
@UsersSplit =
SELECT UserLine,new SQL.ARRAY<string>(
UserLine.Split(new string[]{"::"},StringSplitOptions.None)) AS UserArr
FROM @UsersRaw;
// Type casting and column renaming
@Users =
SELECT int.Parse(UserArr[0]) AS UserID,
UserArr[1] AS Gender,
int.Parse(UserArr[2]) AS Age,
int.Parse(UserArr[3]) AS Occupation,
UserArr[4] AS Zip
FROM @UsersSplit;
// Join Users to Users on Age Range and Occupation
@CrossJoin =
SELECT a.UserID AS aUserID,
b.UserID AS bUserID
FROM @Users AS a
JOIN
@Users AS b
ON a.Age == b.Age
AND a.Occupation == b.Occupation
WHERE a.UserID != b.UserID; // Inequality cannot be part of the join but ok as a filter
// Add movies that User A has seen but User B hasnt seen
@CrossJoinWithMovies =
SELECT aUserID,
bUserID,
Ratings.MovieID AS UnseenMovieID
FROM @CrossJoin AS cj
JOIN
@Ratings AS Ratings
ON cj.bUserID == Ratings.UserID
LEFT JOIN
@Ratings AS aUserRatings
ON cj.aUserID == aUserRatings.UserID
AND aUserRatings.MovieID == Ratings.MovieID
WHERE aUserRatings.UserID == NULL;
// Cosine Calc - Step 1
@CrossJoinRatings1 =
SELECT aUserID,
bUserID,
r1.MovieID,
r1.Rating AS aRating,
r2.Rating AS bRating,
Math.Pow(r1.Rating, 2) AS aRatingSQ,
Math.Pow(r2.Rating, 2) AS bRatingSQ
FROM @CrossJoin AS cj
JOIN
@Ratings AS r1
ON cj.aUserID == r1.UserID
JOIN
@Ratings AS r2
ON cj.bUserID == r2.UserID
AND r1.MovieID == r2.MovieID;
// Cosine Calc - Step 2
@cj1 =
SELECT aUserID,
bUserID,
SUM(aRating * bRating) AS r1,
SUM(aRatingSQ) AS SumASquared,
SUM(bRatingSQ) AS SumBSquared
FROM @CrossJoinRatings1
GROUP BY aUserID,
bUserID;
// Cosine Calc - Step 3 - Similarity Ranges from 0.000 ( Not similar) to 1.000 ( Exact match)
@Similarity =
SELECT aUserID,
bUserID,
(double) r1 / (Math.Sqrt((double) SumASquared) * Math.Sqrt((double) SumBSquared)) AS Similarity
FROM @cj1 AS cj;
// Final Recommendations
// Max String is in the Format of Similarity Match,Average Movie Rating,Movie Recommended
@Rec =
SELECT Similarity.aUserID,
MAX(Similarity.Similarity.ToString("0.0000") + "," + ar.AvgRating.ToString() + "," + ar.MovieID.ToString()) AS MaxString
FROM @Similarity AS Similarity
JOIN
@Ratings AS r
ON r.UserID == Similarity.bUserID
JOIN
@AvgRating AS ar
ON ar.MovieID == r.MovieID
JOIN
@CrossJoinWithMovies AS crjMovies
ON crjMovies.aUserID == Similarity.aUserID AND crjMovies.bUserID == Similarity.bUserID AND ar.MovieID == crjMovies.UnseenMovieID
WHERE Similarity < 1 AND Similarity >= .95 // We do not want exact matches and assumption is everyone is vagually similar to someone
GROUP BY Similarity.aUserID;
// Output final file
OUTPUT @Rec
TO "Recommend.csv"
USING Outputters.Csv();