In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import datetime as dt
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.cluster import KMeans
from sklearn.linear_model import SGDRegressor
import threading

In [None]:
# Load the ratings table and turn the epoch-second `timestamp` column into datetimes.
ratings_df = pd.read_csv("data/movie_lense/ratings.csv")
ratings_df['timestamp'] = pd.to_datetime(ratings_df['timestamp'], unit = 's')

# Load the movie table and split a "(YYYY)" group out of `title` into a numeric `release` column.
movies_df = pd.read_csv("data/movie_lense/movies.csv")
pattern = r'(\([0-9][0-9][0-9][0-9]\))'
a = movies_df['title'].str.contains(pattern)
# expand=False yields a Series (one capture group), which is what a single column needs.
movies_df['release'] = movies_df['title'].str.extract(pattern, expand=False)
# `regex` is passed explicitly: the pandas default changed between major versions, and the
# parentheses must be treated literally while `pattern` must be treated as a regular expression.
movies_df['release'] = movies_df['release'].str.replace('(', "", regex=False)
movies_df['release'] = movies_df['release'].str.replace(')', "", regex=False)
movies_df['title'] = movies_df['title'].str.replace(pattern, "", regex=True)
movies_df['release'] = pd.to_numeric(movies_df['release'])
# Removing the year leaves trailing whitespace behind.
movies_df['title'] = movies_df['title'].str.rstrip()

def month_diff(a, b):
    """Return the number of calendar-month boundaries between two dates.

    The arguments may be given in either order; only their `.year` and
    `.month` attributes are used, so the day of month is ignored.
    """
    earlier, later = (b, a) if a >= b else (a, b)
    whole_years = later.year - earlier.year
    leftover_months = later.month - earlier.month
    return whole_years * 12 + leftover_months

def onehotencode(movies_df, genres=("Action", "Adventure", "Animation", "Children", "Comedy",
                                    "Crime", "Documentary", "Drama", "Fantasy", "Horror",
                                    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller",
                                    "Western")):
    """Replace the pipe-delimited `genres` column with one 0/1 indicator column per genre.

    Parameters
    ----------
    movies_df : pandas.DataFrame
        Must contain a string column named "genres" whose entries are genre
        names separated by "|". The frame itself is not modified.
    genres : iterable of str, optional
        Genre names that receive an indicator column. Any other value found in
        the data (for example "(no genres listed)") produces no column.
        NOTE(review): the default list has no entries for Film-Noir, IMAX or
        War — confirm whether leaving them out is intended.

    Returns
    -------
    pandas.DataFrame
        The input columns except "genres", followed by the indicator columns in
        alphabetical order. Only genres that occur in the data get a column.
    """
    # str.get_dummies splits on the separator itself, so the result does not depend on
    # how many genres a single movie lists or on how many dummy columns exist.
    indicators = movies_df["genres"].str.get_dummies(sep="|")
    kept = [genre for genre in sorted(set(genres)) if genre in indicators.columns]
    # Both pieces share movies_df's index, so the concat is row-aligned.
    return pd.concat([movies_df.drop(columns=["genres"]), indicators[kept]], axis=1)

In [None]:
# onehotencode consumes the "genres" column, so guard the call to keep this cell re-runnable.
if "genres" in movies_df.columns:
    movies_df = onehotencode(movies_df)

In [None]:
def movie_ols(movieID, ratings = ratings_df, print = False):
    """Regress a movie's monthly mean rating on monthly rating count and elapsed months.

    Parameters
    ----------
    movieID :
        Value matched against the "movieId" column of `ratings`.
    ratings : pandas.DataFrame
        Table with "movieId", "userId", "rating" and a datetime "timestamp" column.
    print : bool
        When true, the statsmodels fit summary is printed.

    Returns
    -------
    tuple
        `(res.params, res.pvalues)` of the fitted OLS; both are labelled
        "count", "months_delta" and "ones" (the constant term).
    """
    # The `print` parameter hides the builtin inside this function; reach it explicitly.
    import builtins

    movie_ratings = ratings.loc[ratings["movieId"] == movieID].copy()

    # Months elapsed since the movie's first rating. Every timestamp is >= the minimum,
    # so the year/month arithmetic needs no ordering check.
    earliest = movie_ratings["timestamp"].min()
    stamps = movie_ratings["timestamp"].dt
    movie_ratings["months_delta"] = (stamps.year - earliest.year) * 12 + (stamps.month - earliest.month)

    by_month = movie_ratings.groupby("months_delta")
    data = pd.DataFrame({
        "count": by_month["userId"].count(),
        "rating": by_month["rating"].mean(),
    })
    data["months_delta"] = data.index
    data["ones"] = 1.0

    y = data["rating"]
    x = data[["count", "months_delta", "ones"]]
    res = sm.OLS(y, x, missing='drop').fit()
    if print:
        builtins.print(res.summary())
    return res.params, res.pvalues

In [None]:
# Ratings information starts after 1995, so keep only movies released from 1995 onwards.
# TODO: remember to include frequency visualization and unique user count to show that
# there is no seasonality/gradual increase!!!!
released_from_1995 = movies_df["release"] >= 1995
movies_after95 = movies_df.loc[released_from_1995].reset_index()
movies_after95.loc[0, "movieId"]

In [None]:
# Fit movie_ols for at most the first 1000 movies; slicing with head() avoids a KeyError
# when the table is shorter than that.
movie_ids = movies_after95["movieId"].head(1000)
results = []
for i, movie_id in enumerate(movie_ids):
    results.append(movie_ols(movie_id))
    # Progress as a fraction of the movies actually processed.
    print((i + 1)/len(movie_ids))

In [None]:
# Every entry of `results` is a (params, pvalues) pair of labelled Series; select by label
# instead of by integer position, which pandas treats as a deprecated positional fallback.
count_pvalues = [pvalues["count"] for _, pvalues in results]
months_pvalues = [pvalues["months_delta"] for _, pvalues in results]

count_coeff = [params["count"] for params, _ in results]
months_coeff = [params["months_delta"] for params, _ in results]

In [None]:
# Histogram of the values in months_pvalues (matplotlib's default binning).
plt.hist(months_pvalues)

In [None]:
# new variables
# NOTE(review): `movieID` is not assigned in any visible cell, so this cell depends on a
# value already present in the kernel — define it before running.
# .copy() makes the filtered rows an independent frame before a column is added to it.
movie_rating_df = ratings_df[ratings_df["movieId"] == movieID].copy()
movie_ratings_grouped = movie_rating_df.groupby([movie_rating_df['timestamp'].dt.year, movie_rating_df['timestamp'].dt.month])
# The first rating time is the same for every row, so compute it once instead of inside the lambda.
earliest = movie_rating_df['timestamp'].min()
movie_rating_df["months_delta"] = movie_rating_df['timestamp'].apply(lambda x: month_diff(x, earliest))
movie_rating_df

In [None]:
# Per-user count of non-null entries in each remaining column of ratings_df.
ratings_df.groupby("userId").count()

In [None]:
# user_counts = ratings_df.groupby("userId").count()
# groupby_month = movie_rating_df.groupby("months_delta")
# for name, group in movie_rating_df.groupby("months_delta")["userId"]:
#     print(name)
#     group = group.reset_index
#     print(group)
# movie_rating_df.groupby("months_delta")["userId"]
# movie_rating_df.groupby("months_delta")["userId"].count()

In [None]:
# user_counts = ratings_df.groupby("userId").count()
# user_rating_count = user_counts["rating"]
# user_meanrating = ratings_df.groupby("userId").mean()["rating"]
# groupby_month = movie_rating_df.groupby("months_delta")

In [None]:
# Per-user activity over the whole ratings table: number of ratings and mean rating.
user_counts = ratings_df.groupby("userId").count()
user_rating_count = user_counts["rating"]
# Select the column before aggregating so only "rating" is averaged.
user_meanrating = ratings_df.groupby("userId")["rating"].mean()

# For every month of movie_rating_df, average those per-user figures over the users who
# rated in that month.
groupby_month = movie_rating_df.groupby("months_delta")
reviewcount_monthly = []
meanrating_monthly = []
for name, group in groupby_month:
    raters = group["userId"]
    reviewcount_monthly.append(user_rating_count.loc[raters].mean())
    meanrating_monthly.append(user_meanrating.loc[raters].mean())



In [None]:
def movie_timeseries(movieID, ratings = ratings_df):
    """Build one row per month of rating activity for a single movie.

    Parameters
    ----------
    movieID :
        Value matched against the "movieId" column of `ratings`.
    ratings : pandas.DataFrame
        Table with "movieId", "userId", "rating" and a datetime "timestamp" column.
        Per-user statistics are computed over this whole table.

    Returns
    -------
    pandas.DataFrame
        Columns, in order:
        "rating"          – mean rating the movie received that month;
        "months_delta"    – 1-based month number counted from the movie's first rating;
        "userreviewcount" – mean, over that month's raters, of each rater's total
                            number of ratings in `ratings`;
        "usermeanrating"  – mean, over that month's raters, of each rater's mean
                            rating in `ratings`;
        "count"           – number of ratings the movie received that month.
    """
    movie_ratings = ratings.loc[ratings["movieId"] == movieID].copy()

    # Months elapsed since the movie's first rating. Every timestamp is >= the minimum,
    # so plain year/month arithmetic is enough.
    earliest = movie_ratings["timestamp"].min()
    stamps = movie_ratings["timestamp"].dt
    movie_ratings["months_delta"] = (stamps.year - earliest.year) * 12 + (stamps.month - earliest.month)

    # Attach each rater's overall activity to their rating of this movie.
    per_user = ratings.groupby("userId")["rating"]
    movie_ratings["userreviewcount"] = movie_ratings["userId"].map(per_user.count())
    movie_ratings["usermeanrating"] = movie_ratings["userId"].map(per_user.mean())

    # Group THIS movie's ratings by month. The monthly user statistics must come from the
    # same frame as the rating and count columns, not from a module-level groupby object.
    monthly = movie_ratings.groupby("months_delta").agg(
        rating=("rating", "mean"),
        userreviewcount=("userreviewcount", "mean"),
        usermeanrating=("usermeanrating", "mean"),
        count=("userId", "count"),
    ).reset_index()

    # Month numbers are reported 1-based.
    monthly["months_delta"] = monthly["months_delta"] + 1
    return monthly[["rating", "months_delta", "userreviewcount", "usermeanrating", "count"]]

In [None]:
# Build and display the monthly table for movieId 1.
movie_timeseries(1)

In [None]:
def apply_ols(results, printout = False):
    """Regress monthly mean rating on month number and mean rater rating.

    Parameters
    ----------
    results : pandas.DataFrame
        Must contain the columns "rating", "months_delta" and "usermeanrating".
        The frame is left unmodified.
    printout : bool
        When true, the statsmodels fit summary is printed.

    Returns
    -------
    tuple
        `(res.params, res.pvalues)`; both are labelled "months_delta",
        "usermeanrating" and "ones" (the constant term), in that order.
    """
    y = results["rating"]
    # Build the constant column on a new frame so the caller's DataFrame is not mutated.
    x = results[["months_delta", "usermeanrating"]].assign(ones=1.0)
    res = sm.OLS(y, x, missing='drop').fit()
    if printout:
        print(res.summary())
    return res.params, res.pvalues

In [None]:
# Fit the regression on movieId 1's monthly table; `ols` holds the (params, pvalues) pair
# returned by apply_ols.
ols = apply_ols(movie_timeseries(1))

In [None]:
# Run apply_ols on the monthly series of rows 0..99 of movies_after95, printing i/100
# after each fit as a progress indicator.
results = []
for i in range(100):
    movie_id = movies_after95["movieId"][i]
    monthly_series = movie_timeseries(movie_id)
    results.append(apply_ols(monthly_series))
    print(i/100)

In [None]:
# three variables analysis
# Each entry of `results` is a (params, pvalues) pair of labelled Series. Selecting by label
# keeps every list tied to the regressor it is named after; a regressor that was not part
# of the fitted model yields NaN instead of silently picking up another term's value.
months_pvalues = [pvalues.get("months_delta", np.nan) for _, pvalues in results]
userreviewcount_pvalues = [pvalues.get("userreviewcount", np.nan) for _, pvalues in results]
usermeanrating_pvalues = [pvalues.get("usermeanrating", np.nan) for _, pvalues in results]

months_coeffs = [params.get("months_delta", np.nan) for params, _ in results]
userreviewcount_coeffs = [params.get("userreviewcount", np.nan) for params, _ in results]
usermeanrating_coeffs = [params.get("usermeanrating", np.nan) for params, _ in results]

In [None]:
# two variables analysis
# Each entry of `results` is a (params, pvalues) pair of labelled Series; select by label
# instead of by integer position, which pandas treats as a deprecated positional fallback.
months_pvalues = [pvalues["months_delta"] for _, pvalues in results]
usermeanrating_pvalues = [pvalues["usermeanrating"] for _, pvalues in results]

months_coeffs = [params["months_delta"] for params, _ in results]
usermeanrating_coeffs = [params["usermeanrating"] for params, _ in results]

In [None]:
# Histogram, with 50 bins, of the values in usermeanrating_coeffs.
plt.hist(usermeanrating_coeffs, bins = 50)

In [None]:
# Position of the largest p-value. Only the last expression of a cell is displayed, so this
# value is computed but not shown.
months_pvalues.index(max(months_pvalues))
# Displays the full list of p-values.
months_pvalues

In [None]:
# Requires Google Drive to be mounted at /content/drive before this cell runs.
# The path is absolute and written without shell quoting: inside a Python string the
# apostrophes would be taken as part of the directory names.
DATA_DIR = "/content/drive/My Drive/Berkeley!/Citadel/West Coast Data Open F20/data"
movies_after95_ids = pd.read_csv(DATA_DIR + "/movies_after95.csv")

# Keep movies with at least 10 months of data ...
more_than_10_months = movies_after95_ids[movies_after95_ids["months_w_data"] >= 10]
# ... and a "rating" value of at least 60.
# NOTE(review): the variable name says 100 but the threshold is 60 — confirm which is intended.
more_than_100 = more_than_10_months[more_than_10_months["rating"] >= 60]
regression_movies = more_than_100.reset_index().drop(columns = ["index", "Unnamed: 0"])
regression_movies

In [13]:
# List the contents of the shared data directory on Drive (path is relative to the current working directory).
ls drive/'My Drive'/Berkeley!/Citadel/'West Coast Data Open F20'/data

genome-scores-pca.csv  movies_after95.csv               netflix_titles.csv
movie_industry.csv     movies_w_genre_profits.csv       the_oscar_award.csv
[0m[01;34mmovie_lense[0m/           netflix_data_genres_encoded.csv


In [6]:
# Mount Google Drive so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Sequentially fit apply_ols for every row of regression_movies, printing the row label
# divided by the table length after each fit as a progress indicator.
results = []
for i in regression_movies.index:
    movie_id = regression_movies["movieId"][i]
    monthly_series = movie_timeseries(movie_id)
    results.append(apply_ols(monthly_series))
    print(i/len(regression_movies))

In [None]:
def regressionAnalysis(i, regression_movies_length, results):
  """Fit the regression for row `i` of `regression_movies` and store it in `results`.

  The fit is computed outside the lock; appending to `results` and printing `i`
  happen while the module-level `lock` is held. `regression_movies_length` is
  accepted but not used.
  """
  movie_id = regression_movies["movieId"][i]
  fit = apply_ols(movie_timeseries(movie_id))
  lock.acquire()
  try:
    results.append(fit)
    print(i)
  finally:
    lock.release()

In [None]:
# Run regressionAnalysis for every row of regression_movies, one thread per row.
# Workers append to `results` as they finish, so its order follows completion, not row order.
lock = threading.Lock()
regression_movies_length = len(regression_movies)
results = []
threads = []
for i in regression_movies.index:
  print(i)
  # regressionAnalysis takes (i, regression_movies_length, results); all three must be passed.
  thread = threading.Thread(target = regressionAnalysis, args=(i, regression_movies_length, results))
  threads.append(thread)
  thread.start()

# Wait for every worker to finish before `results` is read.
for t in threads:
  t.join()