In [1]:
import pandas as pd
import numpy as np

In [2]:
# Global random seed, reused wherever a random_state / random_seed is needed
SEED: int = 42

In [3]:
# Load the ratings table: tab-separated, no header row, so the column names are supplied here
rating_columns = ['user_id', 'item_id', 'rating', 'timestamp']
rating_df = pd.read_csv(
    '../data/raw/ml-100k/u.data',
    sep='\t',
    header=None,
    names=rating_columns,
)

rating_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [4]:
# Load the movie table: pipe-separated, latin-1 encoded, no header row.
movie_info_columns = ['item_id', 'title', 'release_date', 'video_release_date', 'url']
movie_flag_columns = [
    'unknown', 'action', 'adventure', 'animation', 'childrens', 'comedy', 'crime',
    'documentary', 'drama', 'fantasy', 'film_noir', 'horror', 'musical', 'mystery',
    'romance', 'sci_fi', 'thriller', 'war', 'western',
]

movies_df = pd.read_csv(
    '../data/raw/ml-100k/u.item',
    sep='|',
    encoding='latin-1',
    header=None,
    names=movie_info_columns + movie_flag_columns,
    index_col=0,  # the first column (item_id) becomes the index
)

movies_df.head()

Unnamed: 0_level_0,title,release_date,video_release_date,url,unknown,action,adventure,animation,childrens,comedy,...,fantasy,film_noir,horror,musical,mystery,romance,sci_fi,thriller,war,western
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [5]:
# Load the user table: pipe-separated, no header row; the first column (user_id) becomes the index.
user_columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users_df = pd.read_csv(
    '../data/raw/ml-100k/u.user',
    sep='|',
    header=None,
    names=user_columns,
    index_col=0,
)

users_df.head()

Unnamed: 0_level_0,age,gender,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [6]:
# Attach the movie columns and then the user columns to every rating row (inner joins)
raw_total_df = (
    rating_df
    .merge(movies_df, on='item_id', how='inner')
    .merge(users_df, on='user_id', how='inner')
)

raw_total_df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title,release_date,video_release_date,url,unknown,action,...,mystery,romance,sci_fi,thriller,war,western,age,gender,occupation,zip_code
0,196,242,3,881250949,Kolya (1996),24-Jan-1997,,http://us.imdb.com/M/title-exact?Kolya%20(1996),0,0,...,0,0,0,0,0,0,49,M,writer,55105
1,196,257,2,881251577,Men in Black (1997),04-Jul-1997,,http://us.imdb.com/M/title-exact?Men+in+Black+...,0,1,...,0,0,1,0,0,0,49,M,writer,55105
2,196,111,4,881251793,"Truth About Cats & Dogs, The (1996)",26-Apr-1996,,http://us.imdb.com/M/title-exact?Truth%20About...,0,0,...,0,1,0,0,0,0,49,M,writer,55105
3,196,25,4,881251955,"Birdcage, The (1996)",08-Mar-1996,,"http://us.imdb.com/M/title-exact?Birdcage,%20T...",0,0,...,0,0,0,0,0,0,49,M,writer,55105
4,196,382,4,881251843,"Adventures of Priscilla, Queen of the Desert, ...",01-Jan-1994,,http://us.imdb.com/M/title-exact?Adventures%20...,0,0,...,0,0,0,0,0,0,49,M,writer,55105


In [7]:
# Drop the identifier columns, the timestamp and the movie fields judged to carry no information

dropped_columns = ['user_id', 'item_id', 'video_release_date', 'url', 'timestamp']
total_df = raw_total_df.drop(dropped_columns, axis=1)
total_df

Unnamed: 0,rating,title,release_date,unknown,action,adventure,animation,childrens,comedy,crime,...,mystery,romance,sci_fi,thriller,war,western,age,gender,occupation,zip_code
0,3,Kolya (1996),24-Jan-1997,0,0,0,0,0,1,0,...,0,0,0,0,0,0,49,M,writer,55105
1,2,Men in Black (1997),04-Jul-1997,0,1,1,0,0,1,0,...,0,0,1,0,0,0,49,M,writer,55105
2,4,"Truth About Cats & Dogs, The (1996)",26-Apr-1996,0,0,0,0,0,1,0,...,0,1,0,0,0,0,49,M,writer,55105
3,4,"Birdcage, The (1996)",08-Mar-1996,0,0,0,0,0,1,0,...,0,0,0,0,0,0,49,M,writer,55105
4,4,"Adventures of Priscilla, Queen of the Desert, ...",01-Jan-1994,0,0,0,0,0,1,0,...,0,0,0,0,0,0,49,M,writer,55105
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,5,Titanic (1997),01-Jan-1997,0,1,0,0,0,0,0,...,0,1,0,0,0,0,48,F,administrator,33763
99996,4,G.I. Jane (1997),01-Jan-1997,0,1,0,0,0,0,0,...,0,0,0,0,1,0,48,F,administrator,33763
99997,3,Desperate Measures (1998),30-Jan-1998,0,0,0,0,0,0,1,...,0,0,0,1,0,0,48,F,administrator,33763
99998,2,Spawn (1997),01-Aug-1997,0,1,1,0,0,0,0,...,0,0,1,1,0,0,48,F,administrator,33763


In [8]:
# Separate the regression target (rating) from the feature columns
y = total_df['rating']
X = total_df.drop('rating', axis=1)

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin


class ToArray(BaseEstimator, TransformerMixin):
    """Pipeline step that converts its input to a dense array via ``toarray()``.

    Parameters
    ----------
    ignore : bool, default=False
        When True the step is a no-op and ``X`` is returned unchanged.
    """

    def __init__(self, ignore: bool = False):
        self.ignore = ignore

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return ``X.toarray()`` unless ``ignore`` is set.

        Inputs that have no ``toarray`` method (for example an array that is
        already dense) are returned unchanged instead of raising
        ``AttributeError``.
        """
        if self.ignore or not hasattr(X, 'toarray'):
            return X
        return X.toarray()



# 0/1 movie flag columns that are kept out of the numeric (impute + scale) branch
ignore_columns = [
    'unknown', 'action', 'adventure', 'animation', 'childrens', 'comedy', 
    'crime', 'documentary', 'drama', 'fantasy', 'film_noir', 'horror', 
    'musical', 'mystery', 'romance', 'sci_fi', 'thriller', 'war', 'western'
]

# Keep the DataFrame's column order: a set difference would yield an arbitrary
# order that can differ between kernel sessions.
numerical_features = [
    column
    for column in X.select_dtypes(include='number').columns
    if column not in ignore_columns
]
categorical_features = X.select_dtypes(exclude='number').columns
print('Categorical features:', categorical_features)


# Numeric branch: fill gaps with the median, then standardise
numerical_preprocessing = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill gaps with the most frequent value, then one-hot encode;
# categories unseen during fit are ignored at transform time
categorical_preprocessing = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('one-hot', OneHotEncoder(handle_unknown='ignore'))
])

# NOTE(review): no `remainder` is passed, so with scikit-learn's default
# (remainder='drop') the columns in ignore_columns are not forwarded to the
# model at all - confirm this is intended rather than remainder='passthrough'.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_preprocessing, numerical_features),
    ('cat', categorical_preprocessing, categorical_features),
])

Categorical features: Index(['title', 'release_date', 'gender', 'occupation', 'zip_code'], dtype='object')


In [10]:
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.kernel_ridge import KernelRidge

from catboost import CatBoostRegressor

from sklearn.model_selection import cross_val_score


# Candidate regressors, compared below with 5-fold cross-validation
models = [
    SGDRegressor(random_state=SEED),
    LinearRegression(),
    BayesianRidge(),
    CatBoostRegressor(random_seed=SEED, loss_function='RMSE', silent=True)
]

# Estimator classes that receive a dense array instead of the preprocessor's output as-is
array_models = [BayesianRidge, KernelRidge]


for model in models:
    # Compare against the classes themselves; testing `type(model) is KernelRidge()`
    # checks a type against a fresh instance and can never be true.
    needs_dense = isinstance(model, tuple(array_models))

    # The last step keeps the name 'classifier' (although every model is a
    # regressor) so that existing parameter paths stay valid.
    final_pipeline = Pipeline(steps=[
        ('preprocess', preprocessor),
        ('transformation', ToArray(ignore=not needs_dense)),
        ('classifier', model)
    ])

    # Scores are negated RMSE values: closer to zero is better
    scores = cross_val_score(final_pipeline, X, y, cv=5, scoring='neg_root_mean_squared_error')
    print(repr(model), scores.mean(), scores, sep='\t')

SGDRegressor(random_state=42)	-1.0459875899299718	[-1.07523569 -1.06982669 -1.01331849 -0.99004971 -1.08150737]
LinearRegression()	-1.0558224220470735	[-1.05833988 -1.08038993 -1.02573035 -0.98666169 -1.12799026]
BayesianRidge()	-1.0373399037231557	[-1.04666378 -1.06588585 -1.00509604 -0.97341089 -1.09564297]
<catboost.core.CatBoostRegressor object at 0x000001DC5B2BB7F0>	-1.056303982208018	[-1.10623244 -1.0916919  -1.01029227 -0.99620311 -1.0771002 ]


In [11]:
# CatBoost regressor with hand-set depth, iteration count and learning rate.
# NOTE(review): the recorded cross-validation scores do not show CatBoost ahead
# (its mean neg-RMSE is the lowest of the four models; BayesianRidge has the
# highest), so the claim that CatBoost is superior to the other models should
# be re-checked before relying on this choice.
model = CatBoostRegressor(
    random_seed=SEED,
    depth=10,
    iterations=200,
    learning_rate=0.1,
    loss_function='RMSE',
)

In [12]:
from sklearn.model_selection import train_test_split

# Hold-out split (train_size=0.8); the raw frames are kept under their own names
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=SEED
)

# Fit the preprocessor on the training part only, then apply it to both parts
X_train = preprocessor.fit_transform(X_train_raw, y_train)
X_test = preprocessor.transform(X_test_raw)

model.fit(X_train, y_train)

0:	learn: 1.1197839	total: 66.4ms	remaining: 13.2s
1:	learn: 1.1150336	total: 132ms	remaining: 13.1s
2:	learn: 1.1114082	total: 200ms	remaining: 13.2s
3:	learn: 1.1086286	total: 287ms	remaining: 14s
4:	learn: 1.1060956	total: 354ms	remaining: 13.8s
5:	learn: 1.1033226	total: 423ms	remaining: 13.7s
6:	learn: 1.1012519	total: 490ms	remaining: 13.5s
7:	learn: 1.0991249	total: 554ms	remaining: 13.3s
8:	learn: 1.0975313	total: 618ms	remaining: 13.1s
9:	learn: 1.0958998	total: 679ms	remaining: 12.9s
10:	learn: 1.0942954	total: 740ms	remaining: 12.7s
11:	learn: 1.0930193	total: 812ms	remaining: 12.7s
12:	learn: 1.0918133	total: 870ms	remaining: 12.5s
13:	learn: 1.0906086	total: 929ms	remaining: 12.3s
14:	learn: 1.0894265	total: 986ms	remaining: 12.2s
15:	learn: 1.0884788	total: 1.05s	remaining: 12s
16:	learn: 1.0874089	total: 1.1s	remaining: 11.9s
17:	learn: 1.0863656	total: 1.16s	remaining: 11.8s
18:	learn: 1.0854074	total: 1.22s	remaining: 11.6s
19:	learn: 1.0843710	total: 1.28s	remaining: 

161:	learn: 1.0188995	total: 10s	remaining: 2.35s
162:	learn: 1.0186123	total: 10.1s	remaining: 2.29s
163:	learn: 1.0183082	total: 10.2s	remaining: 2.23s
164:	learn: 1.0180229	total: 10.2s	remaining: 2.17s
165:	learn: 1.0177374	total: 10.3s	remaining: 2.11s
166:	learn: 1.0174460	total: 10.4s	remaining: 2.05s
167:	learn: 1.0171696	total: 10.4s	remaining: 1.99s
168:	learn: 1.0169158	total: 10.5s	remaining: 1.93s
169:	learn: 1.0166962	total: 10.6s	remaining: 1.86s
170:	learn: 1.0164458	total: 10.6s	remaining: 1.8s
171:	learn: 1.0161836	total: 10.7s	remaining: 1.74s
172:	learn: 1.0159175	total: 10.8s	remaining: 1.68s
173:	learn: 1.0156266	total: 10.8s	remaining: 1.62s
174:	learn: 1.0153423	total: 10.9s	remaining: 1.55s
175:	learn: 1.0150708	total: 10.9s	remaining: 1.49s
176:	learn: 1.0148018	total: 11s	remaining: 1.43s
177:	learn: 1.0145482	total: 11.1s	remaining: 1.37s
178:	learn: 1.0141931	total: 11.1s	remaining: 1.3s
179:	learn: 1.0139487	total: 11.2s	remaining: 1.24s
180:	learn: 1.0136

<catboost.core.CatBoostRegressor at 0x1dc5b2cba00>

In [13]:
from sklearn import metrics

# Hold-out MSE of the CatBoost model; sklearn metrics expect (y_true, y_pred)
metrics.mean_squared_error(y_test, model.predict(X_test))

1.0435473815023322

In [14]:
# Hold-out median absolute error of the CatBoost model; sklearn metrics expect (y_true, y_pred)
metrics.median_absolute_error(y_test, model.predict(X_test))

0.6354533141982313

In [15]:
# Show only the feature importances above the 0.05 cut-off
importances = model.feature_importances_
importances[importances > 0.05]

array([7.02138334, 0.06357178, 0.05986691, 0.07306514, 0.05204784,
       0.0578698 , 0.21459072, 0.06825027, 0.26163192, 0.12444399,
       0.07309795, 0.20786188, 0.13362357, 0.28308257, 0.21785816,
       0.09185754, 0.10863646, 0.08899745, 0.27201548, 0.15016459,
       0.10002863, 0.28709815, 0.08496946, 0.05369246, 0.22928409,
       0.17949697, 0.20596578, 0.28708526, 0.08766546, 0.41588875,
       0.31148323, 0.06114937, 0.07341505, 0.05042765, 0.10623271,
       0.12936392, 0.15329479, 0.17732426, 0.16727784, 0.22036397,
       0.07736362, 0.14599953, 0.36584998, 0.05173733, 0.11247971,
       0.32998816, 0.06593803, 0.06108848, 0.66763419, 0.1035404 ,
       0.09829481, 0.1388361 , 0.24900169, 0.15702631, 0.09040754,
       0.2627835 , 0.48016864, 0.05174764, 0.38921807, 0.05360244,
       0.11494696, 0.07687757, 0.06158152, 0.07316559, 0.07320205,
       0.10715341, 0.11224225, 0.1626306 , 0.4826548 , 0.05114515,
       0.0590696 , 0.35034259, 0.05200261, 0.1809946 , 0.14528

In [16]:
# Fit an SGD regressor on the same preprocessed training split for comparison
sgd = SGDRegressor(random_state=SEED)
sgd.fit(X=X_train, y=y_train)

In [17]:
# Hold-out MSE of the SGD regressor; sklearn metrics expect (y_true, y_pred)
metrics.mean_squared_error(y_test, sgd.predict(X_test))

0.9523631961172166

In [18]:
# Hold-out median absolute error of the SGD regressor; sklearn metrics expect (y_true, y_pred)
metrics.median_absolute_error(y_test, sgd.predict(X_test))

0.6721052489651154

In [19]:
# Fit BayesianRidge on a dense copy of the preprocessed training matrix
br = BayesianRidge()
br.fit(X=X_train.toarray(), y=y_train)

In [20]:
# Hold-out MSE of BayesianRidge; sklearn metrics expect (y_true, y_pred)
metrics.mean_squared_error(y_test, br.predict(X_test.toarray()))

0.9046588670933299

In [21]:
# Hold-out median absolute error of BayesianRidge; sklearn metrics expect (y_true, y_pred)
metrics.median_absolute_error(y_test, br.predict(X_test.toarray()))

0.6422905344111884

In [22]:
# Sanity check: (rows, columns) of the preprocessed hold-out matrix
X_test.shape

(20000, 2693)

In [28]:
# Hold-out RMSE of BayesianRidge. The `squared=False` flag is deprecated in
# recent scikit-learn releases (1.4+, slated for removal), so the root is taken
# explicitly; this form works on every version. Arguments are (y_true, y_pred).
np.sqrt(metrics.mean_squared_error(y_test, br.predict(X_test.toarray())))

0.9511355671476752