In [1]:
import pandas as pd
df_title = pd.read_csv("title.basics.tsv", sep = "\t")
df_ratings = pd.read_csv("title.ratings.tsv", sep = "\t")

  df_title = pd.read_csv("title.basics.tsv", sep = "\t")


In [2]:
# Join both TSVs on common column
df_joined = df_title.join(df_ratings.set_index("tconst"), on = "tconst")

In [3]:
df_joined.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",5.7,1965.0
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",5.8,262.0
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance",6.5,1804.0
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short",5.6,178.0
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short",6.2,2603.0


In [4]:
df_joined.columns

Index(['tconst', 'titleType', 'primaryTitle', 'originalTitle', 'isAdult',
       'startYear', 'endYear', 'runtimeMinutes', 'genres', 'averageRating',
       'numVotes'],
      dtype='object')

In [5]:
# Drop Columns that isn't required for Training
df = df_joined[["isAdult", "startYear", "runtimeMinutes", "averageRating"]]

In [6]:
# Data Cleaning
df = df.dropna() # Remove NaNs
df = df[df['startYear'] != "\\N"] # Remove non-numeric values
df = df[df['runtimeMinutes'] != "\\N"] # Remove non-numeric values
df['startYear'] = df['startYear'].astype(int) # Change type to int
df['runtimeMinutes'] = df['runtimeMinutes'].astype(int) # Change type to int

In [7]:
from sklearn.preprocessing import StandardScaler
# Scale values
scaler = StandardScaler()
scaler.fit(df[['startYear', 'runtimeMinutes']])
df[['startYear', 'runtimeMinutes']] = scaler.transform(df[['startYear', 'runtimeMinutes']])

In [8]:
# Choose random 30K values for training + testing
df = df.sample(frac = 0.1).reset_index(drop = True)
df = df[:30000]

In [9]:
# Split into features and labels
X = df.drop(['averageRating'], axis = 1)
y = df['averageRating']

In [10]:
from sklearn.model_selection import train_test_split
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11695)

In [11]:
from sklearn.ensemble import RandomForestRegressor
# Fit Model
model = RandomForestRegressor()
model.fit(X_train, y_train)

# Save Model as Pkl file
import joblib
joblib.dump(model, "model/rfm.pkl")

['model/rfm.pkl']

In [12]:
# Predict
y_preds = model.predict(X_test)

In [13]:
# R2 Score
from sklearn.metrics import r2_score
score = r2_score(y_test, y_preds)
print(score)

0.020824342776192717


In [14]:
# Working with MLEM - Save Models with MLEM
from mlem.api import save
# instead of joblib.dump, use save
save(model, "model/ratings_model", sample_data = X)

MlemModel(location=Location(path='/Users/eesha/mlem-demo/model/ratings_model.mlem', project=None, rev=None, uri='file:///Users/eesha/mlem-demo/model/ratings_model.mlem', project_uri=None, fs=<fsspec.implementations.local.LocalFileSystem object at 0x312b1f490>), params={}, artifacts={'data': LocalArtifact(uri='ratings_model', size=69402637, hash='8272be320ad3236d542908deca17bc2a')}, requirements=Requirements(__root__=[InstallableRequirement(module='sklearn', version='1.1.2', package_name='scikit-learn', extra_index=None, source_url=None, vcs=None, vcs_commit=None), InstallableRequirement(module='pandas', version='1.5.3', package_name=None, extra_index=None, source_url=None, vcs=None, vcs_commit=None), InstallableRequirement(module='numpy', version='1.24.2', package_name=None, extra_index=None, source_url=None, vcs=None, vcs_commit=None)]), processors_cache={'model': SklearnModel(model=RandomForestRegressor(), io=SimplePickleIO(), methods={'predict': Signature(name='predict', args=[Argum

In [15]:
# Alternatively, load pre-saved pickle files and save them as MLEM models
m = joblib.load("model/rfm.pkl")
save(m, "model/rfm2", sample_data = X)

MlemModel(location=Location(path='/Users/eesha/mlem-demo/model/rfm2.mlem', project=None, rev=None, uri='file:///Users/eesha/mlem-demo/model/rfm2.mlem', project_uri=None, fs=<fsspec.implementations.local.LocalFileSystem object at 0x312b1f490>), params={}, artifacts={'data': LocalArtifact(uri='rfm2', size=69402790, hash='1a77e2cda84faf496fe884a07061a7d5')}, requirements=Requirements(__root__=[InstallableRequirement(module='sklearn', version='1.1.2', package_name='scikit-learn', extra_index=None, source_url=None, vcs=None, vcs_commit=None), InstallableRequirement(module='pandas', version='1.5.3', package_name=None, extra_index=None, source_url=None, vcs=None, vcs_commit=None), InstallableRequirement(module='numpy', version='1.24.2', package_name=None, extra_index=None, source_url=None, vcs=None, vcs_commit=None)]), processors_cache={'model': SklearnModel(model=RandomForestRegressor(), io=SimplePickleIO(), methods={'predict': Signature(name='predict', args=[Argument(name='X', type_=DataFra