In [1]:
import pandas as pd

# Load the two comma-separated source tables from the local data directory.
movies_df = pd.read_csv('data/movies.csv', sep=',')
ratings_df = pd.read_csv('data/ratings.csv', sep=',')


In [2]:
# Print a summary of the ratings table (row count, column dtypes, memory usage).
ratings_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB


In [3]:
# Print a summary of the movies table (row count, non-null counts, column dtypes).
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  62423 non-null  int64 
 1   title    62423 non-null  object
 2   genres   62423 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


In [4]:
# Split each movie's pipe-delimited genre string into a list of labels,
# e.g. 'Comedy|Romance' -> ['Comedy', 'Romance'], and keep the result as the
# single column 'splitted' on a frame that shares movies_df's index.
df = movies_df['genres'].str.split('|').to_frame(name='splitted')

In [5]:
# Show the per-movie genre lists produced by the split on '|'.
print(df['splitted'])

0        [Adventure, Animation, Children, Comedy, Fantasy]
1                           [Adventure, Children, Fantasy]
2                                        [Comedy, Romance]
3                                 [Comedy, Drama, Romance]
4                                                 [Comedy]
                               ...                        
62418                                              [Drama]
62419                                        [Documentary]
62420                                      [Comedy, Drama]
62421                                 [(no genres listed)]
62422                           [Action, Adventure, Drama]
Name: splitted, Length: 62423, dtype: object


In [6]:
from sklearn.preprocessing import MultiLabelBinarizer

# One-hot encode the genre lists into a sparse indicator frame with one
# column per distinct genre label.
#
# The lists are derived from movies_df instead of being popped out of `df`:
# popping and then rebinding `df` meant a second run of this cell failed with
# KeyError because the 'splitted' column was already gone.
mlb = MultiLabelBinarizer(sparse_output=True)
genre_matrix = mlb.fit_transform(movies_df['genres'].str.split('|'))

# mlb.classes_ is only available after fitting, so the frame is built afterwards.
df = pd.DataFrame.sparse.from_spmatrix(
    genre_matrix,
    index=movies_df.index,
    columns=mlb.classes_)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype           
---  ------              --------------  -----           
 0   (no genres listed)  62423 non-null  Sparse[int32, 0]
 1   Action              62423 non-null  Sparse[int32, 0]
 2   Adventure           62423 non-null  Sparse[int32, 0]
 3   Animation           62423 non-null  Sparse[int32, 0]
 4   Children            62423 non-null  Sparse[int32, 0]
 5   Comedy              62423 non-null  Sparse[int32, 0]
 6   Crime               62423 non-null  Sparse[int32, 0]
 7   Documentary         62423 non-null  Sparse[int32, 0]
 8   Drama               62423 non-null  Sparse[int32, 0]
 9   Fantasy             62423 non-null  Sparse[int32, 0]
 10  Film-Noir           62423 non-null  Sparse[int32, 0]
 11  Horror              62423 non-null  Sparse[int32, 0]
 12  IMAX                62423 non-null  Sparse[int32, 0]
 13  Musical         

In [7]:
# Convert the sparse genre indicator columns to ordinary dense columns,
# then print a summary of the resulting frame.
df2 = df.sparse.to_dense()
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   (no genres listed)  62423 non-null  int32
 1   Action              62423 non-null  int32
 2   Adventure           62423 non-null  int32
 3   Animation           62423 non-null  int32
 4   Children            62423 non-null  int32
 5   Comedy              62423 non-null  int32
 6   Crime               62423 non-null  int32
 7   Documentary         62423 non-null  int32
 8   Drama               62423 non-null  int32
 9   Fantasy             62423 non-null  int32
 10  Film-Noir           62423 non-null  int32
 11  Horror              62423 non-null  int32
 12  IMAX                62423 non-null  int32
 13  Musical             62423 non-null  int32
 14  Mystery             62423 non-null  int32
 15  Romance             62423 non-null  int32
 16  Sci-Fi              62423 non-null  int3

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report

# Features: the dense genre indicator frame.
X = df2

# Target: the movie title, kept as a one-column DataFrame.
# NOTE(review): using the title as a class label presumably yields close to
# one class per row — confirm this is the intended prediction target.
y = movies_df[['title']]

# Summary of the feature matrix that will be fed to the model.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62423 entries, 0 to 62422
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype
---  ------              --------------  -----
 0   (no genres listed)  62423 non-null  int32
 1   Action              62423 non-null  int32
 2   Adventure           62423 non-null  int32
 3   Animation           62423 non-null  int32
 4   Children            62423 non-null  int32
 5   Comedy              62423 non-null  int32
 6   Crime               62423 non-null  int32
 7   Documentary         62423 non-null  int32
 8   Drama               62423 non-null  int32
 9   Fantasy             62423 non-null  int32
 10  Film-Noir           62423 non-null  int32
 11  Horror              62423 non-null  int32
 12  IMAX                62423 non-null  int32
 13  Musical             62423 non-null  int32
 14  Mystery             62423 non-null  int32
 15  Romance             62423 non-null  int32
 16  Sci-Fi              62423 non-null  int3

In [9]:
# Hold out 30% of the rows for evaluation. The split is seeded so that the
# train/test partition, and therefore the reported metrics, are the same on
# every run (the model below was already seeded; the split was not).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

# Linear-kernel SVM with probability estimates enabled.
model = SVC(kernel='linear', C=10, probability=True, random_state=0)
# y_train is a one-column DataFrame; flatten it to the 1-D array fit() expects.
model.fit(X_train, y_train.values.ravel())

# Per-class precision / recall / F1 on the held-out rows.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Declare the ONNX graph input: a float tensor with a free batch dimension and
# one column per training feature. The width is taken from X (the frame the
# model was fitted on) instead of a hard-coded 380, which did not match the
# 20 genre columns the model is trained with.
n_features = X.shape[1]
initial_type = [('float_input', FloatTensorType([None, n_features]))]

# Converter options keyed by the model instance: 'nocl' and 'zipmap' are
# passed through to skl2onnx unchanged.
options = {id(model): {'nocl': True, 'zipmap': False}}

onx = convert_sklearn(model, initial_types=initial_type, options=options)

# Serialize the converted model to disk.
with open("./model.onnx", "wb") as f:
    f.write(onx.SerializeToString())