In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('./df.csv', names = ['user_id', 'user_name', 'rating', 'item_id', 'item_name'], nrows = 50000)

In [8]:
demid = pd.read_csv('./demid.csv', names = ['user_id', 'user_name', 'rating', 'item_id', 'item_name'])

In [11]:
df = pd.concat([df, demid], axis=0)

In [15]:
df = df.reset_index()

In [16]:
from surprise import Dataset, SVD
from surprise.model_selection import cross_validate
from surprise import Reader, Dataset, SVD

In [17]:
reader = Reader()

In [18]:
data = Dataset.load_from_df(df[['user_id', 'item_id', 'rating']], reader)

In [19]:
svd = SVD()

In [20]:
from surprise.model_selection import cross_validate

cross_validate(svd,data,measures=['RMSE', 'MAE'],cv=5, verbose=False, n_jobs=-1)

{'test_rmse': array([3.48636   , 3.53378542, 3.49611281, 3.51209581, 3.49264418]),
 'test_mae': array([3.09151012, 3.12992158, 3.1065092 , 3.11023692, 3.0998078 ]),
 'fit_time': (1.144577980041504,
  1.0330817699432373,
  0.9988000392913818,
  1.0596020221710205,
  0.5319700241088867),
 'test_time': (0.16104745864868164,
  0.1397843360900879,
  0.16773653030395508,
  0.09035253524780273,
  0.08431792259216309)}

In [21]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fcb78bafd00>

In [45]:


import pickle
import pandas as pd
import numpy as np
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

class BestTitle():
    def __init__(self):
        animes = pd.read_csv('./animes_big.csv')
        feat = ['id',
            'name', 'russian', 'kind', 'score',
            'episodes', 'released_on','rates_scores_stats', 'rates_statuses_stats', 
            'rating', 'description',
            'duration', 'genres'
            ]
        df = animes[feat].copy()
        self.__df = df
        self.__sim_matrix = None
        
    def filter_votes(self,corpus):
        votes = 0
        translator = str.maketrans('', '', string.punctuation)
        for idx, word in enumerate(corpus):
            if 'value' in word:
                 votes += int(corpus[idx+1].translate(translator))
        return votes

    def filter_genres(self,corpus):
        genres = {}
        translator = str.maketrans('', '', string.punctuation)
        for idx, word in enumerate(corpus):
            if 'name' in word:
                name_genre = corpus[idx+1].translate(translator)
                id_genre = int(corpus[idx-1].translate(translator))
                genres[id_genre] = name_genre
        return genres

    def vector_genres(self, df):
        vector_genres = {}
        for idx, dicts in enumerate(df):
            tmp = dicts.keys()
            for i in tmp:
                if ~(i in vector_genres.keys()):
                    vector_genres[i] = dicts[i]
        return sorted(vector_genres)

    def build_vector(self, d, vector_genres):
        vector = []
        for i in vector_genres:
            if i in d:
                vector.append(1)
            else: 
                vector.append(0)
        return vector
    
    def weighted_rating(self, df, quantile=0.95):
        wr = []
        vote_counts = df['vote_count']
        C = df['score'].mean()
        v = df['vote_count']
        R = df['score']
        m = vote_counts.quantile(quantile)
        wr.append((v/(v+m) * R) + (m/(m+v) * C))
        return wr
    
    def genres_features(self):
        res = []
        n = self.vector_genres(self.__df['genres'])
        for i in self.__df['genres']:
            res.append(self.build_vector(i, n))
        res = np.array(res)
        return pd.concat([self.__df, pd.DataFrame(res)], axis=1)
        
    def fit(self, enc=LabelEncoder()):
        self.__df['genres'] = self.__df['genres'].str.split().apply(self.filter_genres)
        self.__df['vote_count'] = self.__df['rates_scores_stats'].str.split().apply(self.filter_votes)
        self.__df['wr'] = self.weighted_rating(self.__df)[0]
        self.__df['rating'] = enc.fit_transform(self.__df['rating'])
        self.__df['kind'] = enc.fit_transform(self.__df['kind'])
        self.__df = self.genres_features()
        smd = self.__df[['kind', 'episodes', 'rating', 'duration', 1, 2, 3, 4, 5, 6,
                            7,                      8,                      9,
                           10,                     11,                     12,
                           13,                     14,                     15,
                           16,                     17,                     18,
                           19,                     20,                     21,
                           22,                     23,                     24,
                           25,                     26,                     27,
                           28,                     29,                     30,
                           31,                     32,                     33,
                           34,                     35,                     36,
                           37,                     38,                     39,
                           40,                     41,                     42,
                           43,                     44 ]].copy().values
        self.__sim_matrix = sim_soup = cosine_similarity(smd, smd)

    def search(self, title):
        title = title.lower()
        corpus = self.__df['russian'].str.lower()
        means = []
        for idx, name in enumerate(corpus):
            if title == name:
                return self.predict(self.__df.iloc[idx]['russian'])
            else:
                for i in str(name).split():
                    if title == i:
                        means.append(name)
        if len(means) != 0:
            return means[:5]
        return 'i dont know such anime'
            
    def predict(self, user_id, title):
        indices = pd.Series(dict(zip(tuple(self.__df['russian']), self.__df.index)))
        titles = self.__df[['id','name', 'russian', 'wr', 'kind']]
        idx = indices[title]
        sim_scores = list(enumerate(self.__sim_matrix[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:31]
        title_indices = [i[0] for i in sim_scores]
        ans = titles.iloc[title_indices].sort_values(by='wr', ascending=False)
        ans = ans[:15]
        ans['est'] = ans['id'].apply(lambda x: svd.predict(user_id, x).est)
        ans = ans.sort_values('est', ascending=False)
        return ans

In [46]:
bt = BestTitle()
bt.fit()

In [47]:
bt.predict(250056, 'Наруто').head(10)

Unnamed: 0,id,name,russian,wr,kind,est
833,918,Gintama,Гинтама,8.120343,5,5.0
4694,6702,Fairy Tail,Хвост Феи,7.341083,5,5.0
734,813,Dragon Ball Z,Драконий жемчуг Зет,6.760623,5,5.0
494,527,Pokemon,Покемон,6.654126,5,5.0
12,22,Tennis no Oujisama,Принц тенниса,6.474668,5,5.0
1421,1565,Pokemon Diamond & Pearl,Покемон: Алмаз и жемчуг,6.150104,5,5.0
1420,1564,Pokemon Advanced Generation,Покемон: Современное поколение,6.114709,5,5.0
1181,1293,Urusei Yatsura,Несносные пришельцы,6.092257,5,5.0
451,481,Yu☆Gi☆Oh! Duel Monsters,Югио! Дуэльные монстры,6.074975,5,5.0
7192,15061,Aikatsu!,Айкацу!,6.041013,5,5.0


In [49]:
bt.predict(250056, 'Корона грешника')

Unnamed: 0,id,name,russian,wr,kind,est
2647,2904,Code Geass: Hangyaku no Lelouch R2,Код Гиас: Восставший Лелуш 2,8.598952,5,5
718,790,Ergo Proxy,Эрго Прокси,7.501525,5,5
51,71,Full Metal Panic!,Стальная тревога!,7.024763,5,5
7354,16049,Toaru Kagaku no Railgun S,Некий научный Рейлган S,7.020711,5,5
8,18,Initial D Fourth Stage,Инициал Ди: Стадия четвёртая,7.009735,5,5
243,267,Gungrave,Гангрейв: Убийца с того света,6.844383,5,5
16,26,Texhnolyze,Технолайз,6.756486,5,5
6992,13599,Robotics;Notes,Записки о робототехнике,6.568074,5,5
3895,4981,Casshern Sins,Грехи Кассяна,6.373664,5,5
847,935,Witchblade,Клинок ведьм,6.318731,5,5


In [50]:
bt.predict(250056, 'Токийский гуль').head(10)

Unnamed: 0,id,name,russian,wr,kind,est
6589,11741,Fate/Zero 2nd Season,Судьба/Начало 2,8.020666,5,5.0
804,889,Black Lagoon,Пираты «Чёрной лагуны»,7.680209,5,5.0
1379,1519,Black Lagoon: The Second Barrage,Пираты «Чёрной лагуны»: Второй залп,7.678749,5,5.0
202,226,Elfen Lied,Эльфийская песнь,7.303591,5,5.0
8510,21843,Shingeki no Bahamut: Genesis,Ярость Бахамута: Генезис,7.093256,5,5.0
7103,14345,Btooom!,Бтууум!,7.043954,5,5.0
9267,25183,Gangsta.,Гангста,7.041326,5,5.0
456,486,Kino no Tabi: The Beautiful World,Путешествие Кино: Прекрасный мир,6.904925,5,5.0
8256,20431,Hoozuki no Reitetsu,Хладнокровный Хозуки,6.82296,5,5.0
1566,1726,Devil May Cry,Дьявол может плакать,6.64757,5,5.0


In [51]:
bt.predict(250056, 'Тетрадь смерти').head(10)

Unnamed: 0,id,name,russian,wr,kind,est
221,245,Great Teacher Onizuka,Крутой учитель Онидзука,8.348833,5,5
7810,18179,Yowamushi Pedal,Трусливый велосипедист,6.783721,5,5
7627,17389,Kingdom 2nd Season,Царство 2,6.73697,5,5
6655,12031,Kingdom,Царство,6.691635,5,5
6003,9996,Hyouge Mono,Чудные вещи,5.9848,5,5
6885,13145,Cardfight!! Vanguard: Asia Circuit-hen,Карточные бои Авангарда 2,5.980173,5,5
2062,2252,Devilman,Человек-дьявол,5.960722,5,5
866,961,Virtua Fighter,Виртуальный боец,5.926646,5,5
1154,1264,Yoroiden Samurai Troopers,Чудотворные рыцари,5.926062,5,5
1260,1391,Future GPX Cyber Formula,Кибер-Формула ГПИкс нового времени,5.924311,5,5


In [30]:
bt.predict(28124, 'Тетрадь смерти').head(10)

Unnamed: 0,id,name,russian,wr,kind,est
221,245,Great Teacher Onizuka,Крутой учитель Онидзука,8.348833,5,5.0
7627,17389,Kingdom 2nd Season,Царство 2,6.73697,5,4.844247
6655,12031,Kingdom,Царство,6.691635,5,4.687324
5937,9882,High School Mystery: Gakuen Nanafushigi,Тайны средней школы: Семь школьных чудес,5.919227,5,4.631076
6804,12767,Yuusei Kamen,Планетная маска,5.91839,5,4.631076
4313,5829,Blocker Gundan IV Machine Blaster,Корпус блокировки IV,5.918487,5,4.631076
2213,2416,Grander Musashi RV,Великий Мусаси РВ,5.918575,5,4.631076
3379,3932,Magnerobo Ga-Keen,Магнетический робот Га-Кин,5.918575,5,4.631076
1514,1666,Babel Nisei,Вавилон Второй,5.918585,5,4.631076
3498,4121,Meimon! Daisan Yakyuu-bu,"Мэймон, бейсбольный клуб №3",5.918606,5,4.631076


In [27]:
svd.predict(1,1)

Prediction(uid=1, iid=1, r_ui=None, est=5, details={'was_impossible': False})

In [28]:
svd.predict(401,1)

Prediction(uid=401, iid=1, r_ui=None, est=5, details={'was_impossible': False})

In [29]:
svd.predict(16312,1)

Prediction(uid=16312, iid=1, r_ui=None, est=5, details={'was_impossible': False})

In [31]:
from lightfm.data import Dataset
from lightfm import LightFM

In [37]:
def get_coo_matrix(df, 
                   user_col='user_id', 
                   item_col='item_id', 
                   weight_col=None, 
                   users_mapping=users_mapping, 
                   items_mapping=items_mapping):
    if weight_col is None:
        weights = np.ones(len(df), dtype=np.float32)
    else:
        weights = df[weight_col].astype(np.float32)

    interaction_matrix = sp.coo_matrix((
        weights, 
        (
            df[user_col].map(users_mapping.get), 
            df[item_col].map(items_mapping.get)
        )
    ))
    return interaction_matrix

NameError: name 'users_mapping' is not defined

In [33]:
dataset.fit(df['user_id'].unique(), df['item_id'].unique())

In [36]:
model = LightFM(no_components=30)
model.fit(, epochs=20)

AttributeError: 'Dataset' object has no attribute 'tocoo'