In [1]:
import pandas as pd
import numpy as np
import re
import math
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from collections import Counter
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier


In [2]:
class OneHotEncoderWithLabels(TransformerMixin):
    """
    Takes named Series, OHEs it and return Data Frame
    with names
    """
    def __init__(self):
        """
        Initalizes new instance of KNNInlineImputer
        """
        TransformerMixin.__init__(self)
        self.ohe = OneHotEncoder(sparse=False)
    
    def fit(self, X, y=None):
        """
        Fits to X by creating data frame to be returned
        """
        basic_name = X.columns[0]
        values = sorted(list(set(X[basic_name].values)))
        self.res_df = pd.DataFrame(self.ohe.fit_transform(np.array(X[basic_name]).reshape(-1,1)))
        index = 0
        self.feature_names = values
        for col in self.res_df:
            self.res_df.rename(columns={col:str(basic_name + '_' + values[index])}, inplace=True)
            index += 1
        return self
        
    def transform(self, X, y=None):
        """
        Transforms X, but really returns Data Frame
        that created at 'fit' step
        """
        return self.res_df
    
    def get_feature_names(self):
        return self.feature_names

In [3]:
class ImputeSymbolsByMean(TransformerMixin):
    
    def __init__(self, missing_symbol=np.nan, rnd=False):
        self.miss_symbol = missing_symbol
        self.rnd = rnd
        self.imputer = SimpleImputer()
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_copy = X.copy()
        X_copy = [np.nan if x == '-' else x for x in X]
        X_copy = self.imputer.fit_transform(np.array(X_copy).reshape(-1,1))
        #if self.rnd:
        #    X_copy = pd.Series(X_copy[0]).apply(round)
        return X_copy

In [4]:
def nums_in_str(s):
    l = len(s)
    integ = []
    i = 0
    while i < l:
        s_int = ''
        a = s[i]
        while '0' <= a <= '9':
            s_int += a
            i += 1
            if i < l:
                a = s[i]
            else:
                break
        i += 1
        if s_int != '':
            integ.append(int(s_int))
    return integ

nums_in_str('Время107 мин. / 01:47')

[107, 1, 47]

In [5]:
def fix_dur(br_dur):
    if len(br_dur) > 0:
        return nums_in_str(br_dur)[0]
    else:
        return '-'

In [6]:
def fix_act(br_act):
    i = 1
    while i != len(br_act):
        if br_act[i-1].islower() and br_act[i].isupper() or br_act[i-1] == 'I' and br_act[i] != 'I' and br_act[i].isupper():
            br_act = br_act[:i] + ', ' + br_act[i:]
        i += 1
    return br_act

In [7]:
df = pd.read_csv('films2.csv', ';', encoding='cp1251')
df['year'] = [s[-4::] for s in df['year']]
df['country'] = [s[6::] for s in df['country']]
df['genre'] = [s[4::].replace('слова', '') for s in df['genre']]
df['director'] = [s[8::] for s in df['director']]
df['scripter'] = [s[8::] for s in df['scripter']]
df['producer'] = [str(s)[8::] for s in df['producer']]
df['premier_data'] = [str(s)[15:-5:] for s in df['premier_data']]
df['age_bound'] = [str(s)[7::] for s in df['age_bound']]
df['duration'] = [fix_dur(d) for d in df['duration']]
df['actors'] = [fix_act(a) for a in df['actors']]

for index, row in df.iterrows():
    if not row['year'].isdigit():
        df.drop(index, inplace=True)
        

#for index, row in df.iterrows():
#    if len(nums_in_str(row['duration'])) > 0:
#        row['duration'] = nums_in_str(row['duration'])[0]
#    else:
#        row['duration'] = '-'
for index, row in df.iterrows():
    row['scripter'] = row['scripter'].replace(', ...', '')
#for index, row in df.iterrows():
#    if '+' in row['title']:
#        del row

#for index, row in df.iterrows():      
#    row['rating'] = row['rating'].replace('%', '')
#    if row['rating'] != '–':
#        if float(row['rating']) > 10:
#            row['rating'] = float(row['rating'])/10
    
df = df.loc[~df['title'].isin(['18+', '16+', '6+', '12+', '0+'])]
      
del df['producer']
del df['premier_data']
del df['budget']
del df['Nan']
df = df.dropna()
df

Unnamed: 0,year,country,genre,director,scripter,age_bound,duration,title,actors,rating,rus_title
2,2003,"Новая Зеландия, США","фэнтези, приключения, драма",Питер Джексон,"Фрэн Уолш, Филиппа Бойенс, Питер Джексон, ...",12+,201,The Lord of the Rings: The Return of the King,"Тим Роббинс, Морган Фриман, Боб Гантон, Уильям...",8.6,Властелин колец: Возвращение короля (2003)
3,2002,"Новая Зеландия, США","фэнтези, приключения, драма",Питер Джексон,"Фрэн Уолш, Филиппа Бойенс, Стивен Синклер, ...",12+,179,The Lord of the Rings: The Two Towers,"Тим Роббинс, Морган Фриман, Боб Гантон, Уильям...",8.6,Властелин колец: Две крепости (2002)
4,2001,"Новая Зеландия, США","фэнтези, приключения, драма",Питер Джексон,"Фрэн Уолш, Филиппа Бойенс, Питер Джексон, ...",12+,178,The Lord of the Rings: The Fellowship of the Ring,"Тим Роббинс, Морган Фриман, Боб Гантон, Уильям...",8.6,Властелин колец: Братство Кольца (2001)
5,2014,"США, Великобритания, Канада","фантастика, драма, приключения",Кристофер Нолан,"Джонатан Нолан, Кристофер Нолан",16+,169,Interstellar,"Тим Роббинс, Морган Фриман, Боб Гантон, Уильям...",8.6,Интерстеллар (2014)
6,1994,США,"комедия, драма, история, мелодрама, военный",Роберт Земекис,"Эрик Рот, Уинстон Грум",12+,142,Forrest Gump,"Тим Роббинс, Морган Фриман, Боб Гантон, Уильям...",8.9,Форрест Гамп (1994)
...,...,...,...,...,...,...,...,...,...,...,...
914,2010,"Великобритания, Швеция, Дания, Ирландия","фантастика, драма, мелодрама",Дэвид Маккензи,Ким Фупц Окесон,16+,88,Perfect Sense,"Винсент Прайс, Франка Беттойя, Эмма Даниели, Д...",7.6,Последняя любовь на Земле
916,2004,США,"фантастика, триллер, драма, приключения",Роланд Эммерих,"Роланд Эммерих, Джеффри Начманофф",12+,124,The Day After Tomorrow,"Дон Мак, Келлар, Сандра ОРоберта Максвелл, Роб...",7.7,Послезавтра
922,1998,США,"фантастика, боевик, триллер, драма, мелодрама",Мими Ледер,"Брюс Джоэл Рубин, Майкл Толкин",12+,120,Deep Impact,"Щин Ха-гюн, Пэк Юн-щик, Хван Джон-мин, Ли Джэ-...",7.0,Столкновение с бездной
927,2007,США,"фантастика, боевик, триллер, драма, приключения",Френсис Лоуренс,"Марк Протосевич, Акива Голдсман, Джон Уильям К...",16+,96,I Am Legend,"Чарлтон Хестон, Энтони Цербе, Розалинд Кэш, По...",7.9,Я – легенда


In [8]:
df['country_'] = df['country'].apply(lambda x: x.split(', '))

counter = Counter([item for sublist in df['country_'].values for item in sublist])
top_amount = len(counter)
top_countries = counter.most_common(top_amount)
other_countries = list(counter.keys())[top_amount::]
print(other_countries)
for key, value in top_countries:
    df['country_' + key] = df['country_'].apply(lambda x : 1 if key in x else 0)


df['country_other'] = df['country_'].apply(lambda x : 1 if True in list(other_country in x for other_country in other_countries) else 0)



[]


In [9]:
df['genre_'] = df['genre'].apply(lambda x: x.split(', '))

counter = Counter([item for sublist in df['genre_'].values for item in sublist])
top_amount = len(counter)
top_countries = counter.most_common(top_amount)
other_countries = list(counter.keys())[top_amount::]
print(other_countries)
for key, value in top_countries:
    df['genre_' + key] = df['genre_'].apply(lambda x : 1 if key in x else 0)


df['genre_other'] = df['genre_'].apply(lambda x : 1 if True in list(other_country in x for other_country in other_countries) else 0)



[]


In [10]:
del df['genre']
del df['genre_']
del df['country']
del df['country_']

In [11]:
ohe_labels = OneHotEncoderWithLabels()
df = pd.concat([df, ohe_labels.fit_transform(df[['age_bound']])], axis=1)

In [12]:
df = df.dropna()

In [13]:
del df['age_bound']
del df['director']
del df['actors']
del df['scripter']

In [14]:
df

Unnamed: 0,year,duration,title,rating,rus_title,country_США,country_Великобритания,country_Германия,country_Франция,country_Канада,...,genre_музыка,genre_спорт,genre_вестерн,genre_фильм-нуар,genre_other,age_bound_0+,age_bound_12+,age_bound_16+,age_bound_18+,age_bound_6+
2,2003,201.0,The Lord of the Rings: The Return of the King,8.6,Властелин колец: Возвращение короля (2003),1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2002,179.0,The Lord of the Rings: The Two Towers,8.6,Властелин колец: Две крепости (2002),1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2001,178.0,The Lord of the Rings: The Fellowship of the Ring,8.6,Властелин колец: Братство Кольца (2001),1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,2014,169.0,Interstellar,8.6,Интерстеллар (2014),1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,1994,142.0,Forrest Gump,8.9,Форрест Гамп (1994),1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485,2011,96.0,The Guard,7.6,Однажды в Ирландии,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
486,2008,120.0,Slumdog Millionaire,7.7,Миллионер из трущоб,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
487,1954,112.0,Rear Window,8.0,Окно во двор,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
488,2017,164.0,Blade Runner 2049,7.7,Бегущий по лезвию 2049,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [15]:
imputer = ImputeSymbolsByMean(missing_symbol='-', rnd=True)
df['duration'] = imputer.fit_transform(df['duration'])
df['rating'] = imputer.fit_transform(df['rating'])

In [16]:
df

Unnamed: 0,year,duration,title,rating,rus_title,country_США,country_Великобритания,country_Германия,country_Франция,country_Канада,...,genre_музыка,genre_спорт,genre_вестерн,genre_фильм-нуар,genre_other,age_bound_0+,age_bound_12+,age_bound_16+,age_bound_18+,age_bound_6+
2,2003,201.0,The Lord of the Rings: The Return of the King,8.6,Властелин колец: Возвращение короля (2003),1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2002,179.0,The Lord of the Rings: The Two Towers,8.6,Властелин колец: Две крепости (2002),1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,2001,178.0,The Lord of the Rings: The Fellowship of the Ring,8.6,Властелин колец: Братство Кольца (2001),1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,2014,169.0,Interstellar,8.6,Интерстеллар (2014),1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,1994,142.0,Forrest Gump,8.9,Форрест Гамп (1994),1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485,2011,96.0,The Guard,7.6,Однажды в Ирландии,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
486,2008,120.0,Slumdog Millionaire,7.7,Миллионер из трущоб,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
487,1954,112.0,Rear Window,8.0,Окно во двор,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
488,2017,164.0,Blade Runner 2049,7.7,Бегущий по лезвию 2049,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [17]:
minmax_scaler = MinMaxScaler()
df[['rating', 'duration', 'year']] = minmax_scaler.fit_transform(df[['rating', 'duration', 'year']])
df
df_no_titles = df.copy()
del df_no_titles['title']
del df_no_titles['rus_title']
df_no_titles

Unnamed: 0,year,duration,rating,country_США,country_Великобритания,country_Германия,country_Франция,country_Канада,country_Япония,country_Гонконг,...,genre_музыка,genre_спорт,genre_вестерн,genre_фильм-нуар,genre_other,age_bound_0+,age_bound_12+,age_bound_16+,age_bound_18+,age_bound_6+
2,0.780488,0.790816,0.785714,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.768293,0.678571,0.785714,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.756098,0.673469,0.785714,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,0.914634,0.627551,0.785714,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,0.670732,0.489796,1.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485,0.878049,0.255102,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
486,0.841463,0.377551,0.142857,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
487,0.182927,0.336735,0.357143,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
488,0.951220,0.602041,0.142857,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [18]:
def div_list_by(lst, num):
    for i in range(len(lst)):
        lst[i] /= num
    return lst

In [19]:
def distance(x,y):
    return math.sqrt(sum(x**2-y**2))

In [20]:
def weight(rating):
    return 3.2 - 0.35*rating

In [21]:
def user_with_films_to_footprint(film_list):
    curr_user_footprint = [0]*(df_films.shape[1] - 2)
    
    films_watched_by_user = df[df['rus_title'].isin(film_list)]
    films_watched_by_user_found_count = 0
    for index, row in films_watched_by_user.iterrows():
        f = df_films[df_films['rus_title'] == row['rus_title']]
        for index, row in f.iterrows():
            curr_user_footprint += row.drop(['rus_title', 'title'])
            films_watched_by_user_found_count += len(f)
    if films_watched_by_user_found_count > 0:
        curr_user_footprint = div_list_by(curr_user_footprint, films_watched_by_user_found_count)
    new_col = pd.DataFrame(np.array(curr_user_footprint).reshape(1,-1))
    return curr_user_footprint

In [22]:
df_users = pd.read_csv('users.csv', ';', encoding='cp1251')
df_films = df

In [31]:
df['rus_title'].values

array(['Властелин колец: Возвращение короля (2003)',
       'Властелин колец: Две крепости (2002)',
       'Властелин колец: Братство Кольца (2001)', 'Интерстеллар (2014)',
       'Форрест Гамп (1994)', 'Король Лев (1994)', 'Тайна Коко (2017)',
       'Список Шиндлера (1993)', 'ВАЛЛ·И (2008)',
       'Криминальное чтиво (1994)', 'Назад в будущее (1985)',
       'Начало (2010)', 'Унесённые призраками (2001)', '1+1 (2011)',
       'Большой куш (2000)', 'Остров проклятых (2009)', 'Матрица (1999)',
       'Пираты Карибского моря: Проклятие Черной жемчужины (2003)',
       'Бойцовский клуб (1999)', 'Душа (2020)', 'Шрэк (2001)',
       'Престиж (2006)', 'Один дома (1990)', 'Темный рыцарь (2008)',
       'Невидимая сторона (2009)', 'Гарри Поттер и узник Азкабана (2004)',
       'Назад в будущее 2 (1989)', 'Игры разума (2001)',
       'Гарри Поттер и философский камень (2001)', 'Гладиатор (2000)',
       'Одержимость (2013)', 'Джанго освобожденный (2012)',
       'Твоё имя (2016)', 'Зеленая кн

In [35]:
watched_films = ['Тор: Рагнарёк (2017)',
                 'Индиана Джонс: В поисках утраченного ковчега (1981)',
                 'Бегущий по лезвию (1982)',
                 'Назад в будущее (1985)',
                 'Назад в будущее 2 (1989)',
                 'Назад в будущее 3 (1990)',
                 'Железный человек (2008)',
                 'Далласский клуб покупателей (2013)',
                 'Мстители (2012)',
                 'Королевство полной луны (2012)',
                 'Мост в Терабитию (2006)',
                 'Стражи Галактики (2014)',
                 'Стражи Галактики. Часть 2 (2017)',
                 'Криминальное чтиво (1994)',
                 'Бойцовский клуб (1999)',
                 'Твоё имя (2016)',
                 'Начало (2010)',
                 'Ковбой Бибоп (2001)'
                ]

In [24]:
def predict_similar_films(watched_films, n_to_return=10):
    # 1. taking mean 'footprint' of the user (input data to fit)
    # 2. creating the same for all users (dataset2)
    # 3. finding N nearest neighbors from dataset2
    # 4. creating Counter of their films (dataset3)
    # 5. according to views and ratings give
    #    greater or lesser weights to films in dataset1
    # 6. finding K nearest weighted neighbors from dataset1
    #    for mean 'footprint' of the user
    global df_users
    all_ids = list(set(df_users['id']))
    all_ids

    user_footprints_df = pd.DataFrame(np.array([0]*(df_users.shape[1] - 2)).reshape(-1,1))

    curr_user_footprint = [0]*(df_films.shape[1] - 2)

    for user_id in all_ids:
        curr_user_footprint = [0]*(df_films.shape[1] - 2)
        films_watched_by_user = df_users[df_users['id'] == user_id]
        films_watched_by_user_found_count = 0
        for index, row in films_watched_by_user.iterrows():
            f = df_films[df_films['rus_title'] == row['rus_title']]
            for index, row in f.iterrows():
                curr_user_footprint += row.drop(['rus_title', 'title'])
                films_watched_by_user_found_count += len(f)
        if films_watched_by_user_found_count > 0:
            curr_user_footprint = div_list_by(curr_user_footprint, films_watched_by_user_found_count)
        new_col = pd.DataFrame(np.array(curr_user_footprint).reshape(1,-1))
        new_col.index = [user_id,]
        user_footprints_df = pd.concat([new_col, user_footprints_df], axis=0)

    user_footprints_df

    user_to_predict_from = user_with_films_to_footprint(watched_films)
    user_footprints_df = user_footprints_df.dropna()
    user_footprints_df
    df_users = df_users.dropna()

    similar_users_knn = KNeighborsClassifier(n_neighbors=10)
    similar_users_knn.fit(user_footprints_df, user_footprints_df.index)
    indexes = similar_users_knn.kneighbors(X=np.array(user_to_predict_from).reshape(1,-1))[1]
    near_users_ids = user_footprints_df.iloc[indexes[0]].index

    films_near_users = []
    for near_user_id in near_users_ids:
        films_near_users.append((near_user_id, list(df_users[df_users['id'] == near_user_id]['rus_title'])))
    films_to_ratings = []

    for user_id, user_lst in films_near_users:
        for film in user_lst:
            films_to_ratings.append((film, df_users[(df_users['id'] == user_id) &
                                                    (df_users['rus_title'] == film)].iloc[0]['rating']
                                    ))

    df['weights'] = [1.]*df.shape[0]
    df['weights'].astype(np.float64)
    for film_plus_rating in films_to_ratings:
        if df[df['rus_title'] == film_plus_rating[0]].shape[0] > 0:
            ind = df.loc[df['rus_title'] == film_plus_rating[0]].index[0]
            df.at[ind, 'weights'] += weight(film_plus_rating[1])

    index_to_distance = []
    for index, row in df_no_titles.iterrows():
        distance = 0
        for i in range(len(film)):
            distance += (row[i] - user_to_predict_from[i]) ** 2
        distance *= df.loc[index]['weights']
        index_to_distance.append((df.loc[index]['rus_title'], distance))
    index_to_distance = sorted(index_to_distance, key=lambda x : x[1])

    return index_to_distance[:n_to_return]

In [40]:
predict_similar_films(watched_films, n_to_return=15)

[('Назад в будущее 2 (1989)', 0.01354457102419131),
 ('Военный ныряльщик (2000)', 0.0218700099134486),
 ('Как приручить дракона (2010)', 0.02743762012993949),
 ('Терминал (2004)', 0.029265886493835505),
 ('Одиннадцать друзей Оушена (2001)', 0.029917061344995426),
 ('Красота по-американски (1999)', 0.03124064313964553),
 ('Малышка на миллион (2004)', 0.03199623033205106),
 ('Назад в будущее (1985)', 0.033842909313239675),
 ('Умница Уилл Хантинг (1997)', 0.03391596247926114),
 ('Рататуй (2007)', 0.03512978949776428),
 ('Октябрьское небо (1999)', 0.03593776008166044),
 ('Пираты Карибского моря: Сундук мертвеца (2006)', 0.03730213812010362),
 ('Звёздные войны: Эпизод 3 — Месть ситхов (2005)', 0.03863687863530474),
 ('Шрэк (2001)', 0.03881528275361214),
 ('Дневник памяти (2004)', 0.03890307470245222)]