In [1]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Ensure the "model" directory exists before anything is written into it;
# exist_ok=True makes re-running this cell harmless.
os.makedirs(name="model", exist_ok=True)

# Read the raw movie table from the CSV file.
df = pd.read_csv(filepath_or_buffer="data/n_movies.csv")

# Report how many rows were loaded, then preview the first five.
print(f"Загружено фильмов: {len(df)}")
df.head(5)

Загружено фильмов: 9957


Unnamed: 0,title,year,certificate,duration,genre,rating,description,stars,votes
0,Cobra Kai,(2018– ),TV-14,30 min,"Action, Comedy, Drama",8.5,Decades after their 1984 All Valley Karate Tou...,"['Ralph Macchio, ', 'William Zabka, ', 'Courtn...",177031
1,The Crown,(2016– ),TV-MA,58 min,"Biography, Drama, History",8.7,Follows the political rivalries and romance of...,"['Claire Foy, ', 'Olivia Colman, ', 'Imelda St...",199885
2,Better Call Saul,(2015–2022),TV-MA,46 min,"Crime, Drama",8.9,The trials and tribulations of criminal lawyer...,"['Bob Odenkirk, ', 'Rhea Seehorn, ', 'Jonathan...",501384
3,Devil in Ohio,(2022),TV-MA,356 min,"Drama, Horror, Mystery",5.9,When a psychiatrist shelters a mysterious cult...,"['Emily Deschanel, ', 'Sam Jaeger, ', 'Gerardo...",9773
4,Cyberpunk: Edgerunners,(2022– ),TV-MA,24 min,"Animation, Action, Adventure",8.6,A Street Kid trying to survive in a technology...,"['Zach Aguilar, ', 'Kenichiro Ohashi, ', 'Emi ...",15413


In [2]:
# Capture the first run of four digits in the raw "year" text
# (e.g. "(2018– )" -> 2018.0) and store it as a float column.
# expand=False yields a Series for the single capture group.
df["year_num"] = (
    df["year"]
    .str.extract(r'(\d{4})', expand=False)
    .astype(float)
)

In [3]:
# Split each ", "-separated genre string into a list and flatten it so that
# every (movie, genre) pair becomes its own row.
all_genres = df["genre"].str.split(", ").explode()

# value_counts() sorts by descending frequency, so the first 15 entries
# are the 15 most common genres.
top_genres = all_genres.value_counts().iloc[:15]

In [4]:
# Rebuild the flattened one-genre-per-row series from the "genre" column.
all_genres = df["genre"].str.split(", ").explode()

# Full frequency table: how many times each genre occurs.
genre_freq = all_genres.value_counts()

# Single space-separated string of every genre occurrence, skipping
# missing values.
text = " ".join(all_genres.dropna().tolist())

In [5]:
# Keep the first row for every title, renumber the index from zero, and
# replace missing text in the three source columns with empty strings so
# the concatenation below never produces NaN.
df = (
    df.drop_duplicates(subset="title")
    .reset_index(drop=True)
    .fillna({"genre": "", "description": "", "stars": ""})
)

# Build one text field per movie: genres, cast, and description.
# The "stars" values look like a stringified list, so brackets, quotes and
# commas are replaced with spaces first; afterwards whitespace runs are
# collapsed to a single space and the ends are trimmed.
df["features"] = (
    (
        df["genre"]
        + " "
        + df["stars"].str.replace(r'[\[\]\'\",]', ' ', regex=True)
        + " "
        + df["description"]
    )
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

print(f"После очистки осталось: {len(df)} фильмов")
df.loc[:, ["title", "features"]].head()

После очистки осталось: 7912 фильмов


Unnamed: 0,title,features
0,Cobra Kai,"Action, Comedy, Drama Ralph Macchio William Za..."
1,The Crown,"Biography, Drama, History Claire Foy Olivia Co..."
2,Better Call Saul,"Crime, Drama Bob Odenkirk Rhea Seehorn Jonatha..."
3,Devil in Ohio,"Drama, Horror, Mystery Emily Deschanel Sam Jae..."
4,Cyberpunk: Edgerunners,"Animation, Action, Adventure Zach Aguilar Keni..."


In [6]:
# Turn the combined text features into TF-IDF vectors, ignoring
# scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(df["features"])

# Pairwise cosine similarity between every pair of movies
# (square matrix, one row and one column per movie).
similarity_matrix = cosine_similarity(tfidf_matrix)

print(
    "Модель готова!",
    f"Матрица сходства: {similarity_matrix.shape}",
    sep="\n",
)

Модель готова!
Матрица сходства: (7912, 7912)


In [7]:
# Persist everything needed to serve recommendations later:
# the similarity matrix, the ordered list of titles, the fitted
# vectorizer, and the cleaned movie table.
np.save("model/similarity_matrix.npy", similarity_matrix)

# Use context managers so each file is flushed and closed deterministically;
# passing a bare open(...) to pickle.dump leaves the handle open until it is
# garbage-collected.
with open("model/titles.pkl", "wb") as titles_file:
    pickle.dump(df["title"].tolist(), titles_file)

with open("model/vectorizer.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

df.to_pickle("model/movies_data.pkl")

print("Всё сохранено в папку model!")

Всё сохранено в папку model!
