In [1]:
import pandas as pd
import zipfile
import gzip
import numpy as np
from sqlalchemy import create_engine, String
from dbconfig import MY_DWH
host, port, user, password, database = MY_DWH()

## IMDb

In [2]:
imdb_name = r'G:\data\name.basics.tsv.gz' # персонажи, профессии, имена
imdb_basic = r'G:\data\title.basics.tsv.gz' # фильмы, сериалы, даты выпуска, жанры
imdb_princip = r'G:\data\title.principals.tsv.gz' # участники
imdb_ratings = r'G:\data\title.ratings.tsv.gz' # рейтинг фильма, кол-во голосов

In [3]:
with gzip.open(imdb_name, "rt", encoding="utf-8") as f:
    df_imdb_name = pd.read_csv(f, sep="\t") 

df_imdb_name.head()

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899,1987,"actor,miscellaneous,producer","tt0072308,tt0050419,tt0027125,tt0031983"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack,archive_footage","tt0037382,tt0075213,tt0117057,tt0038355"
2,nm0000003,Brigitte Bardot,1934,\N,"actress,music_department,producer","tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,1949,1982,"actor,writer,music_department","tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor","tt0050986,tt0069467,tt0050976,tt0083922"


In [4]:
with gzip.open(imdb_basic, "rt", encoding="utf-8") as f:
    df_imdb_basic = pd.read_csv(f, sep="\t", low_memory=False) 

df_imdb_basic.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Poor Pierrot,Pauvre Pierrot,0,1892,\N,5,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,Short


In [5]:
unique_imdb_genres = sorted(set(genre for sublist in df_imdb_basic['genres'].dropna().str.split(',') for genre in sublist))
unique_imdb_genres

['Action',
 'Adult',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western',
 '\\N']

Примеры фильтрации

In [None]:
genre_to_filter = 'Comedy'
filtered_comedy = df_imdb_basic[df_imdb_basic['genres'].str.contains(genre_to_filter, na=False)]

genres = ['Comedy', 'Romance']
romcomedy = df_imdb_basic[df_imdb_basic['genres'].str.contains('|'.join(genres), na=False)]

In [6]:
with gzip.open(imdb_princip, "rt", encoding="utf-8") as f:
    df_imdb_princip = pd.read_csv(f, sep="\t") 

df_imdb_princip.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0005690,producer,producer,\N
3,tt0000001,4,nm0374658,cinematographer,director of photography,\N
4,tt0000002,1,nm0721526,director,\N,\N


In [7]:
with gzip.open(imdb_ratings, "rt", encoding="utf-8") as f:
    df_imdb_ratings = pd.read_csv(f, sep="\t") 

df_imdb_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2130
1,tt0000002,5.6,289
2,tt0000003,6.4,2166
3,tt0000004,5.3,184
4,tt0000005,6.2,2892


## Movies Of MovieLens

In [8]:
movielens = pd.read_csv(r'Aggregated_Movies_Of_MovieLens.csv', dtype='object')
movielens.head()

Unnamed: 0,movieId,imdbId,tmdbId,title,genres
0,1,114709,862,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,113497,8844,Jumanji (1995),Adventure|Children|Fantasy
2,3,113228,15602,Grumpier Old Men (1995),Comedy|Romance
3,4,114885,31357,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,113041,11862,Father of the Bride Part II (1995),Comedy


In [9]:
movielens.shape

(90592, 5)

Добавляем tt к IMDb_ID

In [10]:
movielens['ttimdbId'] = 'tt' + movielens['imdbId'].astype(str)
movielens.head()

Unnamed: 0,movieId,imdbId,tmdbId,title,genres,ttimdbId
0,1,114709,862,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,tt0114709
1,2,113497,8844,Jumanji (1995),Adventure|Children|Fantasy,tt0113497
2,3,113228,15602,Grumpier Old Men (1995),Comedy|Romance,tt0113228
3,4,114885,31357,Waiting to Exhale (1995),Comedy|Drama|Romance,tt0114885
4,5,113041,11862,Father of the Bride Part II (1995),Comedy,tt0113041


### MovieLens + IMDb

In [14]:
data1 = pd.merge(movielens, df_imdb_basic, left_on='ttimdbId', right_on='tconst', how='left', suffixes=('_movielens', '_imdbasic'))
data1.shape

(90592, 15)

In [15]:
data1.head()

Unnamed: 0,movieId,imdbId,tmdbId,title,genres_movielens,ttimdbId,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres_imdbasic
0,1,114709,862,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,tt0114709,tt0114709,movie,Toy Story,Toy Story,0,1995,\N,81,"Adventure,Animation,Comedy"
1,2,113497,8844,Jumanji (1995),Adventure|Children|Fantasy,tt0113497,tt0113497,movie,Jumanji,Jumanji,0,1995,\N,104,"Adventure,Comedy,Family"
2,3,113228,15602,Grumpier Old Men (1995),Comedy|Romance,tt0113228,tt0113228,movie,Grumpier Old Men,Grumpier Old Men,0,1995,\N,101,"Comedy,Romance"
3,4,114885,31357,Waiting to Exhale (1995),Comedy|Drama|Romance,tt0114885,tt0114885,movie,Waiting to Exhale,Waiting to Exhale,0,1995,\N,124,"Comedy,Drama,Romance"
4,5,113041,11862,Father of the Bride Part II (1995),Comedy,tt0113041,tt0113041,movie,Father of the Bride Part II,Father of the Bride Part II,0,1995,\N,106,"Comedy,Family,Romance"


## TMDb - parse

In [12]:
tmdb_movie = pd.read_csv(r'TMDB_TOTAL.csv', sep=';', dtype={'tmdb_id': str}, low_memory=False)
tmdb_movie.shape

(382168, 13)

In [16]:
tmdb_movie.head()

Unnamed: 0,tmdb_id,name,overview,release_date,vote_average,vote_count,popularity,original_language,type,genres,production_companies,network_id,network_name
0,40605,Die Harald Schmidt Show,The Harald Schmidt Show is a German late night...,1995-12-05,7.0,16,349.5288,de,tv,Reality,,163.0,SAT.1
1,14424,Young Hearts,Malhação is a Brazilian television series for ...,1995-04-24,6.1,31,250.4493,pt,tv,Soap,"Cintra Produções, Estúdios Globo",60.0,TV Globo
2,15844,A Kindred Spirit,A Kindred Spirit was a television drama series...,1995-05-15,0.0,0,128.6814,cn,tv,"Drama, Comedy",TVB,48.0,TVB Jade
3,46121,A Próxima Vítima,A young law student starts to work as a detect...,1995-03-13,7.6,23,87.54,pt,tv,"Soap, Crime, Drama",Estúdios Globo,60.0,TV Globo
4,97,The Drew Carey Show,Drew is an assistant director of personnel in ...,1995-09-13,6.55,159,85.9274,en,tv,Comedy,"Warner Bros. Television, Mohawk Productions",2.0,ABC


### MovieLens + IMDb + (TMDb - parse)

In [19]:
data2 = pd.merge(data1, tmdb_movie, left_on='tmdbId', right_on='tmdb_id', how='inner', suffixes=('__imdbasic', '_tmdbparse'))
data2.shape

(83903, 28)

In [20]:
data2.head()

Unnamed: 0,movieId,imdbId,tmdbId,title,genres_movielens,ttimdbId,tconst,titleType,primaryTitle,originalTitle,...,release_date,vote_average,vote_count,popularity,original_language,type,genres,production_companies,network_id,network_name
0,1,114709,862,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,tt0114709,tt0114709,movie,Toy Story,Toy Story,...,1995-11-22,7.968,18727,22.4492,en,movie,"Animation, Adventure, Family, Comedy",Pixar,,
1,2,113497,8844,Jumanji (1995),Adventure|Children|Fantasy,tt0113497,tt0113497,movie,Jumanji,Jumanji,...,1995-12-15,7.236,10707,2.9785,en,movie,"Adventure, Fantasy, Family","TriStar Pictures, Interscope Communications, T...",,
2,3,113228,15602,Grumpier Old Men (1995),Comedy|Romance,tt0113228,tt0113228,movie,Grumpier Old Men,Grumpier Old Men,...,1995-12-22,6.457,394,1.2765,en,movie,"Romance, Comedy","Lancaster Gate, Warner Bros. Pictures",,
3,3,113228,15602,Grumpier Old Men (1995),Comedy|Romance,tt0113228,tt0113228,movie,Grumpier Old Men,Grumpier Old Men,...,2007-11-10,0.0,0,0.765,en,tv,,,214.0,Sky One
4,4,114885,31357,Waiting to Exhale (1995),Comedy|Drama|Romance,tt0114885,tt0114885,movie,Waiting to Exhale,Waiting to Exhale,...,1995-12-22,6.284,171,1.2568,en,movie,"Comedy, Drama, Romance",20th Century Fox,,


## KinoPoisk

In [24]:
engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{database}')
with engine.connect() as conn:
    kino = pd.read_sql_table('MOVIES_BATCH', conn, schema='public', dtype_backend='numpy_nullable')

# csv_file = r'C:\Users\admin\Downloads\kinopoisk_movies.csv'
# kino = pd.read_csv(csv_file, sep=';', low_memory=False)
kino.shape

(79644, 53)

In [25]:
kino.columns

Index(['status', 'movieLength', 'id', 'type', 'name', 'year', 'genres',
       'countries', 'typeNumber', 'enName', 'ageRating', 'ratingMpaa',
       'updatedAt', 'createdAt', 'shortDescription', 'ticketsOnSale', 'top10',
       'top250', 'releaseYears', 'deletedAt', 'isSeries', 'seriesLength',
       'totalSeriesLength', 'networks', 'isTmdbChecked', 'fees.world.value',
       'fees.world.currency', 'fees.russia.value', 'fees.russia.currency',
       'fees.usa.value', 'fees.usa.currency', 'externalId.imdb',
       'externalId.tmdb', 'externalId.kpHD', 'rating.kp', 'rating.imdb',
       'rating.filmCritics', 'rating.russianFilmCritics', 'rating.await',
       'votes.kp', 'votes.imdb', 'votes.filmCritics',
       'votes.russianFilmCritics', 'votes.await', 'premiere.world',
       'premiere.russia', 'premiere.bluray', 'premiere.dvd', 'premiere.cinema',
       'premiere.digital', 'budget.value', 'budget.currency', 'audience'],
      dtype='object')

### MovieLens + IMDb + (TMDb - parse) + KinoPoisk

In [27]:
data3 = pd.merge(data2, kino, left_on='ttimdbId', right_on='externalId.imdb', how='inner', suffixes=('_tmdbparse', '_kino'))
data3.shape

(76023, 81)

## Для обогащения данных

### Актеры Кинопоиска

In [26]:
with engine.connect() as conn:
    actors = pd.read_sql_table('ACTORS_BATCH', conn, schema='public', dtype_backend='numpy_nullable')
actors.head()

Unnamed: 0,movie_id,actor_id,actor_name,actor_en_name,role,profession
0,313650,361485,Алис Ги,Alice Guy,,режиссеры
1,315565,1313301,Рэйчел Джиллет,Rachel Gillet,Little Red Riding Hood,актеры
2,315565,415210,Жорж Мельес,Georges Méliès,,режиссеры
3,315565,65078,Шарль Перро,Charles Perrault,,сценаристы
4,214609,372469,Эдвин С. Портер,Edwin S. Porter,,режиссеры


## MovieLens Ratings

In [93]:
averarings = pd.read_csv('Aggregated_AverageRating_Of_MovieLens.csv')
averarings.head()

Unnamed: 0,movieId,averageRating,numVotes
0,1,3.9,76813
1,2,3.3,30209
2,3,3.2,15820
3,4,2.9,3028
4,5,3.1,15801


In [94]:
ratings = pd.read_csv('Aggregated_Ratings_Of_MovieLens.csv', low_memory=False)
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,2008-11-03 17:52:19
1,1,110,4.0,2008-11-05 06:04:46
2,1,158,4.0,2008-11-03 17:31:43
3,1,260,4.5,2008-11-03 18:00:04
4,1,356,5.0,2008-11-03 17:58:39


In [95]:
ratings.shape

(101253874, 4)

## MovieLens Tags

In [96]:
tags = pd.read_csv('Aggregated_Tags_Of_MovieLens.csv', sep=';', dtype={'movieId': str, 'userId': str})
tags['timestamp'] = pd.to_datetime(tags['timestamp'], unit='s')
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,42915,275243,Baz Luhrmann,2023-10-12 23:56:23
1,42915,275243,Austin Butler,2023-10-12 23:56:21
2,42915,275243,1970s,2023-10-12 23:56:11
3,42915,275243,1960s,2023-10-12 23:56:08
4,42915,275243,elvis presley,2023-10-12 23:56:06


1. Самые популярные теги

In [97]:
# top_tags = tags['tag'].value_counts().head(20)
# top_tags.plot(kind='bar', title='Top 20 tags')

2. Топ тегов по фильмам

In [98]:
# top_tags_by_movie = tags.groupby('movieId')['tag'].apply(lambda x: x.value_counts().head(3)).reset_index(name='count')

3. Какие фильмы имеют больше всего тегов

In [99]:
# most_tagged_movies = tags.groupby('movieId')['tag'].count().sort_values(ascending=False).head(10)

4. Когда люди ставили больше всего тегов

In [100]:
# tags['year'] = tags['timestamp'].dt.year
# tags_per_year = tags.groupby('year')['tag'].count()
# tags_per_year.plot(title='Number of tags per year')

5. Поиск по тегам: например, найти все фильмы с тегом "psychology"

In [101]:
# psychology_movies = tags[tags['tag'].str.contains('psychology')]

In [206]:
# engine = create_engine(f'postgresql://{user}:{password}@{host}:{port}/{database}')
# with engine.connect() as conn:
#     kino_batch = pd.read_sql_table('MOVIES_BATCH', conn, schema='public', dtype_backend='numpy_nullable')

# # csv_file = r'C:\Users\admin\Downloads\kinopoisk_movies.csv'
# # kino_batch = pd.read_csv(csv_file, sep=';', low_memory=False)
# kino_batch.head()