# Phase 2: Analyse des données

In [1]:
from ibmoviesdk import MovieClient, MovieConfig

* 'orm_mode' has been renamed to 'from_attributes'


In [3]:
config = MovieConfig(movie_base_url="https://data-movie-app-back.onrender.com")
client = MovieClient(config=config)

client.health_check()

MOVIE_API_BASE_URL in MovieConfig init: https://data-movie-app-back.onrender.com


{'message': 'API MovieLens Opérationnelle'}

In [4]:
movie = client.get_movie(20)
print(f"Titre : {movie.title}")
print(f"Genre: {movie.genres}")

Titre : Money Train (1995)
Genre: Action|Comedy|Crime|Drama|Thriller


In [5]:
ratings_df = client.list_ratings(limit=10, output_format="pandas")
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     10 non-null     int64  
 1   movieId    10 non-null     int64  
 2   rating     10 non-null     float64
 3   timestamp  10 non-null     int64  
dtypes: float64(1), int64(3)
memory usage: 452.0 bytes


In [7]:
analytics = client.get_analytics()
print(analytics)

movie_count=9742 rating_count=100836 tag_count=100836 link_count=9742


In [8]:
import time
import pandas as pd


total_ratings = client.get_analytics().rating_count
batch_size = 1000
all_ratings = []

for skip in range(0, total_ratings, batch_size):
    print(f"Fetching ratings from {skip} to {skip + batch_size}")
    batch_df  = client.list_ratings(limit=batch_size, skip=skip, output_format="pandas")
    all_ratings.append(batch_df)
    time.sleep(0.5)  # To avoid hitting the rate limit
    complete_ratings_df = pd.concat(all_ratings, ignore_index=True)
    # complete_ratings_df

Fetching ratings from 0 to 1000
Fetching ratings from 1000 to 2000
Fetching ratings from 2000 to 3000
Fetching ratings from 3000 to 4000
Fetching ratings from 4000 to 5000
Fetching ratings from 5000 to 6000
Fetching ratings from 6000 to 7000
Fetching ratings from 7000 to 8000
Fetching ratings from 8000 to 9000
Fetching ratings from 9000 to 10000
Fetching ratings from 10000 to 11000
Fetching ratings from 11000 to 12000
Fetching ratings from 12000 to 13000
Fetching ratings from 13000 to 14000
Fetching ratings from 14000 to 15000
Fetching ratings from 15000 to 16000
Fetching ratings from 16000 to 17000
Fetching ratings from 17000 to 18000
Fetching ratings from 18000 to 19000
Fetching ratings from 19000 to 20000
Fetching ratings from 20000 to 21000
Fetching ratings from 21000 to 22000
Fetching ratings from 22000 to 23000
Fetching ratings from 23000 to 24000
Fetching ratings from 24000 to 25000
Fetching ratings from 25000 to 26000
Fetching ratings from 26000 to 27000
Fetching ratings from 2

In [9]:
# Concatenate all the batches into a single DataFrame
complete_ratings_df = pd.concat(all_ratings, ignore_index=True)
complete_ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [10]:
rating_per_user = complete_ratings_df['userId'].value_counts().rename_axis('userId').reset_index(name='rating_count')
rating_per_user

Unnamed: 0,userId,rating_count
0,414,2698
1,599,2478
2,474,2108
3,448,1864
4,274,1346
...,...,...
605,442,20
606,278,20
607,147,20
608,320,20


In [11]:
total_ratings = client.get_analytics().rating_count
batch_size = 1000

from collections import defaultdict
user_rating_counts = defaultdict(int)

for skip in range(0, total_ratings, batch_size):
    print(f"Fetching ratings from {skip} to {skip + batch_size}...")
    batch_df  = client.list_ratings(limit=batch_size, skip=skip, output_format="pandas")
    
    batch_countS = batch_df['userId'].value_counts()


    for user_id, count in batch_countS.items():
        user_rating_counts[user_id] += count

    time.sleep(0.5)  # To avoid hitting the rate limit
# Convert the defaultdict to a DataFrame
rating_per_user = pd.DataFrame(list(user_rating_counts.items()), columns=['userId', 'rating_count'])
rating_per_user

Fetching ratings from 0 to 1000...
Fetching ratings from 1000 to 2000...
Fetching ratings from 2000 to 3000...
Fetching ratings from 3000 to 4000...
Fetching ratings from 4000 to 5000...
Fetching ratings from 5000 to 6000...
Fetching ratings from 6000 to 7000...
Fetching ratings from 7000 to 8000...
Fetching ratings from 8000 to 9000...
Fetching ratings from 9000 to 10000...
Fetching ratings from 10000 to 11000...
Fetching ratings from 11000 to 12000...
Fetching ratings from 12000 to 13000...
Fetching ratings from 13000 to 14000...
Fetching ratings from 14000 to 15000...
Fetching ratings from 15000 to 16000...
Fetching ratings from 16000 to 17000...
Fetching ratings from 17000 to 18000...
Fetching ratings from 18000 to 19000...
Fetching ratings from 19000 to 20000...
Fetching ratings from 20000 to 21000...
Fetching ratings from 21000 to 22000...
Fetching ratings from 22000 to 23000...
Fetching ratings from 23000 to 24000...
Fetching ratings from 24000 to 25000...
Fetching ratings from 

Unnamed: 0,userId,rating_count
0,6,314
1,1,232
2,4,216
3,7,152
4,5,44
...,...,...
605,604,100
606,608,831
607,607,187
608,610,1302


In [12]:
# trier les utilisateurs par le nombre de notes
rating_per_user = rating_per_user.sort_values(by='rating_count', ascending=False)
# afficher les 10 utilisateurs ayant le plus de notes
top_10_users = rating_per_user.head(10)
print(top_10_users)

     userId  rating_count
410     414          2698
597     599          2478
471     474          2108
438     448          1864
268     274          1346
608     610          1302
66       68          1260
373     380          1218
603     606          1115
287     288          1055


## Question Business pertinentes

## Quels genres de films les utilisateurs taguent le plus positivement(note>=4)?

In [13]:
chunk_size = 1000
skip = 0
all_high_ratings = []

while True:
    chunk = client.list_ratings(
        skip=skip,
        limit=chunk_size,
        min_rating=4.0,
        output_format="pandas"
    )
    if chunk.empty:
        break

    all_high_ratings.append(chunk)
    skip += chunk_size
    time.sleep(0.5)  # To avoid hitting the rate limit

    high_rating_df = pd.concat(all_high_ratings, ignore_index=True)
    print(high_rating_df.shape)
    high_rating_df.head() 

(1000, 4)
(2000, 4)
(3000, 4)
(4000, 4)
(5000, 4)
(6000, 4)
(7000, 4)
(8000, 4)
(9000, 4)
(10000, 4)
(11000, 4)
(12000, 4)
(13000, 4)
(14000, 4)
(15000, 4)
(16000, 4)
(17000, 4)
(18000, 4)
(19000, 4)
(20000, 4)
(21000, 4)
(22000, 4)
(23000, 4)
(24000, 4)
(25000, 4)
(26000, 4)
(27000, 4)
(28000, 4)
(29000, 4)
(30000, 4)
(31000, 4)
(32000, 4)
(33000, 4)
(34000, 4)
(35000, 4)
(36000, 4)
(37000, 4)
(38000, 4)
(39000, 4)
(40000, 4)
(41000, 4)
(42000, 4)
(43000, 4)
(44000, 4)
(45000, 4)
(46000, 4)
(47000, 4)
(48000, 4)
(48580, 4)


In [14]:
# Supprimer les doublons
user_movie_pairs = high_rating_df[['userId', 'movieId']].drop_duplicates()
user_movie_pairs

Unnamed: 0,userId,movieId
0,1,1
1,1,3
2,1,6
3,1,47
4,1,50
...,...,...
48575,610,166528
48576,610,166534
48577,610,168248
48578,610,168250


In [15]:
# Récupérer tous les tags correspondant aux paires utilisateur-film

all_tags = []
chunk_size = 1000
skip = 0

while True:
    tag_chunk = client.list_tags(skip=skip, limit=chunk_size, output_format="pandas")
    if tag_chunk.empty:
        break
    all_tags.append(tag_chunk)
    skip += chunk_size
    time.sleep(0.5)  # To avoid hitting the rate limit
    all_tags_df = pd.concat(all_tags, ignore_index=True)

    all_tags_df

In [16]:
tagged_high_ratings = pd.merge(
    user_movie_pairs,
    all_tags_df,
    on=['userId', 'movieId'])
print(tagged_high_ratings.shape)
tagged_high_ratings.head()

(2378, 4)


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [17]:
# Récupérer les genres des films

def get_movie_genres(movie_id):
    try:
        movie = client.get_movie(movie_id)
        return movie.genres
    except Exception as e:
        print(f"Error fetching genres for movie {movie_id}: {e}")
        return ""

# Appliquer uniquement aux movieId uniques qu'on a dans tagged_high_ratings
unique_movie_ids = tagged_high_ratings['movieId'].unique()

movie_genres = {
    movie_id: get_movie_genres(movie_id) for movie_id in unique_movie_ids
}

# " Ajoutons la colonne genres à tagged_high_ratings"
tagged_high_ratings['genres'] = tagged_high_ratings['movieId'].map(movie_genres)
print(tagged_high_ratings.shape)
tagged_high_ratings

(2378, 5)


Unnamed: 0,userId,movieId,tag,timestamp,genres
0,2,60756,funny,1445714994,Comedy
1,2,60756,Highly quotable,1445714996,Comedy
2,2,60756,will ferrell,1445714992,Comedy
3,2,89774,Boxing story,1445715207,Drama
4,2,89774,MMA,1445715200,Drama
...,...,...,...,...,...
2373,606,6107,World War II,1178473747,Drama|War
2374,606,7382,for katie,1171234019,Drama|Mystery|Thriller
2375,610,3265,gun fu,1493843984,Action|Crime|Drama|Thriller
2376,610,3265,heroic bloodshed,1493843978,Action|Crime|Drama|Thriller


In [18]:
tagged_high_ratings

Unnamed: 0,userId,movieId,tag,timestamp,genres
0,2,60756,funny,1445714994,Comedy
1,2,60756,Highly quotable,1445714996,Comedy
2,2,60756,will ferrell,1445714992,Comedy
3,2,89774,Boxing story,1445715207,Drama
4,2,89774,MMA,1445715200,Drama
...,...,...,...,...,...
2373,606,6107,World War II,1178473747,Drama|War
2374,606,7382,for katie,1171234019,Drama|Mystery|Thriller
2375,610,3265,gun fu,1493843984,Action|Crime|Drama|Thriller
2376,610,3265,heroic bloodshed,1493843978,Action|Crime|Drama|Thriller


In [19]:
# Aggregation finale: genre, tag count

# separer les genres en plusieurs lignes s'ils sont séparés par "|"
tagged_high_ratings['genres'] = tagged_high_ratings['genres'].str.split('|')
tagged_exploded = tagged_high_ratings.explode('genres')

tagged_exploded

Unnamed: 0,userId,movieId,tag,timestamp,genres
0,2,60756,funny,1445714994,Comedy
1,2,60756,Highly quotable,1445714996,Comedy
2,2,60756,will ferrell,1445714992,Comedy
3,2,89774,Boxing story,1445715207,Drama
4,2,89774,MMA,1445715200,Drama
...,...,...,...,...,...
2376,610,3265,heroic bloodshed,1493843978,Drama
2376,610,3265,heroic bloodshed,1493843978,Thriller
2377,610,168248,Heroic Bloodshed,1493844270,Action
2377,610,168248,Heroic Bloodshed,1493844270,Crime


In [22]:
# compter les combianaisons genre-tag
genre_tag_counts = (
     tagged_exploded
     .groupby(['genres', 'tag'])
     .size()
     .reset_index(name='count')
     .sort_values(by='count', ascending=False)
)

genre_tag_counts

Unnamed: 0,genres,tag,count
1971,Drama,In Netflix queue,20
2159,Drama,atmospheric,19
4321,Thriller,twist ending,16
3280,Mystery,twist ending,14
4304,Thriller,suspense,14
...,...,...,...
18,Action,Borg,1
17,Action,Ben Stiller,1
4464,Western,oil,1
4463,Western,music,1
