### Supondo um filme com as seguintes características:

```python
{'Series_Title': 'The Shawshank Redemption',
 'Released_Year': '1994',
 'Certificate': 'A',
 'Runtime': '142 min',
 'Genre': 'Drama',
 'Overview': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.',
 'Meta_score': 80.0,
 'Director': 'Frank Darabont',
 'Star1': 'Tim Robbins',
 'Star2': 'Morgan Freeman',
 'Star3': 'Bob Gunton',
 'Star4': 'William Sadler',
 'No_of_Votes': 2343110,
 'Gross': '28,341,469'}
```
<br>
#### Calcular IMDB

In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the movie dataset used by every cell below.
CSV_PATH = "../data/cinema_processed.csv"
df = pd.read_csv(CSV_PATH)

In [3]:
from sklearn.preprocessing import MultiLabelBinarizer

# Multi-hot encode the comma-separated "Genre" column: one 0/1 indicator
# column per distinct genre, appended to `df`.
genre_lists = df['Genre'].str.split(', ')

mlb = MultiLabelBinarizer()
genre_ohe = mlb.fit_transform(genre_lists)

genre_df = pd.DataFrame(
    genre_ohe,
    columns=mlb.classes_,
    index=df.index
)

# Re-running this cell used to append a second copy of every genre column.
# Remove any indicator columns left over from a previous run before
# concatenating so the cell is idempotent.
df = df.drop(columns=list(mlb.classes_), errors='ignore')
df = pd.concat([df, genre_df], axis=1)

In [5]:
# Display the frame to inspect the appended per-genre indicator columns.
df

Unnamed: 0,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,The Godfather,1972,A,175,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,...,0,0,0,0,0,0,0,0,0,0
1,The Dark Knight,2008,UA,152,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,...,0,0,0,0,0,0,0,0,0,0
2,The Godfather: Part II,1974,A,202,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,...,0,0,0,0,0,0,0,0,0,0
3,12 Angry Men,1957,U,96,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,...,0,0,0,0,0,0,0,0,0,0
4,The Lord of the Rings: The Return of the King,2003,U,201,"Action, Adventure, Drama",8.9,Gandalf and Aragorn lead the World of Men agai...,94.0,Peter Jackson,Elijah Wood,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
893,Blowup,1966,A,111,"Drama, Mystery, Thriller",7.6,A fashion photographer unknowingly captures a ...,82.0,Michelangelo Antonioni,David Hemmings,...,0,0,0,1,0,0,0,1,0,0
894,A Hard Day's Night,1964,U,87,"Comedy, Music, Musical",7.6,"Over two ""typical"" days in the life of The Bea...",96.0,Richard Lester,John Lennon,...,0,1,1,0,0,0,0,0,0,0
895,Breakfast at Tiffany's,1961,A,115,"Comedy, Drama, Romance",7.6,A young New York socialite becomes interested ...,76.0,Blake Edwards,Audrey Hepburn,...,0,0,0,0,1,0,0,0,0,0
896,Giant,1956,G,201,"Drama, Western",7.6,Sprawling epic covering the life of a Texas ca...,84.0,George Stevens,Elizabeth Taylor,...,0,0,0,0,0,0,0,0,0,1


In [4]:
# Train, evaluate and persist a random-forest regressor that predicts
# IMDB_Rating, then score the example film described at the top of the notebook.

# Tunable settings, gathered in one place.
SEED = 42
N_ESTIMATORS = 300
TEST_SIZE = 0.2
MODEL_PATH = '../src/model/modelo_imdb.pkl'

# Model inputs: four numeric columns plus the raw "Genre" string.
numerico = ["Meta_score", "Runtime", "No_of_Votes", "Gross"]
categorico = ["Genre"]
features = numerico + categorico

X = df[features].copy()
y = df["IMDB_Rating"]

# NOTE(review): OneHotEncoder sees the whole "Genre" string, so every genre
# combination (e.g. "Crime, Drama") becomes its own category, and the
# per-genre indicator columns built earlier in the notebook are not used
# here. Confirm this is intended before relying on the genre signal.
# handle_unknown="ignore" encodes genre strings unseen during fit as all
# zeros instead of raising at predict time.
preprocessamento = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorico),
        ("num", "passthrough", numerico),
    ]
)

model = Pipeline(
    steps=[
        ("preprocessamento", preprocessamento),
        (
            "regressor",
            RandomForestRegressor(n_estimators=N_ESTIMATORS, random_state=SEED),
        ),
    ]
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

model.fit(X_train, y_train)

# Hold-out error, in IMDB rating points.
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE: {rmse:.2f}")

# Persist the fitted pipeline. Create the target directory first: joblib.dump
# raises FileNotFoundError when the folder does not exist yet.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, MODEL_PATH)

# Single-row frame for the example film; column names must match `features`.
filme = pd.DataFrame([{
    "Meta_score": 80.0,
    "Runtime": 142,
    "No_of_Votes": 2343110,
    "Gross": 28341469,
    "Genre": "Drama",
}])

predicao_imdb = model.predict(filme)
print(f"Nota IMDB prevista: {predicao_imdb[0]:.2f}")




RMSE: 0.22
Nota IMDB prevista: 8.76


#### Resultado

RMSE = “quanto minhas previsões erram, em média, no conjunto de teste” → 0.22 ponto na escala do IMDB indica boa acurácia. <br>
Nota prevista = “o que meu modelo acha que esse filme merece no IMDB” → 8.76 (a nota real do filme no IMDB é 9.3) mostra que o modelo aprendeu padrões gerais, mas não captura perfeitamente a nota exata.