In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import StandardScaler
import pandas as pd
import random
import gensim
from gensim.models import Word2Vec

Ingesting and Formatting Data

In [2]:
def read_data_picture_noms(year):
    data=pd.read_csv("BestPictureNominations.csv")
    data=data.drop(labels=['Win', 'OscarNoms', 'DirectingNom', 'ActingNom', 'ScreenplayNom', 'EditingNom'], axis=1)
    
    if year != 2024:
        data=data[data['Year']!=2024]

    train=data[data['Year']!=year]
    y_train=train['Best Picture Nominee']
    X_train=train.drop(labels=['Best Picture Nominee', 'Title', "Year"], axis=1)

    test=data[data['Year']==year]
    y_test=test['Best Picture Nominee']
    X_test=test.drop(labels=['Best Picture Nominee', "Title", "Year"], axis=1)

    scaler = StandardScaler()
    X_train=scaler.fit_transform(X_train)
    X_test=scaler.fit_transform(X_test)

    X_train = torch.tensor(X_train, dtype=torch.float32)
    y_train = torch.tensor(np.array(y_train), dtype=torch.float32).reshape(-1, 1)

    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_test = torch.tensor(np.array(y_test), dtype=torch.float32).reshape(-1, 1)

    return X_train, y_train, X_test, y_test

def read_data_picture_win(year):
    data=pd.read_csv("BestPictureNominations.csv")
    data=data[data['Best Picture Nominee']==1]

    if year != 2024:
        data=data[data['Year']!=2024]


    train=data[data['Year']!=year]


    y_train=train['Win']
    X_train=train.drop(labels=['Title', "Year", 'Win'], axis=1)

    test=data[data['Year']==year]
    y_test=test['Win']
    #X_test_titles=test['Title']
    X_test=test.drop(labels=["Title", "Year", "Win"], axis=1)

    scaler = StandardScaler()
    X_train=scaler.fit_transform(X_train)
    X_test=scaler.fit_transform(X_test)

    X_train = torch.tensor(X_train, dtype=torch.float32)
    y_train = torch.tensor(np.array(y_train), dtype=torch.float32).reshape(-1, 1)

    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_test = torch.tensor(np.array(y_test), dtype=torch.float32).reshape(-1, 1)

    return X_train, y_train, X_test, y_test

In [3]:
def read_data_general_noms(year, join_with):
    data=pd.read_csv(join_with)
    #data=data.merge(pd.read_csv(join_with), how='right', on=['Title', 'Year']).fillna(0)
    data=data.drop(labels='Win_Category', axis=1)
    
    if year != 2024:
        data=data[data['Year']!=2024]

    train=data[data['Year']!=year]
    y_train=train['Nom_Category']
    X_train=train.drop(labels=['Nom_Category', 'Name','Title', "Year"], axis=1)

    test=data[data['Year']==year]
    y_test=test['Nom_Category']
    X_test=test.drop(labels=['Nom_Category', 'Name', "Title", "Year"], axis=1)

    scaler = StandardScaler()
    X_train=scaler.fit_transform(X_train)
    X_test=scaler.fit_transform(X_test)

    X_train = torch.tensor(X_train, dtype=torch.float32)
    y_train = torch.tensor(np.array(y_train), dtype=torch.float32).reshape(-1, 1)

    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_test = torch.tensor(np.array(y_test), dtype=torch.float32).reshape(-1, 1)

    return X_train, y_train, X_test, y_test

def read_data_general_win(year, join_with):
    data=pd.read_csv(join_with)
    #data=data.merge(pd.read_csv(join_with), how='right', on=['Title', 'Year']).fillna(0)
    #data=data[data['Best Picture Nominee']==1]

    if year != 2024:
        data=data[data['Year']!=2024]

    train=data[data['Year']!=year]
    y_train=train['Win_Category']
    X_train=train.drop(labels=['Title', 'Name', "Year", 'Win_Category'], axis=1)

    test=data[data['Year']==year]
    y_test=test['Win_Category']
    #X_test_titles=test['Title']
    X_test=test.drop(labels=["Title", 'Name', "Year", "Win_Category"], axis=1)

    scaler = StandardScaler()
    X_train=scaler.fit_transform(X_train)
    X_test=scaler.fit_transform(X_test)

    X_train = torch.tensor(X_train, dtype=torch.float32)
    y_train = torch.tensor(np.array(y_train), dtype=torch.float32).reshape(-1, 1)

    X_test = torch.tensor(X_test, dtype=torch.float32)
    y_test = torch.tensor(np.array(y_test), dtype=torch.float32).reshape(-1, 1)

    return X_train, y_train, X_test, y_test

Getting a List of Titles

In [4]:
def read__picture_titles(year):
    data=pd.read_csv("BestPictureNominations.csv")
    data = data[data['Year']==year]
    data=data[data['Best Picture Nominee']==1]
    titles = data['Title']
    return titles

def read_general_titles(year, file):
    data=pd.read_csv(file)
    data= data[data['Year']==year]
    titles = data['Name']
    return titles

Building and Evaluating Model

In [5]:
def build_model(X_train, y_train, n_epochs=100, batch_size=10, lr=0.001):
    model = nn.Sequential(
    nn.Linear(X_train.size()[1], X_train.size()[1]*2),
    nn.ReLU(),
    nn.Linear(X_train.size()[1]*2, X_train.size()[1]*2),
    nn.ReLU(),
    
    nn.Linear(X_train.size()[1]*2, X_train.size()[1]),
    nn.ReLU(),
    nn.Linear(X_train.size()[1], 1),
    nn.Sigmoid()
    )
    loss_fn = nn.BCELoss()  # binary cross entropy
    #def loss_fn(y_pred, y_train):
    #    l1 =  nn.BCELoss()
    #    l2 = 0.005 * abs(10 - sum(y_pred))
    #    return l1(y_pred, y_train) + l2
    optimizer = optim.Adam(model.parameters(), lr)
 
    for epoch in range(n_epochs):
        #for i in range(0, len(X_train), batch_size):
        #Xbatch = X_train[i:i+batch_size]
        y_pred = model(X_train)
        #ybatch = y_train[i:i+batch_size]
        loss = loss_fn(y_pred, y_train)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    #print(f'Finished epoch {epoch}, latest loss {loss}')
    return model

In [6]:
def evaluate_model(model, X_test, y_test):
    with torch.no_grad():
        y_pred = model(X_test)
    accuracy = (y_pred.round() == y_test).float().mean()
    return y_pred, accuracy

In [7]:
def print_output_joint(y_pred_nom, y_pred_win, titles):
    temp=pd.DataFrame()
    y_pred_nom=[100*float(i) for i in y_pred_nom]
    y_pred_win = [100*float(i/sum(y_pred_win)) for i in y_pred_win]
    temp['Title']=titles
    temp['Probability Nom']=y_pred_nom
    temp['Probability Win']=y_pred_win
    return temp.sort_values(by=['Probability Nom'], axis=0, ascending=False)
    

def print_output_win(y_pred, titles):
    temp=pd.DataFrame()
    y_pred = [100*float(i/sum(y_pred)) for i in y_pred]
    temp['Title']=titles
    temp['Probability Win']=y_pred
    return temp.sort_values(by=['Probability Win'], axis=0, ascending=False)
    

In [8]:
def generate_picture(year,n_epochs=500, batch_size=10, lr=0.001):
    pd.options.display.float_format = '{:.4f}'.format

    #X_train_nom, y_train_nom, X_test_nom, y_test_nom = read_data_picture_noms(year)
    X_train_win, y_train_win, X_test_win, y_test_win = read_data_picture_win(year)
    titles = read__picture_titles(year)

    #model_noms = build_model(X_train_nom, y_train_nom, n_epochs, batch_size, lr)
    model_wins = build_model(X_train_win, y_train_win, n_epochs, batch_size, lr)

    #y_pred_nom, accuracy_nom = evaluate_model(model_noms, X_test_nom, y_test_nom)
    y_pred_win, accuracy_win = evaluate_model(model_wins, X_test_win, y_test_win)

    return print_output_win(y_pred_win, titles)


In [9]:
year=2024
file_name = 'Actor.csv'
pd.options.display.float_format = '{:.4f}'.format

X_train_nom, y_train_nom, X_test_nom, y_test_nom = read_data_general_noms(year, file_name)
X_train_win, y_train_win, X_test_win, y_test_win = read_data_general_win(year, file_name)
titles = read_general_titles(year, file_name)

model_noms = build_model(X_train_nom, y_train_nom, n_epochs=100)
model_wins = build_model(X_train_win, y_train_win, n_epochs=100)

y_pred_nom, accuracy_nom = evaluate_model(model_noms, X_test_nom, y_test_nom)
y_pred_win, accuracy_win = evaluate_model(model_wins, X_test_win, y_test_win)

print_output_joint(y_pred_nom, y_pred_win, titles)

Unnamed: 0,Title,Probability Nom,Probability Win
3,Timothee Chalamet,98.5408,6.6674
1,Adrien Brody,96.0321,1.3064
0,Ralph Fiennes,94.793,12.9674
2,Colman Domingo,94.3317,2.5953
5,Daniel Craig,91.6716,2.9861
13,Hugh Grant,65.5662,10.1144
6,Sebastian Stan (Trump),59.5089,12.7921
14,Gabriel LaBelle,59.5089,12.7921
8,Sebastian Stan (Different Man),16.2125,10.5423
9,Jesse Eisenberg,6.3408,5.1975


In [10]:
fin_avg = []

for i in range(20):
    torch.manual_seed(i)
    a=generate_picture(2024)
    fin_avg.append(a)
final = pd.concat(fin_avg)
final = final.groupby('Title').mean().reset_index().sort_values(by=['Probability Win'], axis=0, ascending=False)
final


Unnamed: 0,Title,Probability Win
1,Anora,99.5705
9,Wicked,0.2981
2,Conclave,0.0911
4,Emilia Perez,0.0146
7,The Brutalist,0.0137
8,The Substance,0.0096
0,A Complete Unknown,0.0012
6,Nickel Boys,0.0008
3,Dune: Part Two,0.0002
5,I'm Still Here,0.0002


In [11]:
from bs4 import BeautifulSoup
import requests

response = requests.get('https://letterboxd.com/film/the-matrix/')

html_soup = BeautifulSoup(response.text)

html_soup.find_all("p")[8]

<p class="credits">
<span class="introduction">Directed by </span><span class="directorlist"><a class="contributor" href="/director/lana-wachowski/"><span class="prettify">Lana Wachowski</span></a>, <a class="contributor" href="/director/lilly-wachowski/"><span class="prettify">Lilly Wachowski</span></a>
</span>
</p>