If torch is not recognized by pylance, then open a terminal, navigate to C:\Users\[username], and run the following command:

$\boxed{\text{pip install torch}}$

In [6]:
%pip install torch
%pip install pandas

^C
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


Build dataset

In [5]:
import pandas as pd

# load kaggle dataset
df = pd.read_csv('movies_metadata.csv')

  df = pd.read_csv('movies_metadata.csv')


In [6]:
# Perform data cleaning as needed, ex removing NaNs

df.columns
print(df.dtypes)

# Start with the float-typed columns for simplicity sake
df = df[['budget', 'revenue',  'vote_average', 'popularity', 'genres', 'production_companies']]

# ensure budget is a float
df = df[df['budget'].astype(str).str.isnumeric()]
df['budget'] = df['budget'].astype(float)

# ensure popularity is a float
df['popularity'] = df['popularity'].astype(float)

# replace 0 popularity with mean popularity - future: use K-means to estimate popularity
df['popularity']=df.mask(df['popularity']==0, df[df['popularity'] > 0]['popularity'].mean())['popularity']

# keep only datapoints where budget and revenue are recorded
df = df[(df['budget'] > 0) & (df['revenue'] > 0)]

from sklearn.preprocessing import MultiLabelBinarizer
import json
import ast

# get genre column of df
genres_data = df['genres']
# Extract genre names
genre_names = [[genre["name"] for genre in json.loads(str(genre_list).replace("'", "\"")) if "name" in genre_list] for genre_list in genres_data]
# one-hot encode genre names
mlb_genre = MultiLabelBinarizer()
one_hot_encoded_genres = mlb_genre.fit_transform(genre_names)
# replace genres in dataset with one-hot encoded
df= df.drop('genres',axis=1).join(pd.DataFrame(one_hot_encoded_genres)).fillna(0)

# print("Genre names for each position in one-hot vector:")
# print(mlb_genre.classes_)

# Get prod company column
prod_companies_data = df['production_companies']
# Extract prod co names
prod_co_names = [[ast.literal_eval(str(prod_co))["id"] for prod_co in ast.literal_eval(prod_co_list)] for prod_co_list in prod_companies_data]
# one-hot encode prod co names
mlb_prod_co= MultiLabelBinarizer()
one_hot_encoded_prod_co = mlb_prod_co.fit_transform(prod_co_names)
one_hot_encoded_prod_co_df = pd.DataFrame(one_hot_encoded_prod_co)
one_hot_encoded_prod_co_df.columns = range(len(df.columns), len(df.columns) + len(one_hot_encoded_prod_co_df.columns))
# replace prod co in dataset with one-hot encoded
df= df.drop('production_companies',axis=1).join(one_hot_encoded_prod_co_df).fillna(0)


adult                     object
belongs_to_collection     object
budget                    object
genres                    object
homepage                  object
id                        object
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity                object
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                     object
vote_average             float64
vote_count               float64
dtype: object


In [7]:
print(df)

           budget      revenue  vote_average  popularity    0    1    2    3  \
0      30000000.0  373554033.0           7.7   21.946943  0.0  0.0  1.0  1.0   
1      65000000.0  262797249.0           6.9   17.015539  0.0  1.0  0.0  0.0   
3      16000000.0   81452156.0           6.1    3.859495  1.0  0.0  0.0  0.0   
5      60000000.0  187436818.0           7.7   17.924927  1.0  1.0  0.0  0.0   
8      35000000.0   64350171.0           5.5    5.231580  1.0  1.0  0.0  0.0   
...           ...          ...           ...         ...  ...  ...  ...  ...   
45250  12000000.0   19000000.0           6.9    1.323587  0.0  0.0  0.0  0.0   
45399    750000.0          3.0           6.0    0.201582  0.0  0.0  0.0  0.0   
45409    800000.0    1328612.0           5.8    0.903061  0.0  0.0  0.0  0.0   
45412   2000000.0    1268793.0           4.0    0.121844  0.0  0.0  0.0  0.0   
45422   5000000.0    1413000.0           1.0    0.039793  0.0  0.0  0.0  0.0   

         4    5  ...  5492  5493  5494 

In [8]:
# Split into train, val, test
def split_dataset(df, props=[.8, .1, .1]):
    assert round(sum(props), 2) == 1 and len(props) >= 2

    train_df = df.iloc[0:int(props[0]*df.shape[0])]
    val_df= df.iloc[int(props[0]*df.shape[0]):int((props[0] + props[1])*df.shape[0])]
    test_df = df.iloc[int((props[0] + props[1])*df.shape[0]):df.shape[0]]

    return train_df, val_df, test_df

In [9]:

# Set up dataset iterators, perform feature engineering

import numpy as np
import torch
from torch.utils.data import Dataset
from scipy import signal
from tqdm import tqdm
from torch.utils.data import RandomSampler



class MovieDataset(Dataset):
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index: int):
        features = self.df.iloc[index]
        processed_features = features.drop('revenue')


        # Return the processed features and the room size
        revenue = self.df.iloc[index]['revenue']
        return torch.tensor(np.array(processed_features, np.float32)), revenue


train_df, val_df, test_df = split_dataset(df)

train_dataset = MovieDataset(train_df)
val_dataset = MovieDataset(val_df)
test_dataset = MovieDataset(test_df)


train_sampler = RandomSampler(train_dataset)
val_sampler   = RandomSampler(val_dataset)
test_sampler  = RandomSampler(test_dataset)

from torch.utils.data import DataLoader
BATCH_SIZE = 1

train_iterator = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler) #collate_fn = ...
val_iterator   = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler)
test_iterator  = DataLoader(test_dataset, batch_size=BATCH_SIZE, sampler=test_sampler)

In [10]:
import torch
import random, sys

RANDOM_SEED = 30
torch.manual_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(sys.version)

3.12.7 | packaged by Anaconda, Inc. | (main, Oct  4 2024, 13:17:27) [MSC v.1929 64 bit (AMD64)]


In [11]:
import torch.nn as nn

class MovieNN(nn.Module):
  def __init__(self):
    super().__init__()
    self.lin1 = nn.Linear(5500, 5500)
    self.lin2 = nn.Linear(5500, 5500)
    self.lin3 = nn.Linear(5500,1)

  def forward(self, x):
    x=x.to(device) # put it on correct processor
    lin1 = self.lin1(x)
    lin2 = self.lin2(lin1)
    lin3 = self.lin3(lin2)
    return torch.flatten(lin3)



In [12]:
model = MovieNN().to(device)

In [13]:
from torch.optim import AdamW
criterion, optimizer = None, None
criterion = nn.MSELoss()
lrate = .00001
optimizer = AdamW(model.parameters(), lr=lrate)

In [14]:
def train_loop(model, criterion, optim, iterator):
  """
  Returns the total loss calculated from criterion
  """
  model.train()
  total_loss = 0
  for x, y in tqdm(iterator):
    y = torch.tensor(np.array(y, np.float32)).to(device)
    optim.zero_grad()

    yhat = model(x)
    loss = criterion(yhat, y)
    total_loss += loss.item()
    loss.backward()
    optim.step()

  return total_loss


def val_loop(model, iterator):
  """
  Returns:
    true (List[bool]): All the ground truth values taken from the dataset iterator
    pred (List[bool]): All model predictions.
  """
  true, pred = [], []
  for example in iterator:
    # print(example[1])
    # print(example[1], model(example[0]))
    true.extend(example[1])
    #print(example[0])
    pred.extend(model(example[0]))
  pred = torch.tensor(pred)
  pred.to(device)
  return true, pred

In [15]:
def accuracy(true, pred):
  count = len(true)
  good = len([i for i in range(len(true)) if abs(true[i] - pred[i]) < .15*true[i]])
  return good/count

In [16]:
TOTAL_EPOCHS = 20

for epoch in range(TOTAL_EPOCHS):
    train_loss = train_loop(model, criterion, optimizer, train_iterator)
    true, pred = val_loop(model, val_iterator)
    print(f"EPOCH: {epoch}")
    print(f"TRAIN LOSS: {train_loss}")
    print(f"VAL ACC: {accuracy(true, pred)}")

100%|██████████| 4304/4304 [33:21<00:00,  2.15it/s]


EPOCH: 0
TRAIN LOSS: 5.716608883761341e+19
VAL ACC: 0.1171003717472119


  5%|▍         | 203/4304 [01:08<22:54,  2.98it/s]


KeyboardInterrupt: 

In [None]:
true, pred = val_loop(model, val_iterator)
true2, pred2 = val_loop(model, test_iterator)
print(model.parameters)
print(list(zip(true,pred)))
print(f"EPOCH: {epoch}")
print(f"TRAIN LOSS: {train_loss}")
print(f"VAL ACC: {accuracy(true, pred)}")
print(f"TEST ACC: {accuracy(true2, pred2)}")

<bound method Module.parameters of MovieNN(
  (lin1): Linear(in_features=23, out_features=23, bias=True)
  (lin2): Linear(in_features=23, out_features=1, bias=True)
)>
[(tensor(1200000., dtype=torch.float64), tensor(12072523.)), (tensor(12000000., dtype=torch.float64), tensor(30476406.)), (tensor(30283., dtype=torch.float64), tensor(27705838.)), (tensor(7847000., dtype=torch.float64), tensor(30476444.)), (tensor(48390., dtype=torch.float64), tensor(5541205.5000)), (tensor(29789000., dtype=torch.float64), tensor(47099896.)), (tensor(1.8399e+08, dtype=torch.float64), tensor(4.8762e+08)), (tensor(3621046., dtype=torch.float64), tensor(38788184.)), (tensor(71561644., dtype=torch.float64), tensor(1.3853e+08)), (tensor(27330000., dtype=torch.float64), tensor(96970336.)), (tensor(75143., dtype=torch.float64), tensor(13852956.)), (tensor(16178959., dtype=torch.float64), tensor(2.4935e+08)), (tensor(143101., dtype=torch.float64), tensor(11082365.)), (tensor(3000000., dtype=torch.float64), tenso