In [15]:
import pandas as pd
import numpy as np
import json


# Importing files

In [16]:
folder = 'tmdb-box-office-prediction'

trainingData = pd.read_csv(f'{folder}/train.csv')
testData = pd.read_csv(f'{folder}/test.csv')


## Dividing into features and label

In [17]:
dropFields = ['homepage', 'original_title', 'imdb_id',
              'status', 'title', 'tagline', 'poster_path', 'overview', 'belongs_to_collection']
dropFields2 = ['Keywords', 'cast', 'crew', 'spoken_languages', 'production_countries', 'production_companies', 'genres']
trainingData = trainingData.drop(columns=dropFields + dropFields2)
#print(trainingData.size)

In [18]:
trainingData.keys()

Index(['id', 'budget', 'original_language', 'popularity', 'release_date',
       'runtime', 'revenue'],
      dtype='object')

In [19]:
#mean_value =
#trainingData[''].fillna(value=mean_value, inplace=True)

### Converting string to int

In [44]:
trainingData.head(10)
trainingData['original_language'].value_counts()

langauges = ['en', 'ja', 'es', 'no', 'fr', 'ko', 'zh', 'sv', 'cn', 'ru', 'id', 'hi', 'it', 'de', 'ta', 'nl', 'pt', 'fa', 'da', 'ro', 'te', 'hu', 'ml', 'pl', 'fi', 'ur', 'he', 'el', 'ar', 'nb', 'vi', 'mr', 'cs', 'sr', 'bn', 'tr']
languageId = [x for x in range(len(langauges))]

trainingData = trainingData.replace(to_replace=langauges, value=languageId)




# Datacleaning

In [21]:
'''def convertProductComp(field):
    field = field.replace("'", '"')
    data = json.loads(field)
    indexes = [item['id'] for item in data]
    print(indexes)
    return indexes'''

'def convertProductComp(field):\n    field = field.replace("\'", \'"\')\n    data = json.loads(field)\n    indexes = [item[\'id\'] for item in data]\n    print(indexes)\n    return indexes'

In [23]:
# Convert date to milliseconds
from datetime import datetime

for key in trainingData['release_date'].keys():
    try:
        date_string = str(trainingData['release_date'][key])

        date_obj = datetime.strptime(date_string, '%m/%d/%y')
        epoch = datetime(1970, 1, 1)
        time_difference = date_obj - epoch

        milliseconds = int(time_difference.total_seconds() * 1000)
        trainingData = trainingData.replace(trainingData['release_date'][key], milliseconds)
    except:
        trainingData = trainingData.drop([key])



In [25]:
# Replace all NaN budget values with mean value

meanVal = trainingData['budget'].mean()

trainingData['budget'].fillna(meanVal, inplace=True)

# Replace all NaN runtime values with mean value
meanVal = trainingData['runtime'].mean()

trainingData['runtime'].fillna(meanVal, inplace=True)


In [26]:
nan_mask = trainingData.isna()

# Filter the DataFrame to show rows with NaN values
rows_with_nan = trainingData[nan_mask.any(axis=1)]

rows_with_nan

Unnamed: 0,id,budget,original_language,popularity,release_date,runtime,revenue


# ML

In [30]:
try:
    label = trainingData['revenue']
    id = trainingData['id']
except:
    pass

features = trainingData.drop(columns='revenue')

X, y = features, label

In [31]:
# Predicting with the inserted model
def predictWithModel(model):
    model.fit(X_train, y_train)
    return model.predict(X_test)


In [32]:
# Determine the scores based on the model prediction:
def determineScore(y_pred):
    measures = {'mse': mean_squared_error, 'mae' :mean_absolute_error, 'r2':r2_score}
    predsDict = {}
    for key in measures.keys():
        predsDict[key] = measures[key](y_test, y_pred)
    #predsDict['mse'] = mean_squared_error(y_test, y_pred)
    #predsDict['mae'] = mean_absolute_error(y_test, y_pred)
    #predsDict['r2'] = r2_score(y_test, y_pred)

    return predsDict


In [45]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

models = {"Linear reg":LinearRegression(),
          "Random forest":RandomForestRegressor(n_estimators=100),
          "Decition tree":DecisionTreeRegressor(),
          "Bayesian ridge": BayesianRidge()}

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


predModelsDict = {}

for key in models.keys():
    predModelsDict[key] = predictWithModel(models[key])







In [77]:
# Printing results from the models

mesureDict = {'mse': [], 'mae': [], 'r2': []}

for key in predModelsDict.keys():
    data = determineScore(predModelsDict[key])
    print(f'\n{key}:')
    for key1 in data.keys():
        print(f"{key1.capitalize()}: {data[key1]}")

        mesureDict[key1].append(data[key1])
        mesureDict[key1].sort()
        if key1.lower() == 'r2':
            mesureDict[key1].sort(reverse=True)









Linear reg:
Mse: 7465725108484834.0
Mae: 44308334.34034504
R2: 0.6804396446987501

Random forest:
Mse: 6490984618128975.0
Mae: 41039956.4451875
R2: 0.7221621047275322

Decition tree:
Mse: 1.0919180211806934e+16
Mae: 55119802.93541667
R2: 0.5326191284329834

Bayesian ridge:
Mse: 8021187133691922.0
Mae: 45325544.97828892
R2: 0.6566638373187281


In [78]:
print()
for key in mesureDict.keys():
    print(f'{key}: \t{mesureDict[key]}')


mse: 	[6490984618128975.0, 7465725108484834.0, 8021187133691922.0, 1.0919180211806934e+16]
mae: 	[41039956.4451875, 44308334.34034504, 45325544.97828892, 55119802.93541667]
r2: 	[0.7221621047275322, 0.6804396446987501, 0.6566638373187281, 0.5326191284329834]


# Deploy