In [1]:
# Import dependencies
import pandas as pd
import numpy as np

## **Loading in CSV**

In [2]:
# Pull the raw Oscars dataset straight from the project's GitHub repository.
# No column is promoted to the index, so the CSV's own unnamed index column
# arrives as an ordinary column.
url = 'https://raw.githubusercontent.com/eenabow/oscar_nominations/main/data/oscars2_df.csv'
test_df = pd.read_csv(url)

In [3]:
# Preview the first rows to confirm the CSV loaded with the expected columns
test_df.head()

Unnamed: 0,movie,duration,metascore,gross,critic_reviews,awards_wins,popularity,awards_nominations,golden_globes_nominated,oscar_nominated
0,Kate & Leopold,118,44.0,47100000.0,125.0,1,2363.0,4,2,1
1,Chicken Run,84,88.0,106790000.0,186.0,5,2859.0,11,1,0
2,Fantastic Four,106,40.0,154700000.0,278.0,0,1876.0,0,0,0
3,Frida,123,61.0,25780000.0,126.0,2,2508.0,12,2,1
4,The Lord of the Rings: The Fellowship of the Ring,178,92.0,313840000.0,296.0,26,204.0,67,4,1


In [4]:
# Keep only the numeric predictors plus the target. The movie title and the
# unnamed index column from the CSV are left out because they are not model inputs.
model_columns = [
    "duration",
    "metascore",
    "gross",
    "critic_reviews",
    "awards_wins",
    "popularity",
    "awards_nominations",
    "golden_globes_nominated",
    "oscar_nominated",
]
test_df = test_df.loc[:, model_columns]

**XGBoost Machine Learning using GridSearch** 

In [5]:
import xgboost as xgb
from sklearn.model_selection import train_test_split


XGBoostError: XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed (vcomp140.dll or libgomp-1.dll for Windows, libomp.dylib for Mac OSX, libgomp.so for Linux and other UNIX-like OSes). Mac OSX users: Run `brew install libomp` to install OpenMP runtime.
  * You are running 32-bit Python on a 64-bit OS
Error message(s): ['dlopen(/Users/lhabersham/opt/anaconda3/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib, 6): Library not loaded: /usr/local/opt/libomp/lib/libomp.dylib\n  Referenced from: /Users/lhabersham/opt/anaconda3/lib/python3.8/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: image not found']


In [None]:
# Target: the column we want to predict
y = test_df["oscar_nominated"]

# Features: every remaining column once the target is dropped
X_data = test_df.drop(columns=["oscar_nominated"])

In [None]:
# Display the feature matrix to verify the target column is no longer present
X_data

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable
seed, test_size = 7, 0.3
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y,
    test_size=test_size,
    random_state=seed,
)


**MODELS 1 & 2**

In [None]:
# Model 1: library defaults, using every available core.
model1 = xgb.XGBClassifier(n_jobs=-1)

# Model 2: explicit tree count, depth, learning rate and row subsampling.
model2_params = {
    "n_estimators": 100,
    "max_depth": 8,
    "learning_rate": 0.1,
    "subsample": 0.5,
}
model2 = xgb.XGBClassifier(**model2_params)

# Keep the return value of fit() under the train_model* names used by later cells.
train_model1 = model1.fit(X_train, y_train)
train_model2 = model2.fit(X_train, y_train)


In [None]:
# Predict on the held-out set and print a per-class precision/recall/F1 report
from sklearn.metrics import classification_report

pred1 = train_model1.predict(X_test)
pred2 = train_model2.predict(X_test)

# classification_report returns a multi-line string. Format it with %s (not %r)
# so the newlines render as a table instead of being printed as escaped "\n"
# inside a quoted repr.
print('Model 1 XGboost Report\n%s' % classification_report(y_test, pred1))
print('Model 2 XGboost Report\n%s' % classification_report(y_test, pred2))

In [None]:
# Overall accuracy on the held-out set, expressed as a percentage
from sklearn.metrics import accuracy_score

acc1 = accuracy_score(y_test, pred1) * 100
acc2 = accuracy_score(y_test, pred2) * 100

print("Accuracy for model 1: %.2f" % acc1)
print("Accuracy for model 2: %.2f" % acc2)

**Model 3**

In [None]:
# Model 3: manually chosen hyperparameters (this cell only builds the
# classifier; no grid search is run here).
model3_params = dict(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27,
)
model3 = xgb.XGBClassifier(**model3_params)

In [None]:
# Fit model 3 on the training split and report held-out accuracy as a percentage
train_model3 = model3.fit(X_train, y_train)
pred3 = train_model3.predict(X_test)

acc3 = accuracy_score(y_test, pred3) * 100
print("Accuracy for model 3: %.2f" % acc3)

 **Model 4**

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate values searched exhaustively (3 x 3 = 9 combinations)
param_test = {
    'max_depth': [2, 10, 25],
    'min_child_weight': [2, 10, 25],
}

# 5-fold cross-validated search scored on accuracy.
# The former `iid=False` argument is intentionally omitted: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, where passing it raises a TypeError.
gsearch = GridSearchCV(
    estimator=xgb.XGBClassifier(),
    param_grid=param_test,
    scoring='accuracy',
    n_jobs=4,
    cv=5,
    verbose=10,
)

# fit() returns the search object itself; its predict() is called on the held-out set
train_model4 = gsearch.fit(X_train, y_train)
pred4 = train_model4.predict(X_test)
print("Accuracy for model 4: %.2f" % (accuracy_score(y_test, pred4) * 100))

**Best accuracy was 88.85% from Model 2**

In [None]:
# Rebuild a classifier with Model 2's hyperparameters and fit it on the
# full dataset (features and labels before the train/test split).
best_params = {
    "n_estimators": 100,
    "max_depth": 8,
    "learning_rate": 0.1,
    "subsample": 0.5,
}
xgb_best = xgb.XGBClassifier(**best_params)

xgb_best.fit(X_data, y)

In [None]:
# Persist the fitted model to disk as a joblib file
import joblib

model_path = "xgb_oscars.joblib"
joblib.dump(xgb_best, model_path)

**Visualize best parameters of the model and weights of features**


In [None]:
# Feature names: every column of test_df except the last one
columns_list = test_df.columns[:-1].tolist()
columns_list

In [None]:
# Pair each of Model 2's feature importances with its column name,
# sorted from largest to smallest
importances = train_model2.feature_importances_
feature_imp = pd.Series(importances, index=columns_list).sort_values(ascending=False)
feature_imp

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Creating a bar plot
sns.barplot(x=feature_imp, y=feature_imp.index)

# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.legend()
plt.show()
plt.savefig('xgbmodel2_importantfeatures.png')