In [1]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
import numpy as np
import os

In [21]:
# Import the data 
movies = pd.read_csv("../Resources/imdb_final.csv")
movies.head()

Unnamed: 0,imdb_title_id,title,year,genre,duration,country,director,production_company,budget,total_votes,...,allover45,males,males18to29,males30to44,malesover45,females,females18to29,females30to44,femalesover45,rating_class
0,tt0035423,Kate & Leopold,2001,Comedy,118,USA,James Mangold,Konrad Pictures,48000000,75298,...,6.4,6.3,6.5,6.2,6.3,6.6,6.7,6.4,6.7,Good
1,tt0113026,The Fantasticks,2000,Musical,86,USA,Michael Ritchie,Michael Ritchie Productions,10000000,1082,...,5.4,5.3,5.4,5.4,5.3,5.7,5.6,5.8,5.6,Good
2,tt0118589,Glitter,2001,Drama,104,USA,Vondie Curtis-Hall,Twentieth Century Fox,22000000,20959,...,2.1,1.9,2.1,1.9,2.0,2.9,3.2,2.7,2.5,Bad
3,tt0118652,The Attic Expeditions,2001,Comedy,100,USA,Jeremy Kasten,Tse Tse Fly Productions,1000000,1588,...,4.6,5.0,4.7,5.1,4.6,4.8,5.8,4.6,4.6,Good
4,tt0120467,Vulgar,2000,Crime,87,USA,Bryan Johnson,Chango Productions,120000,3852,...,5.0,5.3,6.1,5.3,5.1,5.2,5.2,5.3,4.6,Good


In [22]:
target = movies["rating_class"]
target_names = ["Bad", "Excellent", "Good"]

In [23]:
# Step 0: Reformat data
data = movies.values
X = data[:, 0:22]
y = data[:, 22]

In [24]:
# Step 1: Label-encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y)

In [26]:
for label, rating_class in zip(encoded_y, y):
    print('Rating Class: ' + str(rating_class))
    print('Encoded Label: ' + str(label))
    print('-' * 12)

Rating Class: Good
Encoded Label: 2
------------
Rating Class: Good
Encoded Label: 2
------------
Rating Class: Bad
Encoded Label: 0
------------
Rating Class: Good
Encoded Label: 2
------------
Rating Class: Good
Encoded Label: 2
------------
Rating Class: Good
Encoded Label: 2
------------
Rating Class: Good
Encoded Label: 2
------------
Rating Class: Good
Encoded Label: 2
------------
Rating Class: Good
Encoded Label: 2
------------
Rating Class: Excellent
Encoded Label: 1
------------
Rating Class: Good
Encoded Label: 2
------------
Rating Class: Excellent
Encoded Label: 1
------------
Rating Class: Good
Encoded Label: 2
------------
Rating Class: Good
Encoded Label: 2
------------
Rating Class: Good
Encoded Label: 2
------------
Rating Class: Excellent
Encoded Label: 1
------------
Rating Class: Good
Encoded Label: 2
------------
Rating Class: Good
Encoded Label: 2
------------
Rating Class: Good
Encoded Label: 2
------------
Rating Class: Good
Encoded Label: 2
------------
Rating

In [27]:
from keras.utils import to_categorical

# Step 2: One-hot encoding
one_hot_y = to_categorical(encoded_y)
one_hot_y

Using TensorFlow backend.


array([[0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.]], dtype=float32)

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(one_hot_y, target, random_state=42)

In [39]:
# Support vector machine linear classifier
from sklearn.svm import SVC 
model = SVC(kernel='linear')

In [49]:
# Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=3)
model.get_params().keys()

dict_keys(['C', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'])

In [41]:
model.get_params().keys()

dict_keys(['C', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'])

In [42]:
# Fit the model using the grid search estimator. 
# This will take the SVC model and try each combination of parameters
grid.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=1.000, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=1.000, total=   0.0s
[CV] C=1, gamma=0.0001 ...............................................
[CV] ................... C=1, gamma=0.0001, score=1.000, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=1.000, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=1.000, total=   0.0s
[CV] C=1, gamma=0.001 ................................................
[CV] .................... C=1, gamma=0.001, score=1.000, total=   0.0s
[CV] C=1, gamma=0.01 .................................................
[CV] ............

[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:    0.3s finished


GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto_deprecated', kernel='linear',
                           max_iter=-1, probability=False, random_state=None,
                           shrinking=True, tol=0.001, verbose=False),
             iid='warn', n_jobs=None,
             param_grid={'C': [1, 5, 10], 'gamma': [0.0001, 0.001, 0.01]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [43]:
 # List the best parameters for this dataset
print(grid.best_params_)

{'C': 1, 'gamma': 0.0001}


In [46]:
 # List the best score
print(grid.best_score_)

1.0


In [47]:
 # Make predictions with the hypertuned model
predictions = grid.predict(X_test)

In [48]:
 # Calculate classification report
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions,
                            target_names=["Bad", "Excellent", "Good"]))

              precision    recall  f1-score   support

         Bad       1.00      1.00      1.00       280
   Excellent       1.00      1.00      1.00       161
        Good       1.00      1.00      1.00       824

    accuracy                           1.00      1265
   macro avg       1.00      1.00      1.00      1265
weighted avg       1.00      1.00      1.00      1265



In [15]:
data = movies.drop("rating_class", axis=1)
feature_names = data.columns
data.head()

Unnamed: 0,imdb_title_id,title,year,genre,duration,country,director,production_company,budget,total_votes,...,all30to44,allover45,males,males18to29,males30to44,malesover45,females,females18to29,females30to44,femalesover45
0,tt0035423,Kate & Leopold,2001,Comedy,118,USA,James Mangold,Konrad Pictures,48000000,75298,...,6.3,6.4,6.3,6.5,6.2,6.3,6.6,6.7,6.4,6.7
1,tt0113026,The Fantasticks,2000,Musical,86,USA,Michael Ritchie,Michael Ritchie Productions,10000000,1082,...,5.5,5.4,5.3,5.4,5.4,5.3,5.7,5.6,5.8,5.6
2,tt0118589,Glitter,2001,Drama,104,USA,Vondie Curtis-Hall,Twentieth Century Fox,22000000,20959,...,2.1,2.1,1.9,2.1,1.9,2.0,2.9,3.2,2.7,2.5
3,tt0118652,The Attic Expeditions,2001,Comedy,100,USA,Jeremy Kasten,Tse Tse Fly Productions,1000000,1588,...,5.1,4.6,5.0,4.7,5.1,4.6,4.8,5.8,4.6,4.6
4,tt0120467,Vulgar,2000,Crime,87,USA,Bryan Johnson,Chango Productions,120000,3852,...,5.3,5.0,5.3,6.1,5.3,5.1,5.2,5.2,5.3,4.6


In [4]:
label_encoder = LabelEncoder()
label_encoder.fit(target)

LabelEncoder()

In [5]:
setofdata = movies[["year", "duration", "genre", "director", "budget", "rating_class"]]
moviesdf = pd.DataFrame(setofdata)
print (moviesdf)

      year  duration    genre            director    budget rating_class
0     2001       118   Comedy       James Mangold  48000000         Good
1     2000        86  Musical     Michael Ritchie  10000000         Good
2     2001       104    Drama  Vondie Curtis-Hall  22000000          Bad
3     2001       100   Comedy       Jeremy Kasten   1000000         Good
4     2000        87    Crime       Bryan Johnson    120000         Good
...    ...       ...      ...                 ...       ...          ...
5055  2019        84   Comedy           Jon Lucas   5000000    Excellent
5056  2019        94    Drama         Dan Sallitt     95000         Good
5057  2019        84   Action        Glenn Miller    100000          Bad
5058  2019        92   Action        Keoni Waxman   3000000         Good
5059  2019        77   Horror    Robin Entreinger    500000          Bad

[5060 rows x 6 columns]


In [7]:
setofdata = movies[["year", "duration", "genre", "director", "budget", "rating_class"]]
feature_names = setofdata.columns
setofdata.head()

Unnamed: 0,year,duration,genre,director,budget,rating_class
0,2001,118,Comedy,James Mangold,48000000,Good
1,2000,86,Musical,Michael Ritchie,10000000,Good
2,2001,104,Drama,Vondie Curtis-Hall,22000000,Bad
3,2001,100,Comedy,Jeremy Kasten,1000000,Good
4,2000,87,Crime,Bryan Johnson,120000,Good


In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(setofdata, target, random_state=42)

In [10]:
# Support vector machine linear classifier
from sklearn.svm import SVC 
model = SVC(kernel='linear')

In [11]:
 # Create the GridSearch estimator along with a parameter object containing the values to adjust
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10],
              'gamma': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(model, param_grid, verbose=3)

In [12]:
 # Fit the model using the grid search estimator. 
# This will take the SVC model and try each combination of parameters
grid.fit(X_train, y_train)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV] C=1, gamma=0.0001 ...............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


ValueError: could not convert string to float: 'Good'