In [1]:
# Connect to Google Drive
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [44]:
# do all of the imports
import pandas as pd
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers
import joblib

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import string
plt.style.use('ggplot')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [27]:
# Utility Function for history plotting
# adapted from a RealPython tutorial: 
#    https://realpython.com/python-keras-text-classification/
def plot_history(history, file_name):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    plt.savefig(file_name, bbox_inches='tight')

In [28]:
# Utility Function for data cleaning pipeline
def text_pipeline(row):
    genre = row["genre"]
    descr = row["description"]
    text = genre + " " + descr
    # split into words
    tokens = word_tokenize(text)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    # join the words and return them to be loaded into the dataframe
    return " ".join(words)

In [8]:
!ls gdrive/MyDrive/Colab\ Notebooks/Final-Project/Resource/
!pwd

IMDb_movies.csv  IMDB_sean.csv
/content


In [29]:
# Load in the IMDB_sean file
df = pd.read_csv("gdrive/MyDrive/Colab Notebooks/Final-Project/Resource/IMDB_sean.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,title,year,genre,description,budget,usa_gross_income,worlwide_gross_income,profit_factor
0,165,Il gabinetto del dottor Caligari,1920,"Fantasy, Horror, Mystery","Hypnotist Dr. Caligari uses a somnambulist, Ce...",18000.0,8811.0,8811.0,loss
1,245,I quattro cavalieri dell'Apocalisse,1921,"Drama, Romance, War",An extended family split up in France and Germ...,800000.0,9183673.0,9183673.0,large gain
2,506,Metropolis,1927,"Drama, Sci-Fi",In a futuristic city sharply divided between t...,6000000.0,1236166.0,1349711.0,loss
3,1048,Luci della città,1931,"Comedy, Drama, Romance","With the aid of a wealthy erratic tippler, a d...",1500000.0,19181.0,46008.0,loss
4,2454,Tempi moderni,1936,"Comedy, Drama, Family",The Tramp struggles to live in modern industri...,1500000.0,163577.0,457688.0,loss


In [30]:
 # Run the Genre and Description thru the text pipeline for joining and cleaning
 df["words"] = df.apply(text_pipeline, axis=1)
 df.head()
 print(df["words"].map(lambda x: len(x)).max())

315


In [36]:
# Define the X and y parameters
sentences = df["words"].values
y = df["profit_factor"].values

sentences_train, sentences_test, y_train, y_test = train_test_split(sentences, y, test_size=0.25, random_state=42)

vectorizer = CountVectorizer()
vectorizer.fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test  = vectorizer.transform(sentences_test)

# scale the data with StandardScaler
X_scaler = StandardScaler(with_mean=False).fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Label Encode data set
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

In [47]:
# Run Logistic Regression
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_scaled, encoded_y_train)
score = classifier.score(X_test_scaled, encoded_y_test)

print("Accuracy:", score)
print(f"Training Data Score: {classifier.score(X_train_scaled, encoded_y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, encoded_y_test)}")

Accuracy: 0.28205128205128205
Training Data Score: 0.9986844269034698
Testing Data Score: 0.28205128205128205


In [41]:
# Hyperparameter Tuning
param_grid = {'C': [10, 50, 100],
              'max_iter': [200, 500, 1000]}
grid = GridSearchCV(classifier, param_grid, verbose=3)
# Train the model with GridSearch
grid.fit(X_train_scaled, encoded_y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV] C=10, max_iter=200 ..............................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.3s remaining:    0.0s


[CV] .................. C=10, max_iter=200, score=0.292, total=   3.3s
[CV] C=10, max_iter=200 ..............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.5s remaining:    0.0s


[CV] .................. C=10, max_iter=200, score=0.278, total=   3.2s
[CV] C=10, max_iter=200 ..............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] .................. C=10, max_iter=200, score=0.295, total=   2.7s
[CV] C=10, max_iter=200 ..............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] .................. C=10, max_iter=200, score=0.249, total=   3.0s
[CV] C=10, max_iter=200 ..............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] .................. C=10, max_iter=200, score=0.262, total=   3.1s
[CV] C=10, max_iter=500 ..............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] .................. C=10, max_iter=500, score=0.295, total=   7.4s
[CV] C=10, max_iter=500 ..............................................
[CV] .................. C=10, max_iter=500, score=0.276, total=   6.8s
[CV] C=10, max_iter=500 ..............................................
[CV] .................. C=10, max_iter=500, score=0.295, total=   4.4s
[CV] C=10, max_iter=500 ..............................................
[CV] .................. C=10, max_iter=500, score=0.248, total=   7.5s
[CV] C=10, max_iter=500 ..............................................
[CV] .................. C=10, max_iter=500, score=0.262, total=   3.7s
[CV] C=10, max_iter=1000 .............................................
[CV] ................. C=10, max_iter=1000, score=0.295, total=   8.0s
[CV] C=10, max_iter=1000 .............................................
[CV] ................. C=10, max_iter=1000, score=0.276, total=   6.5s
[CV] C=10, max_iter=1000 .............................................
[CV] .

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] .................. C=50, max_iter=200, score=0.281, total=   2.9s
[CV] C=50, max_iter=200 ..............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] .................. C=50, max_iter=200, score=0.269, total=   2.8s
[CV] C=50, max_iter=200 ..............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] .................. C=50, max_iter=200, score=0.295, total=   3.0s
[CV] C=50, max_iter=200 ..............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] .................. C=50, max_iter=200, score=0.251, total=   3.0s
[CV] C=50, max_iter=200 ..............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] .................. C=50, max_iter=200, score=0.259, total=   3.0s
[CV] C=50, max_iter=500 ..............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] .................. C=50, max_iter=500, score=0.280, total=   7.4s
[CV] C=50, max_iter=500 ..............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] .................. C=50, max_iter=500, score=0.266, total=   7.3s
[CV] C=50, max_iter=500 ..............................................
[CV] .................. C=50, max_iter=500, score=0.293, total=   6.1s
[CV] C=50, max_iter=500 ..............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] .................. C=50, max_iter=500, score=0.252, total=   7.9s
[CV] C=50, max_iter=500 ..............................................
[CV] .................. C=50, max_iter=500, score=0.260, total=   4.5s
[CV] C=50, max_iter=1000 .............................................
[CV] ................. C=50, max_iter=1000, score=0.283, total=   8.8s
[CV] C=50, max_iter=1000 .............................................
[CV] ................. C=50, max_iter=1000, score=0.266, total=   9.5s
[CV] C=50, max_iter=1000 .............................................
[CV] ................. C=50, max_iter=1000, score=0.293, total=   6.4s
[CV] C=50, max_iter=1000 .............................................
[CV] ................. C=50, max_iter=1000, score=0.252, total=   8.8s
[CV] C=50, max_iter=1000 .............................................
[CV] ................. C=50, max_iter=1000, score=0.260, total=   4.3s
[CV] C=100, max_iter=200 .............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................. C=100, max_iter=200, score=0.279, total=   3.1s
[CV] C=100, max_iter=200 .............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................. C=100, max_iter=200, score=0.272, total=   3.2s
[CV] C=100, max_iter=200 .............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................. C=100, max_iter=200, score=0.294, total=   3.0s
[CV] C=100, max_iter=200 .............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................. C=100, max_iter=200, score=0.252, total=   3.3s
[CV] C=100, max_iter=200 .............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................. C=100, max_iter=200, score=0.255, total=   3.0s
[CV] C=100, max_iter=500 .............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................. C=100, max_iter=500, score=0.282, total=   7.3s
[CV] C=100, max_iter=500 .............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................. C=100, max_iter=500, score=0.271, total=   7.6s
[CV] C=100, max_iter=500 .............................................
[CV] ................. C=100, max_iter=500, score=0.295, total=   7.4s
[CV] C=100, max_iter=500 .............................................


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[CV] ................. C=100, max_iter=500, score=0.252, total=   7.6s
[CV] C=100, max_iter=500 .............................................
[CV] ................. C=100, max_iter=500, score=0.255, total=   6.9s
[CV] C=100, max_iter=1000 ............................................
[CV] ................ C=100, max_iter=1000, score=0.281, total=  13.5s
[CV] C=100, max_iter=1000 ............................................
[CV] ................ C=100, max_iter=1000, score=0.271, total=  11.9s
[CV] C=100, max_iter=1000 ............................................
[CV] ................ C=100, max_iter=1000, score=0.295, total=   7.1s
[CV] C=100, max_iter=1000 ............................................
[CV] ................ C=100, max_iter=1000, score=0.253, total=   9.9s
[CV] C=100, max_iter=1000 ............................................
[CV] ................ C=100, max_iter=1000, score=0.255, total=   6.6s


[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:  4.4min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=1000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [10, 50, 100], 'max_iter': [200, 500, 1000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [42]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 10, 'max_iter': 200}
0.2751164965618648


In [46]:
# Save the Model
file_name = "gdrive/MyDrive/Colab Notebooks/Final-Project/Saved_Models/sean_Logistic_Regression.sav"
joblib.dump(classifier, file_name)

['gdrive/MyDrive/Colab Notebooks/Final-Project/Saved_Models/sean_Logistic_Regression.sav']