In [37]:
# import dependencies (standard library first, then third-party)
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages used further down: 'punkt' for word_tokenize
# and 'stopwords' for the English stop-word list.
nltk.download('punkt')
nltk.download('stopwords')

# Plot styling for any figures drawn later in the notebook
plt.style.use('ggplot')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\eliot\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\eliot\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
# Load the raw IMDB movie table from the project's Resource folder.
# low_memory=False stops pandas from parsing the file in chunks, which avoids
# mixed-type inference within a column.
raw_movies_path = "Resource/IMDB_movies.csv"
df = pd.read_csv(raw_movies_path, low_memory=False)
df.head()

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000009,Miss Jerry,Miss Jerry,1894,1894-10-09,Romance,45,USA,,Alexander Black,...,"Blanche Bayliss, William Courtenay, Chauncey D...",The adventures of a female reporter in the 1890s.,5.9,154,,,,,1.0,2.0
1,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,589,$ 2250,,,,7.0,7.0
2,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.8,188,,,,,5.0,2.0
3,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,446,$ 45000,,,,25.0,3.0
4,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2237,,,,,31.0,14.0


In [39]:
# clean the data
# Narrow the raw table to the columns this notebook works with:
# IMDB id / title / year / genre string / description / metascore
movies_cols = ["imdb_title_id", "title", "year", "genre", "description", "metascore"]
# .copy() detaches the subset from `df`, so later edits never touch the raw frame
df_movies = df.loc[:, movies_cols].copy()

print(f"Shape: {df_movies.shape}")
df_movies.head()

Shape: (85855, 6)


Unnamed: 0,imdb_title_id,title,year,genre,description,metascore
0,tt0000009,Miss Jerry,1894,Romance,The adventures of a female reporter in the 1890s.,
1,tt0000574,The Story of the Kelly Gang,1906,"Biography, Crime, Drama",True story of notorious Australian outlaw Ned ...,
2,tt0001892,Den sorte drøm,1911,Drama,Two men of high rank are both wooing the beaut...,
3,tt0002101,Cleopatra,1912,"Drama, History",The fabled queen of Egypt's affair with Roman ...,
4,tt0002130,L'Inferno,1911,"Adventure, Drama, Fantasy",Loosely adapted from Dante's Divine Comedy and...,


In [40]:
# Discard every row that has a missing value in any of the selected columns.
# axis=0 / how="any" are the pandas defaults, spelled out here for readability.
df_movies.dropna(axis=0, how="any", inplace=True)

print(f"Shape: {df_movies.shape}")
df_movies.head()

Shape: (13279, 6)


Unnamed: 0,imdb_title_id,title,year,genre,description,metascore
76,tt0006864,Intolerance,1916,"Drama, History","The story of a poor young woman, separated by ...",99.0
506,tt0017136,Metropolis,1927,"Drama, Sci-Fi",In a futuristic city sharply divided between t...,98.0
566,tt0018037,Il cantante di jazz,1927,"Drama, Music, Musical",The son of a Jewish Cantor must defy the tradi...,66.0
628,tt0018773,Il circo,1928,"Comedy, Romance",The Tramp finds work and the girl of his dream...,90.0
714,tt0019777,The Cocoanuts,1929,"Comedy, Musical","During the Florida land boom,",69.0


In [41]:
# take out unique labels [Genres] in the dataset
# use .explode and create a new array for unique genres
#
# The raw "genre" strings separate entries with ", " (e.g. "Drama, History").
# Splitting on "," alone leaves a leading space on every entry after the first
# (" History"), which would make "History" and " History" two different labels
# once the lists are exploded or binarized. Strip each entry after splitting.
# Non-list values (a missing genre stays NaN after .str.split) pass through as-is.
df_movies["genres"] = (
    df_movies["genre"]
    .str.split(",")
    .map(lambda parts: [part.strip() for part in parts] if isinstance(parts, list) else parts)
)
df_movies.head()

Unnamed: 0,imdb_title_id,title,year,genre,description,metascore,genres
76,tt0006864,Intolerance,1916,"Drama, History","The story of a poor young woman, separated by ...",99.0,"[Drama, History]"
506,tt0017136,Metropolis,1927,"Drama, Sci-Fi",In a futuristic city sharply divided between t...,98.0,"[Drama, Sci-Fi]"
566,tt0018037,Il cantante di jazz,1927,"Drama, Music, Musical",The son of a Jewish Cantor must defy the tradi...,66.0,"[Drama, Music, Musical]"
628,tt0018773,Il circo,1928,"Comedy, Romance",The Tramp finds work and the girl of his dream...,90.0,"[Comedy, Romance]"
714,tt0019777,The Cocoanuts,1929,"Comedy, Musical","During the Florida land boom,",69.0,"[Comedy, Musical]"


In [42]:
# clean description
# using sean's function
# remove stop words

# Filled on first use and then reused, so the punctuation table and the
# stop-word list are built once instead of once per DataFrame row.
_TEXT_PIPELINE_CACHE = {}


def _text_pipeline_resources():
    """Return the (punctuation table, stop-word set) pair, building it on first use."""
    if not _TEXT_PIPELINE_CACHE:
        _TEXT_PIPELINE_CACHE["punct_table"] = str.maketrans('', '', string.punctuation)
        _TEXT_PIPELINE_CACHE["stop_words"] = frozenset(stopwords.words('english'))
    return _TEXT_PIPELINE_CACHE["punct_table"], _TEXT_PIPELINE_CACHE["stop_words"]


def text_pipeline2(row):
    """Normalise the ``description`` field of one row into a cleaned string.

    Steps, in order: tokenize with ``word_tokenize``, lower-case each token,
    delete ``string.punctuation`` characters from it, keep only tokens that
    are purely alphabetic, and drop English stop words.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``"description"`` (e.g. the row Series that
        ``DataFrame.apply(..., axis=1)`` passes in).

    Returns
    -------
    str
        The surviving words joined by single spaces. An empty string is
        returned when nothing survives or when the description is not a
        string (e.g. a missing value).
    """
    descr = row["description"]
    # A missing description arrives as NaN/None; there is nothing to tokenize.
    if not isinstance(descr, str):
        return ""

    punct_table, stop_words = _text_pipeline_resources()

    # lower-case, then strip punctuation characters from inside each token
    stripped = (token.lower().translate(punct_table) for token in word_tokenize(descr))
    # keep alphabetic tokens only (this also discards tokens emptied by the
    # punctuation strip), then filter out the stop words
    words = [word for word in stripped if word.isalpha() and word not in stop_words]
    # join the words and return them to be loaded into the dataframe
    return " ".join(words)

In [43]:
 # Clean every description: axis=1 makes apply hand each row to
 # text_pipeline2, which returns that row's cleaned text.
 cleaned_descriptions = df_movies.apply(text_pipeline2, axis=1)
 df_movies["description_new"] = cleaned_descriptions
 df_movies.head()

Unnamed: 0,imdb_title_id,title,year,genre,description,metascore,genres,description_new
76,tt0006864,Intolerance,1916,"Drama, History","The story of a poor young woman, separated by ...",99.0,"[Drama, History]",story poor young woman separated prejudice hus...
506,tt0017136,Metropolis,1927,"Drama, Sci-Fi",In a futuristic city sharply divided between t...,98.0,"[Drama, Sci-Fi]",futuristic city sharply divided working class ...
566,tt0018037,Il cantante di jazz,1927,"Drama, Music, Musical",The son of a Jewish Cantor must defy the tradi...,66.0,"[Drama, Music, Musical]",son jewish cantor must defy traditions religio...
628,tt0018773,Il circo,1928,"Comedy, Romance",The Tramp finds work and the girl of his dream...,90.0,"[Comedy, Romance]",tramp finds work girl dreams circus
714,tt0019777,The Cocoanuts,1929,"Comedy, Musical","During the Florida land boom,",69.0,"[Comedy, Musical]",florida land boom


In [44]:
# Replace the raw text columns with their processed versions: remove the
# original "genre" and "description", then let the cleaned text take over the
# "description" name ("genres" already holds the split genre lists).
df_movies = (
    df_movies
    .drop(columns=["genre", "description"])
    .rename(columns={"description_new": "description"})
)
df_movies.head()

Unnamed: 0,imdb_title_id,title,year,metascore,genres,description
76,tt0006864,Intolerance,1916,99.0,"[Drama, History]",story poor young woman separated prejudice hus...
506,tt0017136,Metropolis,1927,98.0,"[Drama, Sci-Fi]",futuristic city sharply divided working class ...
566,tt0018037,Il cantante di jazz,1927,66.0,"[Drama, Music, Musical]",son jewish cantor must defy traditions religio...
628,tt0018773,Il circo,1928,90.0,"[Comedy, Romance]",tramp finds work girl dreams circus
714,tt0019777,The Cocoanuts,1929,69.0,"[Comedy, Musical]",florida land boom


In [47]:
# Write the cleaned table to disk. to_csv keeps its default index=True, so the
# frame's row labels are written as the first, unnamed column.
cleaned_movies_path = "Resource/IMDB_eliot2.csv"
df_movies.to_csv(cleaned_movies_path)

In [45]:
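# convert the genre labels to binary indicator features using MultiLabelBinarizer
# NOTE: the draft below refers to movies_new['genre_new'], which is not defined in
# this notebook; presumably df_movies['genres'] is meant - confirm before enabling.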

# from sklearn.preprocessing import MultiLabelBinarizer

# multilabel_binarizer = MultiLabelBinarizer()
# multilabel_binarizer.fit(movies_new['genre_new'])

# # transform target variable
# y = multilabel_binarizer.transform(movies_new['genre_new'])

In [46]:
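# split dataset into training and validation set
# NOTE: before enabling this, import train_test_split (sklearn.model_selection);
# also df_movies has no "X" column and `y` is only produced by the commented-out binarizer cell.
# X_train, X_test, y_train, y_test = train_test_split(df_movies["X"], y, test_size=0.2, random_state=42)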

NameError: name 'train_test_split' is not defined