In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import nltk
import nltk.sentiment
import re

import warnings
warnings.filterwarnings('ignore')

In [2]:
plt.rc('figure', figsize=(13, 7))
plt.rc('font', size=14)
plt.style.use('seaborn-darkgrid')

In [3]:
movie_title = pd.read_csv("IMDb movies.csv")

In [4]:
# What is are biggest markets in terms of movie production? 

movie_title.groupby("country").title.count().sort_values(ascending=False).nlargest(5)

country
USA       27490
India      5540
UK         3869
France     2975
Japan      2850
Name: title, dtype: int64

In [5]:
# What year did most movies get published?

movie_title.groupby("date_published").title.count().nlargest(10)

date_published
2010    111
1999    105
2008    101
1997     92
1985     90
1996     89
2009     89
1989     85
2011     85
1987     84
Name: title, dtype: int64

In [6]:
# What data range do we have? 

movie_title.date_published.min(), movie_title.date_published.max()

('1906-12-26', '2020-05-22')

In [7]:
ADDITIONAL_STOPWORDS = ['r', 'u', '2', 'ltgt']

def clean(text: str) -> list:
    'A simple function to cleanup text data'
    wnl = nltk.stem.WordNetLemmatizer()
    stopwords = nltk.corpus.stopwords.words('english') + ADDITIONAL_STOPWORDS
    text = (text.encode('ascii', 'ignore')
             .decode('utf-8', 'ignore')
             .lower())
    words = re.sub(r'[^\w\s]', '', text).split() # tokenization
    return [wnl.lemmatize(word) for word in words if word not in stopwords]

For the purposes of this project, we will only look to use movies from the US. If the opportunity to scale up the project is possible, then we will add additional countries.

In [8]:
df = movie_title[(movie_title.country == "USA")]

In [9]:
# we see that we have a couple of missing values.
df.isnull().sum()

imdb_title_id                0
title                        0
original_title               0
year                         0
date_published               0
genre                        0
duration                     0
country                      0
language                   307
director                    28
writer                     191
production_company        1139
actors                      22
description                 76
avg_vote                     0
votes                        0
budget                   17213
usa_gross_income         19940
worlwide_gross_income    19710
metascore                21145
reviews_from_users         279
reviews_from_critics      1728
dtype: int64

**Steps for Removing Null Values**:

1. The movie description will be a large part indicator or similarity, and as such, I want movies that have a description. I will drop any null values in this column
1. I will explore that language column, as I suspect at this moment that these shoud be all inglish
1. Similarly to the description, the director would play a big influence and as such I am thinking of dropping all null values from this column
1. While the meta score value would be really useful - there are too many missing values, and as such I will drop this column. Same for the `reviews_from_critics`
1. Might be able to impude a value for the missing `reviews_from_users`
1. I might be able to find the missing writers from the other IMBD file


In [10]:
# Given that hte majority of the movies are exclusively English, and we know that we are only looking at movies
# made in the US, I can speculate that it is very likely these movies were at least partially made in english
# As such, I will impode the missing values

df.language.value_counts()

English                                              23976
English, Spanish                                       695
English, French                                        440
English, German                                        198
English, Italian                                       195
                                                     ...  
English, Japanese, Yiddish, German                       1
English, Greek, Japanese                                 1
English, French, Latin, Scottish Gaelic, Italian         1
English, French, Turkish, Hebrew, Arabic, Spanish        1
English, French, German, Arabic                          1
Name: language, Length: 629, dtype: int64

In [11]:
df.language = df.language.fillna("English")

In [12]:
# remove nulls from director and description

df = df[df.description.notnull()]

In [13]:
df = df[df.director.notnull()]

In [14]:
df = df[df.writer.notnull()]

In [15]:
# What is the avg budget for all movies so that we can use it to impude values?

df.budget = df.budget.fillna("$ 0")

In [16]:
df = df[~df.budget.str.contains("ESP")]

df = df[~df.budget.str.contains("GBP")]

df = df[~df.budget.str.contains("CAD")]

df = df[~df.budget.str.contains("PYG")]

df = df[~df.budget.str.contains("AUD")]

df = df[~df.budget.str.contains("EUR")]

df = df[~df.budget.str.contains("RUR")]

In [17]:
avg_budget = df.budget.str.replace("$", '').astype(int).mean()

In [18]:
df.budget = df.budget.str.replace("$", '').astype(int)

In [19]:
df.budget = df.budget.replace(0, avg_budget)

In [20]:
# We will do something similar for US gross income

median_income = df[(df.usa_gross_income.notnull()) & (df.usa_gross_income.str.contains("$", regex=False))].usa_gross_income.str.replace("$", '').astype(int).median()

In [21]:
df.usa_gross_income = df.usa_gross_income.fillna("$ 0")

In [23]:
df.usa_gross_income = (
    df[df.usa_gross_income.str.contains("$", regex=False)]
    .usa_gross_income.str.replace("$", '')
    .astype(int)
    .replace(0, median_income)
)

In [33]:
# Remove columns with too many null values

df = df.drop(columns=["worlwide_gross_income", "metascore", "reviews_from_users", "reviews_from_critics"])

In [36]:
# Drop any remaining null values

df = df.dropna()

In [40]:
# Now we have no null values, and still have over 26000 movie titles

df.isnull().sum(), df.shape

(imdb_title_id         0
 title                 0
 original_title        0
 year                  0
 date_published        0
 genre                 0
 duration              0
 country               0
 language              0
 director              0
 writer                0
 production_company    0
 actors                0
 description           0
 avg_vote              0
 votes                 0
 budget                0
 usa_gross_income      0
 dtype: int64, (26102, 18))

## NLP Prep

In [43]:
pd.Series(clean(' '.join(df.description)))

0            fabled
1             queen
2             egypt
3            affair
4             roman
            ...    
402948       narrow
402949         path
402950      distant
402951    celestial
402952         city
Length: 402953, dtype: object

In [44]:
df

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income
2,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,Victorien Sardou,Helen Gardner Picture Players,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,420,4.500000e+04,5548375.0
4,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913,"Biography, Drama",60,USA,English,Sidney Olcott,Gene Gauntier,Kalem Company,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ...",5.7,438,4.784118e+06,5548375.0
15,tt0003167,"Home, Sweet Home","Home, Sweet Home",1914,1914-05-17,Drama,55,USA,English,D.W. Griffith,"D.W. Griffith, H.E. Aitken",Majestic Motion Picture Company,"Henry B. Walthall, Josephine Crowell, Lillian ...",John Howard Payne at his most miserable point ...,5.7,170,4.784118e+06,5548375.0
20,tt0003643,The Avenging Conscience: or 'Thou Shalt Not Kill',The Avenging Conscience: or 'Thou Shalt Not Kill',1914,1914-08-24,"Crime, Drama, Horror",78,USA,English,D.W. Griffith,"Edgar Allan Poe, D.W. Griffith",Majestic Motion Picture Company,"Henry B. Walthall, Spottiswoode Aitken, Blanch...",Prevented from dating his sweetheart by his un...,6.5,1050,4.784118e+06,5548375.0
21,tt0003657,The Bargain,The Bargain,1914,1914-12-03,Western,70,USA,English,Reginald Barker,"William H. Clifford, Thomas H. Ince",New York Motion Picture,"William S. Hart, J. Frank Burke, Clara William...",After the bandit Jim Stokes robs the stage he ...,6.6,121,4.784118e+06,5548375.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81227,tt9783778,Adventures of Aladdin,Adventures of Aladdin,2019,2019-05-13,"Adventure, Fantasy",87,USA,English,Glenn Campbell,"Glenn Campbell, Tammy Klein",The Asylum,"Adam Hollick, Daniel O'Reilly, Lucia Dimitra X...","With the help of a magical lamp, an impoverish...",1.7,998,4.784118e+06,5548375.0
81239,tt9815714,The Hard Way,The Hard Way,2019,2019-03-05,Action,92,USA,English,Keoni Waxman,"Thomas J. Churchill, Keoni Waxman",Actionhouse Pictures,"Michael Jai White, Luke Goss, Randy Couture, M...",After learning his brother died on a mission i...,4.8,1734,3.000000e+06,5548375.0
81246,tt9831136,The Banana Splits Movie,The Banana Splits Movie,2019,2019-08-13,"Comedy, Horror, Sci-Fi",89,USA,English,Danishka Esterhazy,"Jed Elinoff, Scott Thomas",Blue Ice Pictures,"Dani Kind, Steve Lund, Celina Martin, Finlay W...",A boy named Harley and his family (brother Aus...,5.0,1691,4.784118e+06,5548375.0
81254,tt9860860,Abduction 101,Abduction 101,2019,2019-01-01,Horror,77,USA,English,"Robin Entreinger, Steve Noir","Robin Entreinger, Steve Noir",Seven Light,"Luna Labelle, Nixi Oblivion, Brianna Shewbert ...",Three beautiful women find a strange house in ...,2.4,146,5.000000e+05,5548375.0


In [45]:
df

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,writer,production_company,actors,description,avg_vote,votes,budget,usa_gross_income
2,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,Victorien Sardou,Helen Gardner Picture Players,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,420,4.500000e+04,5548375.0
4,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913,"Biography, Drama",60,USA,English,Sidney Olcott,Gene Gauntier,Kalem Company,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ...",5.7,438,4.784118e+06,5548375.0
15,tt0003167,"Home, Sweet Home","Home, Sweet Home",1914,1914-05-17,Drama,55,USA,English,D.W. Griffith,"D.W. Griffith, H.E. Aitken",Majestic Motion Picture Company,"Henry B. Walthall, Josephine Crowell, Lillian ...",John Howard Payne at his most miserable point ...,5.7,170,4.784118e+06,5548375.0
20,tt0003643,The Avenging Conscience: or 'Thou Shalt Not Kill',The Avenging Conscience: or 'Thou Shalt Not Kill',1914,1914-08-24,"Crime, Drama, Horror",78,USA,English,D.W. Griffith,"Edgar Allan Poe, D.W. Griffith",Majestic Motion Picture Company,"Henry B. Walthall, Spottiswoode Aitken, Blanch...",Prevented from dating his sweetheart by his un...,6.5,1050,4.784118e+06,5548375.0
21,tt0003657,The Bargain,The Bargain,1914,1914-12-03,Western,70,USA,English,Reginald Barker,"William H. Clifford, Thomas H. Ince",New York Motion Picture,"William S. Hart, J. Frank Burke, Clara William...",After the bandit Jim Stokes robs the stage he ...,6.6,121,4.784118e+06,5548375.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81227,tt9783778,Adventures of Aladdin,Adventures of Aladdin,2019,2019-05-13,"Adventure, Fantasy",87,USA,English,Glenn Campbell,"Glenn Campbell, Tammy Klein",The Asylum,"Adam Hollick, Daniel O'Reilly, Lucia Dimitra X...","With the help of a magical lamp, an impoverish...",1.7,998,4.784118e+06,5548375.0
81239,tt9815714,The Hard Way,The Hard Way,2019,2019-03-05,Action,92,USA,English,Keoni Waxman,"Thomas J. Churchill, Keoni Waxman",Actionhouse Pictures,"Michael Jai White, Luke Goss, Randy Couture, M...",After learning his brother died on a mission i...,4.8,1734,3.000000e+06,5548375.0
81246,tt9831136,The Banana Splits Movie,The Banana Splits Movie,2019,2019-08-13,"Comedy, Horror, Sci-Fi",89,USA,English,Danishka Esterhazy,"Jed Elinoff, Scott Thomas",Blue Ice Pictures,"Dani Kind, Steve Lund, Celina Martin, Finlay W...",A boy named Harley and his family (brother Aus...,5.0,1691,4.784118e+06,5548375.0
81254,tt9860860,Abduction 101,Abduction 101,2019,2019-01-01,Horror,77,USA,English,"Robin Entreinger, Steve Noir","Robin Entreinger, Steve Noir",Seven Light,"Luna Labelle, Nixi Oblivion, Brianna Shewbert ...",Three beautiful women find a strange house in ...,2.4,146,5.000000e+05,5548375.0
